hf-eda-mcp

Running

App Files Files Community

KhalilGuetari committed 19 days ago

Commit

2b910cc

1 Parent(s): 43642a4

Use hf_token provided in mcp headers

Browse files

Files changed (10) hide show

README.md +7 -1
src/hf_eda_mcp/error_handling.py +1 -15
src/hf_eda_mcp/integrations/hf_client.py +1 -9
src/hf_eda_mcp/server.py +2 -24
src/hf_eda_mcp/services/dataset_service.py +12 -4
src/hf_eda_mcp/services/dataset_viewer_adapter.py +4 -6
src/hf_eda_mcp/tools/__init__.py +1 -7
src/hf_eda_mcp/tools/analysis.py +5 -19
src/hf_eda_mcp/tools/metadata.py +6 -20
src/hf_eda_mcp/tools/sampling.py +90 -129

README.md CHANGED Viewed

@@ -56,7 +56,13 @@ Replace `YOUR-USERNAME` with your HuggingFace username.
 ## Authentication
-For private datasets, set the `HF_TOKEN` secret in your Space settings.
 ## License

 ## Authentication
+## To Do List
+- [ ] Security: Do not cache when a dataset is private or gated
+- [ ] Complete MCP server configuration and documentation
 ## License

src/hf_eda_mcp/error_handling.py CHANGED Viewed

@@ -238,7 +238,6 @@ def get_dataset_suggestions(dataset_id: str) -> List[str]:
 def format_authentication_error(
     dataset_id: str,
     is_gated: bool = False,
-    has_token: bool = False
 ) -> Dict[str, Any]:
     """
     Format authentication error with helpful guidance.
@@ -246,7 +245,6 @@ def format_authentication_error(
     Args:
         dataset_id: The dataset identifier
         is_gated: Whether the dataset is gated (requires approval)
-        has_token: Whether a token was provided
     Returns:
         Dictionary with error details and suggestions
@@ -255,7 +253,6 @@ def format_authentication_error(
         "error_type": "authentication_error",
         "dataset_id": dataset_id,
         "is_gated": is_gated,
-        "has_token": has_token,
         "message": "",
         "suggestions": []
     }
@@ -270,16 +267,6 @@ def format_authentication_error(
             "Provide a valid HuggingFace token after receiving access",
             "Check your HuggingFace account for access status"
         ]
-    elif not has_token:
-        error_details["message"] = (
-            f"Dataset '{dataset_id}' is private and requires authentication."
-        )
-        error_details["suggestions"] = [
-            "Provide a HuggingFace authentication token",
-            "Create a token at: https://huggingface.co/settings/tokens",
-            "Set the token in your environment: HF_TOKEN=your_token",
-            "Ensure the token has read access to datasets"
-        ]
     else:
         error_details["message"] = (
             f"Authentication failed for dataset '{dataset_id}'. "
@@ -381,8 +368,7 @@ def format_error_response(
     elif isinstance(error, AuthenticationError):
         dataset_id = context.get("dataset_id", "unknown")
         is_gated = "gated" in str(error).lower()
-        has_token = context.get("has_token", False)
-        return format_authentication_error(dataset_id, is_gated, has_token)
     elif isinstance(error, NetworkError):
         operation = context.get("operation", "operation")

 def format_authentication_error(
     dataset_id: str,
     is_gated: bool = False,
 ) -> Dict[str, Any]:
     """
     Format authentication error with helpful guidance.
     Args:
         dataset_id: The dataset identifier
         is_gated: Whether the dataset is gated (requires approval)
     Returns:
         Dictionary with error details and suggestions
         "error_type": "authentication_error",
         "dataset_id": dataset_id,
         "is_gated": is_gated,
         "message": "",
         "suggestions": []
     }
             "Provide a valid HuggingFace token after receiving access",
             "Check your HuggingFace account for access status"
         ]
     else:
         error_details["message"] = (
             f"Authentication failed for dataset '{dataset_id}'. "
     elif isinstance(error, AuthenticationError):
         dataset_id = context.get("dataset_id", "unknown")
         is_gated = "gated" in str(error).lower()
+        return format_authentication_error(dataset_id, is_gated)
     elif isinstance(error, NetworkError):
         operation = context.get("operation", "operation")

src/hf_eda_mcp/integrations/hf_client.py CHANGED Viewed

@@ -62,10 +62,7 @@ class HfClient:
         """
         self.token = token
         self.api = HfApi(token=token)
-        self._authenticated = False
-        if token:
-            self._authenticate()
     def _authenticate(self) -> None:
         """
@@ -299,8 +296,3 @@ class HfClient:
         except Exception:
             # For other errors (network, etc.), assume dataset exists but there's a temporary issue
             return True
-    @property
-    def is_authenticated(self) -> bool:
-        """Check if client is authenticated."""
-        return self._authenticated

         """
         self.token = token
         self.api = HfApi(token=token)
+        self._authenticate()
     def _authenticate(self) -> None:
         """
         except Exception:
             # For other errors (network, etc.), assume dataset exists but there's a temporary issue
             return True

src/hf_eda_mcp/server.py CHANGED Viewed

@@ -9,16 +9,10 @@ import gradio as gr
 import sys
 from typing import Optional
-# Import configuration
-from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config
-# Import EDA tools - these will be automatically exposed as MCP tools
 from hf_eda_mcp.tools.metadata import get_dataset_metadata
 from hf_eda_mcp.tools.sampling import get_dataset_sample
 from hf_eda_mcp.tools.analysis import analyze_dataset_features
-# These functions will be automatically exposed as MCP tools when mcp_server=True
 def create_gradio_app(config: ServerConfig) -> gr.Blocks:
@@ -163,27 +157,11 @@ def create_gradio_app(config: ServerConfig) -> gr.Blocks:
                 3. **analyze_dataset_features**: Perform exploratory data analysis
                 ### MCP Server Configuration
-                To connect MCP clients to this server, use:
-                ```json
-                {{
-                  "mcpServers": {{
-                    "hf-eda-mcp-server": {{
-                      "command": "pdm",
-                      "args": ["run", "hf-eda-mcp"],
-                      "env": {{
-                        "HF_TOKEN": "your_huggingface_token_here"
-                      }}
-                    }}
-                  }}
-                }}
-                ```
                 ### Server Status
                 - **MCP Tools**: 3 tools available
-                - **Authentication**: {"✅ Token configured" if config.hf_token else "⚠️ No token (public datasets only)"}
                 - **MCP Schema**: Available at `/gradio_api/mcp/schema`
                 - **Cache Directory**: {config.cache_dir or "Default system cache"}
                 - **Max Sample Size**: {config.max_sample_size:,}

 import sys
 from typing import Optional
 from hf_eda_mcp.tools.metadata import get_dataset_metadata
 from hf_eda_mcp.tools.sampling import get_dataset_sample
 from hf_eda_mcp.tools.analysis import analyze_dataset_features
+from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config
 def create_gradio_app(config: ServerConfig) -> gr.Blocks:
                 3. **analyze_dataset_features**: Perform exploratory data analysis
                 ### MCP Server Configuration
                 ### Server Status
                 - **MCP Tools**: 3 tools available
                 - **MCP Schema**: Available at `/gradio_api/mcp/schema`
                 - **Cache Directory**: {config.cache_dir or "Default system cache"}
                 - **Max Sample Size**: {config.max_sample_size:,}

src/hf_eda_mcp/services/dataset_service.py CHANGED Viewed

@@ -14,6 +14,7 @@ from pathlib import Path
 from datasets import load_dataset
 from datasets.utils.logging import disable_progress_bar
 from hf_eda_mcp.integrations.hf_client import (
     HfClient,
     DatasetNotFoundError,
@@ -806,7 +807,14 @@ class DatasetService:
         """
         return self.hf_client.validate_dataset_access(dataset_id, config_name)
-    @property
-    def is_authenticated(self) -> bool:
-        """Check if the service is authenticated with HuggingFace."""
-        return self.hf_client.is_authenticated

 from datasets import load_dataset
 from datasets.utils.logging import disable_progress_bar
+from hf_eda_mcp.config import get_config
 from hf_eda_mcp.integrations.hf_client import (
     HfClient,
     DatasetNotFoundError,
         """
         return self.hf_client.validate_dataset_access(dataset_id, config_name)
+def get_dataset_service(hf_api_token: str) -> DatasetService:
+    """Create a new dataset service from the current config, using hf_api_token or, when it is None, the configured hf_token."""
+    config = get_config()
+    if hf_api_token is None:
+        hf_api_token = config.hf_token
+    dataset_service = DatasetService(
+        cache_dir=config.cache_dir,
+        token=hf_api_token
+    )
+    return dataset_service

src/hf_eda_mcp/services/dataset_viewer_adapter.py CHANGED Viewed

@@ -22,7 +22,7 @@ class DatasetViewerAdapter():
     def __init__(
         self,
-        token: Optional[str] = None,
     ):
         """
         Initialize dataset service with optional caching and authentication.
@@ -32,8 +32,6 @@ class DatasetViewerAdapter():
         """
         if token:
             self.token = token
-        else:
-            self.token = os.environ.get("HF_TOKEN")
         self.base_url = "https://datasets-server.huggingface.co/"
     def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict:
@@ -160,7 +158,7 @@ class DatasetViewerAdapter():
         self,
         dataset_name: str,
         config: str,
-        split_name: str
     ) -> dict:
         """
         Get detailed statistics for a dataset split from the Dataset Viewer API.
@@ -200,7 +198,7 @@ class DatasetViewerAdapter():
         try:
             result = self._api_get(
                 route="statistics",
-                params=params
             )
             # Check for errors in response
@@ -222,7 +220,7 @@ class DatasetViewerAdapter():
     def check_statistics_availability(
         self,
-        dataset_name: str,
         config: Optional[str] = None
     ) -> dict:
         """

     def __init__(
         self,
+        token: str,
     ):
         """
         Initialize dataset service with optional caching and authentication.
         """
         if token:
             self.token = token
         self.base_url = "https://datasets-server.huggingface.co/"
     def _api_get(self, route: str, params: dict, extra_headers: Optional[dict] = None) -> dict:
         self,
         dataset_name: str,
         config: str,
+        split_name: str,
     ) -> dict:
         """
         Get detailed statistics for a dataset split from the Dataset Viewer API.
         try:
             result = self._api_get(
                 route="statistics",
+                params=params,
             )
             # Check for errors in response
     def check_statistics_availability(
         self,
+        dataset_name: str,
         config: Optional[str] = None
     ) -> dict:
         """

src/hf_eda_mcp/tools/__init__.py CHANGED Viewed

@@ -5,11 +5,7 @@ This package contains individual EDA functions that will be exposed as MCP tools
 """
 from hf_eda_mcp.tools.metadata import get_dataset_metadata
-from hf_eda_mcp.tools.sampling import (
-    get_dataset_sample,
-    get_dataset_sample_with_indices,
-    get_available_splits,
-)
 from hf_eda_mcp.tools.analysis import analyze_dataset_features
 __all__ = [
@@ -18,8 +14,6 @@ __all__ = [
     # Sampling tools
     'get_dataset_sample',
-    'get_dataset_sample_with_indices',
-    'get_available_splits',
     # Analysis tools
     'analyze_dataset_features',

 """
 from hf_eda_mcp.tools.metadata import get_dataset_metadata
+from hf_eda_mcp.tools.sampling import get_dataset_sample
 from hf_eda_mcp.tools.analysis import analyze_dataset_features
 __all__ = [
     # Sampling tools
     'get_dataset_sample',
     # Analysis tools
     'analyze_dataset_features',

src/hf_eda_mcp/tools/analysis.py CHANGED Viewed

@@ -7,10 +7,10 @@ feature statistics and missing value analysis.
 import logging
 import statistics
 from typing import Optional, Dict, Any, List
 from collections import Counter
-from hf_eda_mcp.config import get_config
-from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
 from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
 from hf_eda_mcp.validation import (
     validate_dataset_id,
@@ -22,30 +22,17 @@ from hf_eda_mcp.validation import (
 )
 from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
-logger = logging.getLogger(__name__)
-# Global dataset service instance
-_dataset_service: Optional[DatasetService] = None
 # Default constants (can be overridden by config)
 DEFAULT_ANALYSIS_SAMPLE_SIZE = 1000
 MAX_UNIQUE_VALUES_TO_SHOW = 20
-def get_dataset_service() -> DatasetService:
-    """Get or create the global dataset service instance using current config."""
-    global _dataset_service
-    if _dataset_service is None:
-        config = get_config()
-        _dataset_service = DatasetService(
-            cache_dir=config.cache_dir,
-            token=config.hf_token
-        )
-    return _dataset_service
 def analyze_dataset_features(
     dataset_id: str,
     split: str = "train",
     sample_size: int = DEFAULT_ANALYSIS_SAMPLE_SIZE,
     config_name: Optional[str] = None,
@@ -118,7 +105,7 @@ def analyze_dataset_features(
     try:
         # Get dataset service
-        service = get_dataset_service()
         # Try to get statistics from Dataset Viewer API first (more efficient and complete)
         viewer_stats = service.get_dataset_statistics(
@@ -198,7 +185,6 @@ def analyze_dataset_features(
     except AuthenticationError as e:
         log_error_with_context(e, context, level=logging.WARNING)
-        context["has_token"] = get_dataset_service().is_authenticated
         error_response = format_error_response(e, context)
         logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
         raise

 import logging
 import statistics
+import gradio as gr
 from typing import Optional, Dict, Any, List
 from collections import Counter
+from hf_eda_mcp.services.dataset_service import get_dataset_service, DatasetServiceError
 from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
 from hf_eda_mcp.validation import (
     validate_dataset_id,
 )
 from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
+logger = logging.getLogger(__name__)
 # Default constants (can be overridden by config)
 DEFAULT_ANALYSIS_SAMPLE_SIZE = 1000
 MAX_UNIQUE_VALUES_TO_SHOW = 20
 def analyze_dataset_features(
     dataset_id: str,
+    hf_api_token: gr.Header,
     split: str = "train",
     sample_size: int = DEFAULT_ANALYSIS_SAMPLE_SIZE,
     config_name: Optional[str] = None,
     try:
         # Get dataset service
+        service = get_dataset_service(hf_api_token=hf_api_token)
         # Try to get statistics from Dataset Viewer API first (more efficient and complete)
         viewer_stats = service.get_dataset_statistics(
     except AuthenticationError as e:
         log_error_with_context(e, context, level=logging.WARNING)
         error_response = format_error_response(e, context)
         logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
         raise

src/hf_eda_mcp/tools/metadata.py CHANGED Viewed

@@ -6,9 +6,9 @@ HuggingFace datasets including size, features, splits, and configuration details
 """
 import logging
 from typing import Optional, Dict, Any
-from hf_eda_mcp.config import get_config
-from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
 from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
 from hf_eda_mcp.validation import (
     validate_dataset_id,
@@ -18,25 +18,11 @@ from hf_eda_mcp.validation import (
 )
 from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
-logger = logging.getLogger(__name__)
-# Global dataset service instance
-_dataset_service: Optional[DatasetService] = None
-def get_dataset_service() -> DatasetService:
-    """Get or create the global dataset service instance using current config."""
-    global _dataset_service
-    if _dataset_service is None:
-        config = get_config()
-        _dataset_service = DatasetService(
-            cache_dir=config.cache_dir,
-            token=config.hf_token
-        )
-    return _dataset_service
-def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) -> Dict[str, Any]:
     """
     Retrieve comprehensive metadata for a HuggingFace dataset.
@@ -46,6 +32,7 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
     Args:
         dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
         config_name: Optional configuration name for multi-config datasets
     Returns:
@@ -102,7 +89,7 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
     try:
         # Get dataset service and retrieve metadata
-        service = get_dataset_service()
         metadata = service.load_dataset_info(dataset_id, config_name)
         # Add the requested config name to the response if specified
@@ -156,7 +143,6 @@ def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) ->
     except AuthenticationError as e:
         # Add helpful context to the error
         log_error_with_context(e, context, level=logging.WARNING)
-        context["has_token"] = get_dataset_service().is_authenticated
         error_response = format_error_response(e, context)
         logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
         raise

 """
 import logging
+import gradio as gr
 from typing import Optional, Dict, Any
+from hf_eda_mcp.services.dataset_service import DatasetServiceError, get_dataset_service
 from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
 from hf_eda_mcp.validation import (
     validate_dataset_id,
 )
 from hf_eda_mcp.error_handling import format_error_response, log_error_with_context
+logger = logging.getLogger(__name__)
+def get_dataset_metadata(dataset_id: str, hf_api_token: gr.Header, config_name: Optional[str] = None) -> Dict[str, Any]:
     """
     Retrieve comprehensive metadata for a HuggingFace dataset.
     Args:
         dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
+        hf_api_token: Header parsed by Gradio when hf_api_token is provided in MCP configuration headers
         config_name: Optional configuration name for multi-config datasets
     Returns:
     try:
         # Get dataset service and retrieve metadata
+        service = get_dataset_service(hf_api_token=hf_api_token)
         metadata = service.load_dataset_info(dataset_id, config_name)
         # Add the requested config name to the response if specified
     except AuthenticationError as e:
         # Add helpful context to the error
         log_error_with_context(e, context, level=logging.WARNING)
         error_response = format_error_response(e, context)
         logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
         raise

src/hf_eda_mcp/tools/sampling.py CHANGED Viewed

@@ -6,16 +6,16 @@ with support for different splits, configurable sample sizes, and streaming for
 """
 import logging
-from typing import Optional, Dict, Any, List
 from hf_eda_mcp.config import get_config
-from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
 from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
 from hf_eda_mcp.validation import (
     validate_dataset_id,
     validate_config_name,
     validate_split_name,
     validate_sample_size,
-    validate_indices,
     ValidationError,
     format_validation_error,
 )
@@ -23,27 +23,14 @@ from hf_eda_mcp.error_handling import format_error_response, log_error_with_cont
 logger = logging.getLogger(__name__)
-# Global dataset service instance
-_dataset_service: Optional[DatasetService] = None
 # Default constants (can be overridden by config)
 DEFAULT_SAMPLE_SIZE = 10
 VALID_SPLITS = {"train", "validation", "test", "dev", "val"}
-def get_dataset_service() -> DatasetService:
-    """Get or create the global dataset service instance using current config."""
-    global _dataset_service
-    if _dataset_service is None:
-        config = get_config()
-        _dataset_service = DatasetService(
-            cache_dir=config.cache_dir, token=config.hf_token
-        )
-    return _dataset_service
 def get_dataset_sample(
     dataset_id: str,
     split: str = "train",
     num_samples: int = DEFAULT_SAMPLE_SIZE,
     config_name: Optional[str] = None,
@@ -121,7 +108,7 @@ def get_dataset_sample(
     try:
         # Get dataset service and load sample
-        service = get_dataset_service()
         sample_data = service.load_dataset_sample(
             dataset_id=dataset_id,
             split=split,
@@ -169,7 +156,6 @@ def get_dataset_sample(
     except AuthenticationError as e:
         log_error_with_context(e, context, level=logging.WARNING)
-        context["has_token"] = get_dataset_service().is_authenticated
         error_response = format_error_response(e, context)
         logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
         raise
@@ -185,92 +171,92 @@ def get_dataset_sample(
         raise DatasetServiceError(f"Failed to sample dataset: {str(e)}") from e
-def get_dataset_sample_with_indices(
-    dataset_id: str,
-    indices: List[int],
-    split: str = "train",
-    config_name: Optional[str] = None,
-) -> Dict[str, Any]:
-    """
-    Retrieve specific samples by their indices from a HuggingFace dataset.
-    This function allows for targeted sampling by specifying exact row indices.
-    Note: This requires loading the dataset in non-streaming mode.
-    Args:
-        dataset_id: HuggingFace dataset identifier
-        indices: List of row indices to retrieve
-        split: Dataset split to sample from (default: 'train')
-        config_name: Optional configuration name for multi-config datasets
-    Returns:
-        Dictionary containing the requested samples and metadata
-    Raises:
-        ValueError: If inputs are invalid
-        DatasetServiceError: If sampling fails
-    """
-    # Handle empty strings from Gradio (convert to None)
-    if config_name == "":
-        config_name = None
-    # Input validation using centralized validation
-    try:
-        dataset_id = validate_dataset_id(dataset_id)
-        config_name = validate_config_name(config_name)
-        split = validate_split_name(split)
-        indices = validate_indices(indices)
-    except ValidationError as e:
-        logger.error(f"Validation error: {format_validation_error(e)}")
-        raise ValueError(format_validation_error(e))
-    logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
-    try:
-        from datasets import load_dataset
-        # Load dataset without streaming to access by index
-        dataset = load_dataset(
-            dataset_id, name=config_name, split=split, streaming=False
-        )
-        # Validate indices are within bounds
-        max_index = max(indices)
-        if max_index >= len(dataset):
-            raise ValueError(
-                f"Index {max_index} is out of bounds for dataset with {len(dataset)} rows"
-            )
-        # Get samples by indices
-        samples = [dataset[i] for i in indices]
-        # Get dataset info for schema
-        service = get_dataset_service()
-        dataset_info = service.load_dataset_info(dataset_id, config_name)
-        # Prepare response
-        sample_data = {
-            "dataset_id": dataset_id,
-            "config_name": config_name,
-            "split": split,
-            "num_samples": len(samples),
-            "requested_indices": indices,
-            "data": samples,
-            "schema": dataset_info.get("features", {}),
-            "sample_info": {
-                "sampling_strategy": "by_indices",
-                "streaming_used": False,
-                "indices_requested": len(indices),
-            },
-        }
-        sample_data["summary"] = _generate_sample_summary(sample_data)
-        return sample_data
-    except Exception as e:
-        logger.error(f"Failed to sample by indices from {dataset_id}: {str(e)}")
-        raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")
 def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
@@ -307,28 +293,3 @@ def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
         summary_parts.append("Strategy: first N rows")
     return " | ".join(summary_parts)
-def get_available_splits(
-    dataset_id: str, config_name: Optional[str] = None
-) -> List[str]:
-    """
-    Get available splits for a dataset.
-    Args:
-        dataset_id: HuggingFace dataset identifier
-        config_name: Optional configuration name
-    Returns:
-        List of available split names
-    Raises:
-        DatasetServiceError: If unable to retrieve split information
-    """
-    try:
-        service = get_dataset_service()
-        metadata = service.load_dataset_info(dataset_id, config_name)
-        return list(metadata.get("splits", {}).keys())
-    except Exception as e:
-        logger.error(f"Failed to get splits for {dataset_id}: {str(e)}")
-        raise DatasetServiceError(f"Failed to get available splits: {str(e)}")

 """
 import logging
+import gradio as gr
+from typing import Optional, Dict, Any
 from hf_eda_mcp.config import get_config
+from hf_eda_mcp.services.dataset_service import get_dataset_service, DatasetServiceError
 from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError, NetworkError
 from hf_eda_mcp.validation import (
     validate_dataset_id,
     validate_config_name,
     validate_split_name,
     validate_sample_size,
     ValidationError,
     format_validation_error,
 )
 logger = logging.getLogger(__name__)
 # Default constants (can be overridden by config)
 DEFAULT_SAMPLE_SIZE = 10
 VALID_SPLITS = {"train", "validation", "test", "dev", "val"}
 def get_dataset_sample(
     dataset_id: str,
+    hf_api_token: gr.Header,
     split: str = "train",
     num_samples: int = DEFAULT_SAMPLE_SIZE,
     config_name: Optional[str] = None,
     try:
         # Get dataset service and load sample
+        service = get_dataset_service(hf_api_token=hf_api_token)
         sample_data = service.load_dataset_sample(
             dataset_id=dataset_id,
             split=split,
     except AuthenticationError as e:
         log_error_with_context(e, context, level=logging.WARNING)
         error_response = format_error_response(e, context)
         logger.info(f"Authentication error guidance: {error_response.get('suggestions', [])}")
         raise
         raise DatasetServiceError(f"Failed to sample dataset: {str(e)}") from e
+# def get_dataset_sample_with_indices(
+#     dataset_id: str,
+#     indices: List[int],
+#     split: str = "train",
+#     config_name: Optional[str] = None,
+# ) -> Dict[str, Any]:
+#     """
+#     Retrieve specific samples by their indices from a HuggingFace dataset.
+#     This function allows for targeted sampling by specifying exact row indices.
+#     Note: This requires loading the dataset in non-streaming mode.
+#     Args:
+#         dataset_id: HuggingFace dataset identifier
+#         indices: List of row indices to retrieve
+#         split: Dataset split to sample from (default: 'train')
+#         config_name: Optional configuration name for multi-config datasets
+#     Returns:
+#         Dictionary containing the requested samples and metadata
+#     Raises:
+#         ValueError: If inputs are invalid
+#         DatasetServiceError: If sampling fails
+#     """
+#     # Handle empty strings from Gradio (convert to None)
+#     if config_name == "":
+#         config_name = None
+#     # Input validation using centralized validation
+#     try:
+#         dataset_id = validate_dataset_id(dataset_id)
+#         config_name = validate_config_name(config_name)
+#         split = validate_split_name(split)
+#         indices = validate_indices(indices)
+#     except ValidationError as e:
+#         logger.error(f"Validation error: {format_validation_error(e)}")
+#         raise ValueError(format_validation_error(e))
+#     logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
+#     try:
+#         from datasets import load_dataset
+#         # Load dataset without streaming to access by index
+#         dataset = load_dataset(
+#             dataset_id, name=config_name, split=split, streaming=False
+#         )
+#         # Validate indices are within bounds
+#         max_index = max(indices)
+#         if max_index >= len(dataset):
+#             raise ValueError(
+#                 f"Index {max_index} is out of bounds for dataset with {len(dataset)} rows"
+#             )
+#         # Get samples by indices
+#         samples = [dataset[i] for i in indices]
+#         # Get dataset info for schema
+#         service = get_dataset_service(hf_api_token=hf_api_token)
+#         dataset_info = service.load_dataset_info(dataset_id, config_name)
+#         # Prepare response
+#         sample_data = {
+#             "dataset_id": dataset_id,
+#             "config_name": config_name,
+#             "split": split,
+#             "num_samples": len(samples),
+#             "requested_indices": indices,
+#             "data": samples,
+#             "schema": dataset_info.get("features", {}),
+#             "sample_info": {
+#                 "sampling_strategy": "by_indices",
+#                 "streaming_used": False,
+#                 "indices_requested": len(indices),
+#             },
+#         }
+#         sample_data["summary"] = _generate_sample_summary(sample_data)
+#         return sample_data
+#     except Exception as e:
+#         logger.error(f"Failed to sample by indices from {dataset_id}: {str(e)}")
+#         raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")
 def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
         summary_parts.append("Strategy: first N rows")
     return " | ".join(summary_parts)