Commit c7dd7b8
Parent(s): 11df203

Implement client and dataset service

Files changed:
- .kiro/specs/hf-eda-mcp-server/tasks.md +2 -2
- .kiro/steering/tech.md +7 -0
- pyproject.toml +3 -2
- scripts/__init__.py +0 -0
- scripts/playground/__init__.py +0 -0
- scripts/playground/hf_client_playground.py +31 -0
- src/hf_eda_mcp/integrations/hf_client.py +253 -2
- src/hf_eda_mcp/services/dataset_service.py +352 -2
.kiro/specs/hf-eda-mcp-server/tasks.md
CHANGED
@@ -7,13 +7,13 @@
 - _Requirements: 3.1, 4.1, 4.2_
 
 - [ ] 2. Implement HuggingFace integration layer
-- [ ] 2.1 Create HuggingFace client wrapper
+- [x] 2.1 Create HuggingFace client wrapper
 - Write HfClient class to handle authentication and API interactions
 - Implement dataset info retrieval using huggingface_hub
 - Add error handling for authentication and network issues
 - _Requirements: 1.2, 4.3_
 
-- [ ] 2.2 Implement dataset service with caching
+- [x] 2.2 Implement dataset service with caching
 - Create DatasetService class for centralized dataset operations
 - Add metadata caching to reduce API calls
 - Implement dataset loading and sampling functionality
.kiro/steering/tech.md
CHANGED
@@ -30,6 +30,13 @@ ruff check .
 ruff format .
 ```
 
+Use pdm to run tests or scripts once they are defined in pyproject.toml
+
+```bash
+# Example to run server
+pdm run hf-eda-mcp
+```
+
 ## MCP Integration
 - Designed to run as an MCP server
 - Provides tools accessible to MCP-compatible AI systems
pyproject.toml
CHANGED
@@ -22,8 +22,9 @@ requires = ["pdm-backend"]
 build-backend = "pdm.backend"
 
 
-[
-hf-eda-mcp = "hf_eda_mcp
+[tool.pdm.scripts]
+hf-eda-mcp = "python -m hf_eda_mcp"
+hf_client_playground = "python -m scripts.playground.hf_client_playground"
 
 [tool.pdm]
 distribution = true
scripts/__init__.py
ADDED
(empty file)
scripts/playground/__init__.py
ADDED
(empty file)
scripts/playground/hf_client_playground.py
ADDED
@@ -0,0 +1,31 @@
+import logging
+from pprint import pprint
+from hf_eda_mcp.integrations.hf_client import HfClient
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(
+    filename="scripts.log",
+    encoding='utf-8',
+    level=logging.DEBUG,
+    filemode="w",
+    format='%(asctime)s - %(levelname)s - %(message)s',
+)
+
+
+def authenticate():
+    client = HfClient()
+    client._authenticate()
+    return client
+
+
+def get_dataset_info(client: HfClient, dataset_id: str = "squad"):
+    metadata = client.get_dataset_info(dataset_id)
+    logger.info("Fetched %s dataset", dataset_id)
+    pprint(metadata, indent=4)
+
+
+
+if __name__ == "__main__":
+    client = authenticate()
+
+    get_dataset_info(client=client, dataset_id="nyu-mll/glue")
src/hf_eda_mcp/integrations/hf_client.py
CHANGED
@@ -1,7 +1,258 @@
 """
 HuggingFace client wrapper for API interactions.
 
-This module
+This module provides a wrapper around HuggingFace Hub API for dataset operations,
+including authentication, dataset info retrieval, and error handling.
 """
 
-
+import logging
+from typing import Optional, Dict, Any, List
+from huggingface_hub import HfApi
+from huggingface_hub.utils import RepositoryNotFoundError, GatedRepoError
+from requests.exceptions import RequestException, ConnectionError, Timeout
+
+logger = logging.getLogger()
+
+
+class HfClientError(Exception):
+    """Base exception for HuggingFace client errors."""
+
+    pass
+
+
+class AuthenticationError(HfClientError):
+    """Raised when authentication fails."""
+
+    pass
+
+
+class DatasetNotFoundError(HfClientError):
+    """Raised when a dataset is not found."""
+
+    pass
+
+
+class NetworkError(HfClientError):
+    """Raised when network operations fail."""
+
+    pass
+
+
+class HfClient:
+    """
+    HuggingFace client wrapper for dataset operations.
+
+    Handles authentication, dataset info retrieval, and provides
+    comprehensive error handling for API interactions.
+    """
+
+    def __init__(self, token: Optional[str] = None):
+        """
+        Initialize HuggingFace client.
+
+        Args:
+            token: Optional HuggingFace authentication token
+        """
+        self.token = token
+        self.api = HfApi(token=token)
+        self._authenticated = False
+
+        if token:
+            self._authenticate()
+
+    def _authenticate(self) -> None:
+        """
+        Authenticate with HuggingFace Hub using the provided token.
+
+        Raises:
+            AuthenticationError: If authentication fails
+        """
+        try:
+            # Test authentication by getting user info
+            user_info = self.api.whoami()
+            self._authenticated = True
+            logger.info(
+                f"Successfully authenticated as {user_info.get('name', 'unknown')}"
+            )
+        except Exception as e:
+            logger.error(f"Authentication failed: {str(e)}")
+            raise AuthenticationError(
+                f"Failed to authenticate with HuggingFace Hub: {str(e)}"
+            )
+
+    def get_dataset_info(
+        self, dataset_id: str, config_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Retrieve comprehensive dataset information from HuggingFace Hub.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue')
+            config_name: Optional configuration name for multi-config datasets
+
+        Returns:
+            Dictionary containing dataset metadata including:
+            - Basic info (size, splits, features)
+            - Configuration details
+            - Download statistics
+            - Dataset card information
+
+        Raises:
+            DatasetNotFoundError: If dataset doesn't exist
+            AuthenticationError: If dataset is private and authentication fails
+            NetworkError: If network request fails
+        """
+        try:
+            # Get dataset info from HuggingFace Hub
+            dataset_info = self.api.dataset_info(repo_id=dataset_id, revision="main")
+
+            # Format the response
+            metadata = {
+                "id": dataset_info.id,
+                "author": dataset_info.author or "unknown",
+                "description": dataset_info.description or "",
+                "tags": dataset_info.tags or [],
+                "downloads": getattr(dataset_info, "downloads", 0),
+                "likes": getattr(dataset_info, "likes", 0),
+                "created_at": dataset_info.created_at.isoformat()
+                if dataset_info.created_at
+                else None,
+                "last_modified": dataset_info.last_modified.isoformat()
+                if dataset_info.last_modified
+                else None,
+                "size_bytes": getattr(dataset_info, "size_in_bytes", 0),
+                "configs": [],
+                "splits": {},
+                "features": {},
+            }
+
+            # Extract configuration information
+            if hasattr(dataset_info, "card_data") and dataset_info.card_data:
+                configs = getattr(dataset_info.card_data, "configs", [])
+                if configs:
+                    # Handle both dict and object configs
+                    config_names = []
+                    for config in configs:
+                        if hasattr(config, "config_name"):
+                            config_names.append(config.config_name)
+                        elif isinstance(config, dict) and "config_name" in config:
+                            config_names.append(config["config_name"])
+                    metadata["configs"] = config_names
+
+            # If no configs found in card_data, try to get from siblings
+            if not metadata["configs"] and dataset_info.siblings:
+                # Look for config files to infer configurations
+                config_files = [
+                    s.rfilename
+                    for s in dataset_info.siblings
+                    if s.rfilename.endswith(".json") and "/" in s.rfilename
+                ]
+                if config_files:
+                    metadata["configs"] = list(
+                        set([f.split("/")[0] for f in config_files])
+                    )
+
+            # Try to get more detailed info using datasets library approach
+            try:
+                from datasets import get_dataset_config_names, get_dataset_split_names
+
+                # Get available configurations
+                try:
+                    config_names = get_dataset_config_names(dataset_id)
+                    if config_names:
+                        metadata["configs"] = config_names
+                except Exception:
+                    # If we can't get config names, use what we have
+                    pass
+
+                # Get splits for the specified or default configuration
+                target_config = config_name or (
+                    metadata["configs"][0] if metadata["configs"] else None
+                )
+                if target_config:
+                    try:
+                        split_names = get_dataset_split_names(
+                            dataset_id, config_name=target_config
+                        )
+                        metadata["splits"] = {
+                            split: 0 for split in split_names
+                        }  # Size will be filled later
+                    except Exception:
+                        # If we can't get split info, continue without it
+                        pass
+
+            except ImportError:
+                logger.warning(
+                    "datasets library not available for detailed config info"
+                )
+
+            return metadata
+
+        except RepositoryNotFoundError:
+            raise DatasetNotFoundError(
+                f"Dataset '{dataset_id}' not found on HuggingFace Hub"
+            )
+        except GatedRepoError:
+            raise AuthenticationError(
+                f"Dataset '{dataset_id}' is private or gated. "
+                "Please provide a valid authentication token or request access."
+            )
+        except (ConnectionError, Timeout) as e:
+            raise NetworkError(f"Network error while fetching dataset info: {str(e)}")
+        except RequestException as e:
+            raise NetworkError(f"Request failed: {str(e)}")
+        except Exception as e:
+            logger.error(
+                f"Unexpected error getting dataset info for {dataset_id}: {str(e)}"
+            )
+            raise HfClientError(f"Failed to get dataset info: {str(e)}")
+
+    def list_dataset_configs(self, dataset_id: str) -> List[str]:
+        """
+        List available configurations for a dataset.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+
+        Returns:
+            List of configuration names
+
+        Raises:
+            DatasetNotFoundError: If dataset doesn't exist
+            NetworkError: If network request fails
+        """
+        try:
+            from datasets import get_dataset_config_names
+
+            return get_dataset_config_names(dataset_id)
+        except Exception:
+            # Fallback to getting info and extracting configs
+            dataset_info = self.get_dataset_info(dataset_id)
+            return dataset_info.get("configs", [])
+
+    def validate_dataset_access(
+        self, dataset_id: str, config_name: Optional[str] = None
+    ) -> bool:
+        """
+        Validate that a dataset can be accessed with current authentication.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            config_name: Optional configuration name
+
+        Returns:
+            True if dataset is accessible, False otherwise
+        """
+        try:
+            self.get_dataset_info(dataset_id, config_name)
+            return True
+        except (DatasetNotFoundError, AuthenticationError):
+            return False
+        except Exception:
+            # For other errors (network, etc.), assume dataset exists but there's a temporary issue
+            return True
+
+    @property
+    def is_authenticated(self) -> bool:
+        """Check if client is authenticated."""
+        return self._authenticated
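For orientation, a minimal sketch of how the new client can be driven, mirroring the playground script above. The dataset id and printed fields are illustrative; the class, method, and exception names all come from hf_client.py in this commit:

```python
from hf_eda_mcp.integrations.hf_client import (
    HfClient,
    AuthenticationError,
    DatasetNotFoundError,
)

# Anonymous client; pass token="..." to access private or gated datasets.
client = HfClient()

try:
    # Returns the metadata dict assembled in get_dataset_info above.
    info = client.get_dataset_info("squad")
    print(info["id"], info["downloads"], info["configs"])
except DatasetNotFoundError as err:
    print(f"Dataset not found: {err}")
except AuthenticationError as err:
    print(f"Access denied: {err}")
```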
src/hf_eda_mcp/services/dataset_service.py
CHANGED
@@ -1,7 +1,357 @@
 """
 Dataset service for centralized dataset operations and caching.
 
-This module
+This module provides a centralized service for dataset operations including
+metadata caching, dataset loading, and sampling functionality.
 """
 
-
+import logging
+import os
+import json
+import time
+from typing import Optional, Dict, Any
+from pathlib import Path
+from datasets import load_dataset
+from datasets.utils.logging import disable_progress_bar
+
+from hf_eda_mcp.integrations.hf_client import HfClient, HfClientError, DatasetNotFoundError
+
+logger = logging.getLogger(__name__)
+
+# Disable datasets progress bars for cleaner logging
+disable_progress_bar()
+
+
+class DatasetServiceError(Exception):
+    """Base exception for dataset service errors."""
+    pass
+
+
+class CacheError(DatasetServiceError):
+    """Raised when cache operations fail."""
+    pass
+
+
+class DatasetService:
+    """
+    Centralized service for dataset operations with caching support.
+
+    Provides metadata caching, dataset loading, and sampling functionality
+    while managing authentication and error handling.
+    """
+
+    def __init__(
+        self,
+        cache_dir: Optional[str] = None,
+        token: Optional[str] = None,
+        cache_ttl: int = 3600  # 1 hour default TTL
+    ):
+        """
+        Initialize dataset service with optional caching and authentication.
+
+        Args:
+            cache_dir: Directory for caching metadata and samples
+            token: HuggingFace authentication token
+            cache_ttl: Cache time-to-live in seconds (default: 1 hour)
+        """
+        self.hf_client = HfClient(token=token)
+        self.cache_ttl = cache_ttl
+
+        # Set up cache directory
+        if cache_dir is None:
+            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "hf_eda_mcp")
+
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+        # Cache subdirectories
+        self.metadata_cache_dir = self.cache_dir / "metadata"
+        self.sample_cache_dir = self.cache_dir / "samples"
+
+        self.metadata_cache_dir.mkdir(exist_ok=True)
+        self.sample_cache_dir.mkdir(exist_ok=True)
+
+        logger.info(f"DatasetService initialized with cache dir: {self.cache_dir}")
+
+    def _get_cache_key(self, dataset_id: str, config_name: Optional[str] = None) -> str:
+        """Generate cache key for dataset metadata."""
+        if config_name:
+            return f"{dataset_id}_{config_name}".replace("/", "_")
+        return dataset_id.replace("/", "_")
+
+    def _get_sample_cache_key(
+        self,
+        dataset_id: str,
+        split: str,
+        num_samples: int,
+        config_name: Optional[str] = None
+    ) -> str:
+        """Generate cache key for dataset samples."""
+        base_key = self._get_cache_key(dataset_id, config_name)
+        return f"{base_key}_{split}_{num_samples}"
+
+    def _is_cache_valid(self, cache_file: Path) -> bool:
+        """Check if cache file exists and is within TTL."""
+        if not cache_file.exists():
+            return False
+
+        # Check if cache is within TTL
+        cache_age = time.time() - cache_file.stat().st_mtime
+        return cache_age < self.cache_ttl
+
+    def _save_to_cache(self, cache_file: Path, data: Dict[str, Any]) -> None:
+        """Save data to cache file."""
+        try:
+            cache_file.parent.mkdir(parents=True, exist_ok=True)
+            with open(cache_file, 'w', encoding='utf-8') as f:
+                json.dump(data, f, indent=2, ensure_ascii=False)
+            logger.debug(f"Saved data to cache: {cache_file}")
+        except Exception as e:
+            logger.warning(f"Failed to save cache file {cache_file}: {e}")
+            raise CacheError(f"Failed to save cache: {e}")
+
+    def _load_from_cache(self, cache_file: Path) -> Optional[Dict[str, Any]]:
+        """Load data from cache file."""
+        try:
+            if not self._is_cache_valid(cache_file):
+                return None
+
+            with open(cache_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.debug(f"Loaded data from cache: {cache_file}")
+            return data
+        except Exception as e:
+            logger.warning(f"Failed to load cache file {cache_file}: {e}")
+            return None
+
+    def load_dataset_info(self, dataset_id: str, config_name: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Load dataset information from HuggingFace Hub with caching.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            config_name: Optional configuration name
+
+        Returns:
+            Dictionary containing dataset metadata
+
+        Raises:
+            DatasetNotFoundError: If dataset doesn't exist
+            AuthenticationError: If dataset is private and authentication fails
+        """
+        cache_key = self._get_cache_key(dataset_id, config_name)
+        cache_file = self.metadata_cache_dir / f"{cache_key}.json"
+
+        # Try to load from cache first
+        cached_data = self._load_from_cache(cache_file)
+        if cached_data is not None:
+            logger.debug(f"Using cached metadata for {dataset_id}")
+            return cached_data
+
+        # Fetch from HuggingFace Hub
+        try:
+            logger.info(f"Fetching metadata for dataset: {dataset_id}")
+            metadata = self.hf_client.get_dataset_info(dataset_id, config_name)
+
+            # Add cache timestamp
+            metadata['_cached_at'] = time.time()
+
+            # Save to cache
+            self._save_to_cache(cache_file, metadata)
+
+            return metadata
+
+        except HfClientError:
+            # Re-raise HfClient errors as-is
+            raise
+
+    def load_dataset_sample(
+        self,
+        dataset_id: str,
+        split: str = "train",
+        num_samples: int = 10,
+        config_name: Optional[str] = None,
+        streaming: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Load samples from the specified dataset with caching.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            split: Dataset split to sample from
+            num_samples: Number of samples to retrieve
+            config_name: Optional configuration name
+            streaming: Whether to use streaming mode for large datasets
+
+        Returns:
+            Dictionary containing sampled data and metadata
+
+        Raises:
+            DatasetNotFoundError: If dataset or split doesn't exist
+            DatasetServiceError: If sampling fails
+        """
+        # For small samples, check cache first
+        if num_samples <= 100:  # Only cache small samples
+            cache_key = self._get_sample_cache_key(dataset_id, split, num_samples, config_name)
+            cache_file = self.sample_cache_dir / f"{cache_key}.json"
+
+            cached_data = self._load_from_cache(cache_file)
+            if cached_data is not None:
+                logger.debug(f"Using cached sample for {dataset_id}")
+                return cached_data
+
+        try:
+            logger.info(f"Loading sample from dataset: {dataset_id}, split: {split}")
+
+            # Load dataset with streaming for efficiency
+            dataset = load_dataset(
+                dataset_id,
+                name=config_name,
+                split=split,
+                streaming=streaming
+            )
+
+            # Take the requested number of samples
+            if streaming:
+                # For streaming datasets, take samples from iterator
+                samples = []
+                for i, sample in enumerate(dataset):
+                    if i >= num_samples:
+                        break
+                    samples.append(sample)
+            else:
+                # For non-streaming datasets, use select
+                max_samples = min(num_samples, len(dataset))
+                samples = dataset.select(range(max_samples))
+                samples = [samples[i] for i in range(len(samples))]
+
+            # Get dataset info for schema
+            dataset_info = self.load_dataset_info(dataset_id, config_name)
+
+            # Prepare response
+            sample_data = {
+                'dataset_id': dataset_id,
+                'config_name': config_name,
+                'split': split,
+                'num_samples': len(samples),
+                'requested_samples': num_samples,
+                'data': samples,
+                'schema': dataset_info.get('features', {}),
+                '_sampled_at': time.time()
+            }
+
+            # Cache small samples
+            if num_samples <= 100:
+                try:
+                    self._save_to_cache(cache_file, sample_data)
+                except CacheError:
+                    # Don't fail if caching fails
+                    pass
+
+            return sample_data
+
+        except Exception as e:
+            logger.error(f"Failed to load dataset sample: {e}")
+            if "not found" in str(e).lower():
+                raise DatasetNotFoundError(f"Dataset '{dataset_id}' or split '{split}' not found")
+            raise DatasetServiceError(f"Failed to load dataset sample: {e}")
+
+    def get_cached_metadata(self, dataset_id: str, config_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
+        """
+        Retrieve cached metadata without making API calls.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            config_name: Optional configuration name
+
+        Returns:
+            Cached metadata dictionary or None if not cached/expired
+        """
+        cache_key = self._get_cache_key(dataset_id, config_name)
+        cache_file = self.metadata_cache_dir / f"{cache_key}.json"
+
+        return self._load_from_cache(cache_file)
+
+    def clear_cache(self, dataset_id: Optional[str] = None) -> None:
+        """
+        Clear cached data for a specific dataset or all datasets.
+
+        Args:
+            dataset_id: Optional dataset ID to clear. If None, clears all cache.
+        """
+        try:
+            if dataset_id is None:
+                # Clear all cache
+                for cache_file in self.metadata_cache_dir.glob("*.json"):
+                    cache_file.unlink()
+                for cache_file in self.sample_cache_dir.glob("*.json"):
+                    cache_file.unlink()
+                logger.info("Cleared all cache")
+            else:
+                # Clear cache for specific dataset
+                cache_key = self._get_cache_key(dataset_id)
+
+                # Clear metadata cache
+                for cache_file in self.metadata_cache_dir.glob(f"{cache_key}*.json"):
+                    cache_file.unlink()
+
+                # Clear sample cache
+                for cache_file in self.sample_cache_dir.glob(f"{cache_key}*.json"):
+                    cache_file.unlink()
+
+                logger.info(f"Cleared cache for dataset: {dataset_id}")
+
+        except Exception as e:
+            logger.warning(f"Failed to clear cache: {e}")
+            raise CacheError(f"Failed to clear cache: {e}")
+
+    def get_cache_stats(self) -> Dict[str, Any]:
+        """
+        Get statistics about the current cache.
+
+        Returns:
+            Dictionary with cache statistics
+        """
+        try:
+            metadata_files = list(self.metadata_cache_dir.glob("*.json"))
+            sample_files = list(self.sample_cache_dir.glob("*.json"))
+
+            # Calculate cache sizes
+            metadata_size = sum(f.stat().st_size for f in metadata_files)
+            sample_size = sum(f.stat().st_size for f in sample_files)
+
+            return {
+                'cache_dir': str(self.cache_dir),
+                'metadata_files': len(metadata_files),
+                'sample_files': len(sample_files),
+                'total_files': len(metadata_files) + len(sample_files),
+                'metadata_size_bytes': metadata_size,
+                'sample_size_bytes': sample_size,
+                'total_size_bytes': metadata_size + sample_size,
+                'cache_ttl_seconds': self.cache_ttl
+            }
+        except Exception as e:
+            logger.warning(f"Failed to get cache stats: {e}")
+            return {'error': str(e)}
+
+    def validate_dataset_access(
+        self,
+        dataset_id: str,
+        config_name: Optional[str] = None
+    ) -> bool:
+        """
+        Validate that a dataset can be accessed with current authentication.
+
+        Args:
+            dataset_id: HuggingFace dataset identifier
+            config_name: Optional configuration name
+
+        Returns:
+            True if dataset is accessible, False otherwise
+        """
+        return self.hf_client.validate_dataset_access(dataset_id, config_name)
+
+    @property
+    def is_authenticated(self) -> bool:
+        """Check if the service is authenticated with HuggingFace."""
+        return self.hf_client.is_authenticated
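A matching sketch for the service layer. The TTL, dataset id, and config name are illustrative; the default cache directory (~/.cache/hf_eda_mcp) follows from __init__ in dataset_service.py above:

```python
from hf_eda_mcp.services.dataset_service import DatasetService

# Override the 1-hour default TTL with a 10-minute one.
service = DatasetService(cache_ttl=600)

# First call hits the Hub and populates the metadata cache.
metadata = service.load_dataset_info("nyu-mll/glue", config_name="mrpc")

# Small samples (<= 100 rows) are cached as JSON alongside the metadata.
sample = service.load_dataset_sample(
    "nyu-mll/glue", split="train", num_samples=5, config_name="mrpc"
)
print(sample["num_samples"], "rows; schema fields:", sorted(sample["schema"]))

stats = service.get_cache_stats()
print(stats["total_files"], "cached files,", stats["total_size_bytes"], "bytes")

service.clear_cache("nyu-mll/glue")  # drop cached metadata and samples for this id
```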