Commit 2762e2a · Parent: c7dd7b8

Core EDA tools
.kiro/specs/hf-eda-mcp-server/tasks.md
CHANGED

@@ -6,7 +6,7 @@
   - Set up basic package initialization files
   - _Requirements: 3.1, 4.1, 4.2_
 
-- [ ] 2. Implement HuggingFace integration layer
+- [x] 2. Implement HuggingFace integration layer
 - [x] 2.1 Create HuggingFace client wrapper
   - Write HfClient class to handle authentication and API interactions
   - Implement dataset info retrieval using huggingface_hub

@@ -19,20 +19,20 @@
   - Implement dataset loading and sampling functionality
   - _Requirements: 1.1, 2.1, 2.2_
 
-- [ ] 3. Create core EDA tools
-- [ ] 3.1 Implement dataset metadata tool
+- [x] 3. Create core EDA tools
+- [x] 3.1 Implement dataset metadata tool
   - Write get_dataset_metadata function to retrieve comprehensive dataset information
   - Format metadata response with dataset size, features, splits, and configuration details
   - Handle multi-configuration datasets appropriately
   - _Requirements: 1.1, 1.3, 1.4_
 
-- [ ] 3.2 Implement dataset sampling tool
+- [x] 3.2 Implement dataset sampling tool
   - Create get_dataset_sample function for retrieving dataset samples
   - Support different splits (train, validation, test) and configurable sample sizes
   - Implement efficient sampling strategies for large datasets
   - _Requirements: 2.1, 2.2, 2.3_
 
-- [ ] 3.3 Implement basic analysis tool
+- [x] 3.3 Implement basic analysis tool
   - Write analyze_dataset_features function for exploratory data analysis
   - Generate feature statistics, missing value analysis, and data quality insights
   - Handle different data types (numerical, categorical, text) appropriately
src/hf_eda_mcp/tools/__init__.py
CHANGED

@@ -4,4 +4,21 @@ EDA tools module for HuggingFace datasets.
 This package contains individual EDA functions that will be exposed as MCP tools.
 """
 
-
+from .metadata import get_dataset_metadata, validate_dataset_metadata_inputs
+from .sampling import get_dataset_sample, get_dataset_sample_with_indices, get_available_splits
+from .analysis import analyze_dataset_features, validate_analysis_inputs
+
+__all__ = [
+    # Metadata tools
+    'get_dataset_metadata',
+    'validate_dataset_metadata_inputs',
+
+    # Sampling tools
+    'get_dataset_sample',
+    'get_dataset_sample_with_indices',
+    'get_available_splits',
+
+    # Analysis tools
+    'analyze_dataset_features',
+    'validate_analysis_inputs'
+]
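With these re-exports in place, all three tool families are importable from one namespace. A minimal usage sketch (hypothetical session; it assumes the package is installed and the HuggingFace Hub is reachable, and reuses the "imdb" example from the docstrings below):

# Hypothetical usage of the public API exposed by hf_eda_mcp.tools.
from hf_eda_mcp.tools import (
    get_dataset_metadata,
    get_dataset_sample,
    analyze_dataset_features,
)

metadata = get_dataset_metadata("imdb")
print(metadata["summary"])                        # e.g. "Dataset: imdb | Size: ... | Splits: ..."

sample = get_dataset_sample("imdb", split="train", num_samples=5)
print(sample["summary"])

analysis = analyze_dataset_features("imdb", sample_size=500)
print(analysis["data_quality"]["quality_level"])  # "high" / "medium" / "low"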
src/hf_eda_mcp/tools/analysis.py
CHANGED

@@ -1,7 +1,544 @@
 """
-Basic analysis
+Basic analysis tools for exploratory data analysis of HuggingFace datasets.
 
-This module
+This module provides tools for performing exploratory data analysis including
+feature statistics, missing value analysis, and data quality insights.
 """
 
-
+import logging
+import math
+import statistics
+from typing import Optional, Dict, Any, List
+from collections import Counter
+from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+
+logger = logging.getLogger(__name__)
+
+# Global dataset service instance
+_dataset_service: Optional[DatasetService] = None
+
+# Constants for analysis
+DEFAULT_ANALYSIS_SAMPLE_SIZE = 1000
+MAX_ANALYSIS_SAMPLE_SIZE = 50000
+MAX_UNIQUE_VALUES_TO_SHOW = 20
+
+
+def get_dataset_service() -> DatasetService:
+    """Get or create the global dataset service instance."""
+    global _dataset_service
+    if _dataset_service is None:
+        _dataset_service = DatasetService()
+    return _dataset_service
+
+
+def analyze_dataset_features(
+    dataset_id: str,
+    split: str = "train",
+    sample_size: int = DEFAULT_ANALYSIS_SAMPLE_SIZE,
+    config_name: Optional[str] = None,
+) -> Dict[str, Any]:
+    """
+    Perform basic exploratory analysis on dataset features.
+
+    This function analyzes dataset features to provide insights into data types,
+    distributions, missing values, and data quality. It handles different data
+    types (numerical, categorical, text) appropriately and generates comprehensive
+    statistics for each feature.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier (e.g., 'imdb', 'squad')
+        split: Dataset split to analyze (default: 'train')
+        sample_size: Number of samples to use for analysis (default: 1000, max: 50000)
+        config_name: Optional configuration name for multi-config datasets
+
+    Returns:
+        Dictionary containing comprehensive feature analysis:
+        - dataset_info: Basic dataset information
+        - sample_info: Information about the sample used for analysis
+        - features: Dictionary with analysis for each feature including:
+            - feature_type: Detected type (numerical, categorical, text, etc.)
+            - missing_count: Number of missing/null values
+            - missing_percentage: Percentage of missing values
+            - unique_count: Number of unique values
+            - statistics: Type-specific statistics (mean, std for numerical; top values for categorical)
+        - summary: Overall analysis summary
+        - data_quality: Data quality assessment
+
+    Raises:
+        ValueError: If inputs are invalid
+        DatasetNotFoundError: If dataset or split doesn't exist
+        AuthenticationError: If dataset is private and authentication fails
+        DatasetServiceError: If analysis fails for other reasons
+
+    Example:
+        >>> analysis = analyze_dataset_features("imdb", sample_size=500)
+        >>> for feature_name, feature_analysis in analysis['features'].items():
+        ...     print(f"{feature_name}: {feature_analysis['feature_type']}")
+        ...     print(f"  Missing: {feature_analysis['missing_percentage']:.1f}%")
+
+        >>> # Check data quality
+        >>> quality = analysis['data_quality']
+        >>> print(f"Overall quality score: {quality['quality_score']:.2f}")
+    """
+    # Input validation
+    validate_analysis_inputs(dataset_id, split, sample_size, config_name)
+
+    logger.info(
+        f"Analyzing features for dataset: {dataset_id}, split: {split}, "
+        f"sample_size: {sample_size}"
+        + (f", config: {config_name}" if config_name else "")
+    )
+
+    try:
+        # Get dataset service and load sample for analysis
+        service = get_dataset_service()
+        sample_data = service.load_dataset_sample(
+            dataset_id=dataset_id,
+            split=split,
+            num_samples=sample_size,
+            config_name=config_name,
+            streaming=True,
+        )
+
+        # Note: We could get dataset metadata here for additional context if needed
+
+        # Perform feature analysis
+        features_analysis = {}
+        data_samples = sample_data["data"]
+
+        if not data_samples:
+            raise DatasetServiceError("No data samples available for analysis")
+
+        # Determine feature names from first sample
+        first_sample = data_samples[0]
+        if not isinstance(first_sample, dict):
+            raise DatasetServiceError(
+                "Dataset samples are not in expected dictionary format"
+            )
+
+        feature_names = list(first_sample.keys())
+
+        # Analyze each feature
+        for feature_name in feature_names:
+            logger.debug(f"Analyzing feature: {feature_name}")
+            feature_analysis = _analyze_single_feature(feature_name, data_samples)
+            features_analysis[feature_name] = feature_analysis
+
+        # Generate overall analysis
+        analysis_result = {
+            "dataset_info": {
+                "dataset_id": dataset_id,
+                "config_name": config_name,
+                "split": split,
+                "total_features": len(feature_names),
+                "sample_size_used": len(data_samples),
+                "sample_size_requested": sample_size,
+            },
+            "sample_info": {
+                "sampling_method": "sequential_head",
+                "represents_full_dataset": len(data_samples) >= sample_size,
+                "analysis_timestamp": sample_data.get("_sampled_at"),
+            },
+            "features": features_analysis,
+            "data_quality": _assess_data_quality(features_analysis),
+            "summary": _generate_analysis_summary(features_analysis, len(data_samples)),
+        }
+
+        logger.info(
+            f"Successfully analyzed {len(feature_names)} features from {dataset_id}"
+        )
+        return analysis_result
+
+    except (DatasetNotFoundError, AuthenticationError):
+        # Re-raise these specific errors as-is
+        raise
+    except Exception as e:
+        logger.error(f"Failed to analyze dataset {dataset_id}: {str(e)}")
+        raise DatasetServiceError(f"Failed to analyze dataset features: {str(e)}")
+
+
+def _analyze_single_feature(
+    feature_name: str, data_samples: List[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """
+    Analyze a single feature across all data samples.
+
+    Args:
+        feature_name: Name of the feature to analyze
+        data_samples: List of data sample dictionaries
+
+    Returns:
+        Dictionary containing feature analysis results
+    """
+    # Extract values for this feature
+    values = []
+    missing_count = 0
+
+    for sample in data_samples:
+        value = sample.get(feature_name)
+        if (
+            value is None
+            or value == ""
+            or (isinstance(value, float) and str(value).lower() == "nan")
+        ):
+            missing_count += 1
+        else:
+            values.append(value)
+
+    total_count = len(data_samples)
+    missing_percentage = (missing_count / total_count) * 100 if total_count > 0 else 0
+
+    # Determine feature type and compute statistics
+    feature_type, statistics_dict = _determine_feature_type_and_stats(values)
+
+    # Count unique values
+    unique_count = len(set(str(v) for v in values)) if values else 0
+
+    return {
+        "feature_type": feature_type,
+        "missing_count": missing_count,
+        "missing_percentage": missing_percentage,
+        "unique_count": unique_count,
+        "total_count": total_count,
+        "non_missing_count": len(values),
+        "statistics": statistics_dict,
+        "sample_values": values[:5] if values else [],  # First 5 values as examples
+    }
+
+
+def _determine_feature_type_and_stats(values: List[Any]) -> tuple[str, Dict[str, Any]]:
+    """
+    Determine the type of a feature and compute appropriate statistics.
+
+    Args:
+        values: List of non-missing values for the feature
+
+    Returns:
+        Tuple of (feature_type, statistics_dict)
+    """
+    if not values:
+        return "unknown", {}
+
+    # Check if all values are numeric
+    numeric_values = []
+    for value in values:
+        try:
+            if isinstance(value, (int, float)):
+                numeric_values.append(float(value))
+            elif isinstance(value, str):
+                # Try to convert string to number
+                numeric_values.append(float(value))
+            else:
+                # Not numeric
+                break
+        except (ValueError, TypeError):
+            # Not numeric
+            break
+    else:
+        # All values are numeric
+        if len(numeric_values) == len(values):
+            return "numerical", _compute_numerical_statistics(numeric_values)
+
+    # Check if values are boolean-like
+    boolean_values = set(str(v).lower() for v in values)
+    if boolean_values.issubset({"true", "false", "1", "0", "yes", "no"}):
+        return "boolean", _compute_categorical_statistics(values)
+
+    # Check if it's text (strings with average length > 10)
+    if all(isinstance(v, str) for v in values):
+        avg_length = sum(len(v) for v in values) / len(values)
+        if avg_length > 10:
+            return "text", _compute_text_statistics(values)
+
+    # Default to categorical
+    return "categorical", _compute_categorical_statistics(values)
+
+
+def _compute_numerical_statistics(values: List[float]) -> Dict[str, Any]:
+    """Compute statistics for numerical features."""
+    if not values:
+        return {}
+
+    try:
+        stats = {
+            "count": len(values),
+            "mean": statistics.mean(values),
+            "median": statistics.median(values),
+            "min": min(values),
+            "max": max(values),
+            "range": max(values) - min(values),
+        }
+
+        if len(values) > 1:
+            stats["std"] = statistics.stdev(values)
+            stats["variance"] = statistics.variance(values)
+
+        # Quartiles
+        sorted_values = sorted(values)
+        n = len(sorted_values)
+        if n >= 4:
+            stats["q1"] = sorted_values[n // 4]
+            stats["q3"] = sorted_values[3 * n // 4]
+            stats["iqr"] = stats["q3"] - stats["q1"]
+
+        return stats
+    except Exception as e:
+        logger.warning(f"Failed to compute numerical statistics: {e}")
+        return {"count": len(values), "error": str(e)}
+
+
+def _compute_categorical_statistics(values: List[Any]) -> Dict[str, Any]:
+    """Compute statistics for categorical features."""
+    if not values:
+        return {}
+
+    try:
+        # Convert all values to strings for consistent counting
+        str_values = [str(v) for v in values]
+        value_counts = Counter(str_values)
+
+        stats = {
+            "count": len(values),
+            "unique_count": len(value_counts),
+            "most_common": value_counts.most_common(MAX_UNIQUE_VALUES_TO_SHOW),
+            "top_value": value_counts.most_common(1)[0] if value_counts else None,
+        }
+
+        # Calculate Shannon entropy in bits (measure of diversity)
+        if len(value_counts) > 1:
+            total = len(str_values)
+            entropy = -sum(
+                (count / total) * math.log2(count / total)
+                for count in value_counts.values()
+                if count > 0
+            )
+            stats["entropy"] = entropy
+
+        return stats
+    except Exception as e:
+        logger.warning(f"Failed to compute categorical statistics: {e}")
+        return {"count": len(values), "error": str(e)}
+
+
+def _compute_text_statistics(values: List[str]) -> Dict[str, Any]:
+    """Compute statistics for text features."""
+    if not values:
+        return {}
+
+    try:
+        lengths = [len(v) for v in values]
+        word_counts = [len(v.split()) for v in values]
+
+        stats = {
+            "count": len(values),
+            "avg_length": statistics.mean(lengths),
+            "min_length": min(lengths),
+            "max_length": max(lengths),
+            "avg_word_count": statistics.mean(word_counts),
+            "min_word_count": min(word_counts),
+            "max_word_count": max(word_counts),
+        }
+
+        # Sample of values (first few)
+        stats["sample_texts"] = values[:3]
+
+        return stats
+    except Exception as e:
+        logger.warning(f"Failed to compute text statistics: {e}")
+        return {"count": len(values), "error": str(e)}
+
+
+def _assess_data_quality(
+    features_analysis: Dict[str, Dict[str, Any]],
+) -> Dict[str, Any]:
+    """
+    Assess overall data quality based on feature analysis.
+
+    Args:
+        features_analysis: Dictionary of feature analyses
+
+    Returns:
+        Dictionary containing data quality assessment
+    """
+    if not features_analysis:
+        return {"quality_score": 0.0, "issues": ["No features to analyze"]}
+
+    total_features = len(features_analysis)
+    issues = []
+    quality_factors = []
+
+    # Check missing value rates
+    high_missing_features = 0
+    total_missing_rate = 0
+
+    for feature_name, analysis in features_analysis.items():
+        missing_pct = analysis.get("missing_percentage", 0)
+        total_missing_rate += missing_pct
+
+        if missing_pct > 50:
+            high_missing_features += 1
+            issues.append(
+                f"Feature '{feature_name}' has {missing_pct:.1f}% missing values"
+            )
+        elif missing_pct > 20:
+            issues.append(
+                f"Feature '{feature_name}' has {missing_pct:.1f}% missing values"
+            )
+
+    avg_missing_rate = total_missing_rate / total_features
+
+    # Quality score calculation (0-1 scale)
+    missing_score = max(0, 1 - (avg_missing_rate / 100))
+    quality_factors.append(("missing_values", missing_score))
+
+    # Check for features with very low diversity
+    low_diversity_features = 0
+    for feature_name, analysis in features_analysis.items():
+        unique_count = analysis.get("unique_count", 0)
+        total_count = analysis.get("total_count", 1)
+        diversity_ratio = unique_count / total_count if total_count > 0 else 0
+
+        if diversity_ratio < 0.01 and analysis.get("feature_type") != "boolean":
+            low_diversity_features += 1
+            issues.append(
+                f"Feature '{feature_name}' has very low diversity ({unique_count} unique values)"
+            )
+
+    diversity_score = max(0, 1 - (low_diversity_features / total_features))
+    quality_factors.append(("diversity", diversity_score))
+
+    # Overall quality score (weighted average)
+    weights = {"missing_values": 0.6, "diversity": 0.4}
+    quality_score = sum(weights[factor] * score for factor, score in quality_factors)
+
+    # Quality assessment
+    if quality_score >= 0.8:
+        quality_level = "high"
+    elif quality_score >= 0.6:
+        quality_level = "medium"
+    else:
+        quality_level = "low"
+
+    return {
+        "quality_score": quality_score,
+        "quality_level": quality_level,
+        "avg_missing_rate": avg_missing_rate,
+        "high_missing_features": high_missing_features,
+        "low_diversity_features": low_diversity_features,
+        "issues": issues,
+        "recommendations": _generate_quality_recommendations(issues, quality_score),
+    }
+
+
+def _generate_quality_recommendations(
+    issues: List[str], quality_score: float
+) -> List[str]:
+    """Generate recommendations based on data quality issues."""
+    recommendations = []
+
+    if quality_score < 0.6:
+        recommendations.append(
+            "Consider data cleaning and preprocessing before analysis"
+        )
+
+    if any("missing values" in issue for issue in issues):
+        recommendations.append("Handle missing values through imputation or removal")
+
+    if any("low diversity" in issue for issue in issues):
+        recommendations.append(
+            "Review features with low diversity - they may not be informative"
+        )
+
+    if not recommendations:
+        recommendations.append("Data quality looks good for analysis")
+
+    return recommendations
+
+
+def _generate_analysis_summary(
+    features_analysis: Dict[str, Dict[str, Any]], sample_size: int
+) -> str:
+    """Generate a human-readable summary of the analysis."""
+    if not features_analysis:
+        return "No features analyzed"
+
+    total_features = len(features_analysis)
+
+    # Count feature types
+    type_counts = Counter(
+        analysis.get("feature_type", "unknown")
+        for analysis in features_analysis.values()
+    )
+
+    # Calculate average missing rate
+    missing_rates = [
+        analysis.get("missing_percentage", 0) for analysis in features_analysis.values()
+    ]
+    avg_missing = statistics.mean(missing_rates) if missing_rates else 0
+
+    summary_parts = [f"Analyzed {total_features} features from {sample_size} samples"]
+
+    # Feature type breakdown
+    type_summary = []
+    for ftype, count in type_counts.most_common():
+        type_summary.append(f"{count} {ftype}")
+
+    if type_summary:
+        summary_parts.append(f"Types: {', '.join(type_summary)}")
+
+    # Missing data summary
+    if avg_missing > 0:
+        summary_parts.append(f"Avg missing: {avg_missing:.1f}%")
+
+    return " | ".join(summary_parts)
+
+
+def validate_analysis_inputs(
+    dataset_id: str, split: str, sample_size: int, config_name: Optional[str] = None
+) -> None:
+    """
+    Validate inputs for dataset analysis.
+
+    Args:
+        dataset_id: Dataset identifier to validate
+        split: Split name to validate
+        sample_size: Sample size to validate
+        config_name: Optional configuration name to validate
+
+    Raises:
+        ValueError: If any input is invalid
+    """
+    # Validate dataset_id
+    if not dataset_id or not isinstance(dataset_id, str):
+        raise ValueError("dataset_id must be a non-empty string")
+
+    dataset_id = dataset_id.strip()
+    if not dataset_id:
+        raise ValueError("dataset_id cannot be empty or whitespace")
+
+    # Validate split
+    if not split or not isinstance(split, str):
+        raise ValueError("split must be a non-empty string")
+
+    split = split.strip()
+    if not split:
+        raise ValueError("split cannot be empty or whitespace")
+
+    # Validate sample_size
+    if not isinstance(sample_size, int):
+        raise ValueError("sample_size must be an integer")
+
+    if sample_size <= 0:
+        raise ValueError("sample_size must be positive")
+
+    if sample_size > MAX_ANALYSIS_SAMPLE_SIZE:
+        raise ValueError(f"sample_size cannot exceed {MAX_ANALYSIS_SAMPLE_SIZE}")
+
+    # Validate config_name
+    if config_name is not None:
+        if not isinstance(config_name, str):
+            raise ValueError("config_name must be a string")
+
+        config_name = config_name.strip()
+        if not config_name:
+            raise ValueError("config_name cannot be empty or whitespace")
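The quality score computed by _assess_data_quality is a weighted average of two clamped factors: 0.6 × missing-value score + 0.4 × diversity score. A small standalone check of that arithmetic (the input numbers are illustrative, not taken from any real dataset):

# Illustrative restatement of the scoring arithmetic in _assess_data_quality.
# Suppose features average 30% missing values and 1 of 4 features has low diversity.
avg_missing_rate = 30.0
low_diversity_features, total_features = 1, 4

missing_score = max(0, 1 - avg_missing_rate / 100)                     # 0.70
diversity_score = max(0, 1 - low_diversity_features / total_features)  # 0.75

quality_score = 0.6 * missing_score + 0.4 * diversity_score            # 0.42 + 0.30
print(quality_score)  # 0.72 -> "medium" (thresholds: >= 0.8 high, >= 0.6 medium)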
src/hf_eda_mcp/tools/metadata.py
CHANGED

@@ -1,7 +1,217 @@
 """
 Dataset metadata tool for retrieving HuggingFace dataset information.
 
-This module
+This module provides tools for retrieving comprehensive metadata about
+HuggingFace datasets including size, features, splits, and configuration details.
 """
 
-
+import logging
+from typing import Optional, Dict, Any
+from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+
+logger = logging.getLogger(__name__)
+
+# Global dataset service instance
+_dataset_service: Optional[DatasetService] = None
+
+
+def get_dataset_service() -> DatasetService:
+    """Get or create the global dataset service instance."""
+    global _dataset_service
+    if _dataset_service is None:
+        _dataset_service = DatasetService()
+    return _dataset_service
+
+
+def get_dataset_metadata(dataset_id: str, config_name: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Retrieve comprehensive metadata for a HuggingFace dataset.
+
+    This function fetches detailed information about a dataset including its size,
+    features, available splits, configurations, and other metadata. It handles
+    multi-configuration datasets appropriately and provides caching for efficiency.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier (e.g., 'squad', 'glue', 'imdb')
+        config_name: Optional configuration name for multi-config datasets
+
+    Returns:
+        Dictionary containing comprehensive dataset metadata:
+        - id: Dataset identifier
+        - author: Dataset author/organization
+        - description: Dataset description
+        - features: Dictionary of feature names and types
+        - splits: Dictionary of split names and their sizes
+        - configs: List of available configurations
+        - size_bytes: Dataset size in bytes
+        - downloads: Number of downloads
+        - likes: Number of likes
+        - tags: List of dataset tags
+        - created_at: Creation timestamp
+        - last_modified: Last modification timestamp
+
+    Raises:
+        ValueError: If dataset_id is empty or invalid
+        DatasetNotFoundError: If dataset doesn't exist on HuggingFace Hub
+        AuthenticationError: If dataset is private and authentication fails
+        DatasetServiceError: If metadata retrieval fails for other reasons
+
+    Example:
+        >>> metadata = get_dataset_metadata("imdb")
+        >>> print(f"Dataset: {metadata['id']}")
+        >>> print(f"Splits: {list(metadata['splits'].keys())}")
+        >>> print(f"Features: {list(metadata['features'].keys())}")
+
+        >>> # For multi-config dataset
+        >>> metadata = get_dataset_metadata("glue", config_name="cola")
+        >>> print(f"Config: {metadata.get('config_name', 'default')}")
+    """
+    # Input validation
+    if not dataset_id or not isinstance(dataset_id, str):
+        raise ValueError("dataset_id must be a non-empty string")
+
+    dataset_id = dataset_id.strip()
+    if not dataset_id:
+        raise ValueError("dataset_id cannot be empty or whitespace")
+
+    if config_name is not None:
+        config_name = config_name.strip()
+        if not config_name:
+            config_name = None
+
+    logger.info(f"Retrieving metadata for dataset: {dataset_id}" +
+                (f", config: {config_name}" if config_name else ""))
+
+    try:
+        # Get dataset service and retrieve metadata
+        service = get_dataset_service()
+        metadata = service.load_dataset_info(dataset_id, config_name)
+
+        # Add the requested config name to the response if specified
+        if config_name:
+            metadata['config_name'] = config_name
+
+        # Enhance metadata with additional computed fields
+        metadata['total_configs'] = len(metadata.get('configs', []))
+        metadata['total_splits'] = len(metadata.get('splits', {}))
+        metadata['has_multiple_configs'] = metadata['total_configs'] > 1
+
+        # Format size for human readability
+        size_bytes = metadata.get('size_bytes', 0)
+        if size_bytes > 0:
+            metadata['size_human'] = _format_bytes(size_bytes)
+        else:
+            metadata['size_human'] = 'Unknown'
+
+        # Add summary information
+        metadata['summary'] = _generate_metadata_summary(metadata)
+
+        logger.info(f"Successfully retrieved metadata for {dataset_id}")
+        return metadata
+
+    except (DatasetNotFoundError, AuthenticationError):
+        # Re-raise these specific errors as-is
+        raise
+    except Exception as e:
+        logger.error(f"Failed to retrieve metadata for {dataset_id}: {str(e)}")
+        raise DatasetServiceError(f"Failed to retrieve dataset metadata: {str(e)}")
+
+
+def _format_bytes(size_bytes: int) -> str:
+    """Format byte size in human-readable format."""
+    if size_bytes == 0:
+        return "0 B"
+
+    units = ['B', 'KB', 'MB', 'GB', 'TB']
+    size = float(size_bytes)
+    unit_index = 0
+
+    while size >= 1024 and unit_index < len(units) - 1:
+        size /= 1024
+        unit_index += 1
+
+    if unit_index == 0:
+        return f"{int(size)} {units[unit_index]}"
+    else:
+        return f"{size:.1f} {units[unit_index]}"
+
+
+def _generate_metadata_summary(metadata: Dict[str, Any]) -> str:
+    """Generate a human-readable summary of dataset metadata."""
+    summary_parts = []
+
+    # Basic info
+    summary_parts.append(f"Dataset: {metadata.get('id', 'Unknown')}")
+
+    if metadata.get('author'):
+        summary_parts.append(f"Author: {metadata['author']}")
+
+    # Size and popularity
+    if metadata.get('size_human'):
+        summary_parts.append(f"Size: {metadata['size_human']}")
+
+    downloads = metadata.get('downloads', 0)
+    if downloads > 0:
+        summary_parts.append(f"Downloads: {downloads:,}")
+
+    likes = metadata.get('likes', 0)
+    if likes > 0:
+        summary_parts.append(f"Likes: {likes:,}")
+
+    # Structure info
+    configs = metadata.get('configs', [])
+    if configs:
+        if len(configs) == 1:
+            summary_parts.append(f"Configuration: {configs[0]}")
+        else:
+            summary_parts.append(f"Configurations: {len(configs)} available")
+
+    splits = metadata.get('splits', {})
+    if splits:
+        split_names = list(splits.keys())
+        if len(split_names) <= 3:
+            summary_parts.append(f"Splits: {', '.join(split_names)}")
+        else:
+            summary_parts.append(f"Splits: {len(split_names)} available")
+
+    features = metadata.get('features', {})
+    if features:
+        summary_parts.append(f"Features: {len(features)} columns")
+
+    return " | ".join(summary_parts)
+
+
+def validate_dataset_metadata_inputs(dataset_id: str, config_name: Optional[str] = None) -> None:
+    """
+    Validate inputs for dataset metadata retrieval.
+
+    Args:
+        dataset_id: Dataset identifier to validate
+        config_name: Optional configuration name to validate
+
+    Raises:
+        ValueError: If inputs are invalid
+    """
+    if not dataset_id or not isinstance(dataset_id, str):
+        raise ValueError("dataset_id must be a non-empty string")
+
+    dataset_id = dataset_id.strip()
+    if not dataset_id:
+        raise ValueError("dataset_id cannot be empty or whitespace")
+
+    # Basic format validation for dataset_id
+    if not all(c.isalnum() or c in '-_/.@' for c in dataset_id):
+        raise ValueError("dataset_id contains invalid characters")
+
+    if config_name is not None:
+        if not isinstance(config_name, str):
+            raise ValueError("config_name must be a string")
+
+        config_name = config_name.strip()
+        if not config_name:
+            raise ValueError("config_name cannot be empty or whitespace")
+
+        # Basic format validation for config_name
+        if not all(c.isalnum() or c in '-_.' for c in config_name):
+            raise ValueError("config_name contains invalid characters")
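The _format_bytes helper divides by 1024 until the value drops below the next unit step, printing integers for bytes and one decimal otherwise. A quick standalone sanity check of that logic (a restatement of the helper, not an import from the package):

# Standalone restatement of _format_bytes for a quick sanity check.
def format_bytes(size_bytes: int) -> str:
    if size_bytes == 0:
        return "0 B"
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size, unit_index = float(size_bytes), 0
    while size >= 1024 and unit_index < len(units) - 1:
        size /= 1024
        unit_index += 1
    return f"{int(size)} {units[0]}" if unit_index == 0 else f"{size:.1f} {units[unit_index]}"

print(format_bytes(512))        # 512 B
print(format_bytes(1536))       # 1.5 KB
print(format_bytes(84125825))   # 80.2 MB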
src/hf_eda_mcp/tools/sampling.py
CHANGED

@@ -1,7 +1,330 @@
 """
-Dataset sampling tool for retrieving
+Dataset sampling tool for retrieving samples from HuggingFace datasets.
 
-This module
+This module provides tools for efficiently sampling data from HuggingFace datasets
+with support for different splits, configurable sample sizes, and streaming for large datasets.
 """
 
-
+import logging
+from typing import Optional, Dict, Any, List
+from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
+from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
+
+logger = logging.getLogger(__name__)
+
+# Global dataset service instance
+_dataset_service: Optional[DatasetService] = None
+
+# Constants for sampling limits
+MAX_SAMPLE_SIZE = 10000  # Maximum samples to prevent memory issues
+DEFAULT_SAMPLE_SIZE = 10
+VALID_SPLITS = {'train', 'validation', 'test', 'dev', 'val'}
+
+
+def get_dataset_service() -> DatasetService:
+    """Get or create the global dataset service instance."""
+    global _dataset_service
+    if _dataset_service is None:
+        _dataset_service = DatasetService()
+    return _dataset_service
+
+
+def get_dataset_sample(
+    dataset_id: str,
+    split: str = "train",
+    num_samples: int = DEFAULT_SAMPLE_SIZE,
+    config_name: Optional[str] = None,
+    streaming: bool = True
+) -> Dict[str, Any]:
+    """
+    Retrieve a sample of rows from a HuggingFace dataset.
+
+    This function efficiently samples data from datasets with support for different
+    splits and configurable sample sizes. It uses streaming by default for large
+    datasets to minimize memory usage and loading time.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier (e.g., 'imdb', 'squad', 'glue')
+        split: Dataset split to sample from (default: 'train')
+        num_samples: Number of samples to retrieve (default: 10, max: 10000)
+        config_name: Optional configuration name for multi-config datasets
+        streaming: Whether to use streaming mode for efficient loading (default: True)
+
+    Returns:
+        Dictionary containing sampled data and metadata:
+        - dataset_id: Original dataset identifier
+        - config_name: Configuration name used (if any)
+        - split: Split name sampled from
+        - num_samples: Actual number of samples returned
+        - requested_samples: Number of samples originally requested
+        - data: List of sample dictionaries
+        - schema: Dictionary describing the dataset features/columns
+        - sample_info: Additional information about the sampling process
+
+    Raises:
+        ValueError: If inputs are invalid (empty dataset_id, invalid split, etc.)
+        DatasetNotFoundError: If dataset or split doesn't exist
+        AuthenticationError: If dataset is private and authentication fails
+        DatasetServiceError: If sampling fails for other reasons
+
+    Example:
+        >>> # Basic sampling
+        >>> sample = get_dataset_sample("imdb", split="train", num_samples=5)
+        >>> print(f"Got {sample['num_samples']} samples from {sample['dataset_id']}")
+        >>> for i, row in enumerate(sample['data']):
+        ...     print(f"Sample {i+1}: {list(row.keys())}")
+
+        >>> # Multi-config dataset sampling
+        >>> sample = get_dataset_sample("glue", split="validation",
+        ...                             num_samples=3, config_name="cola")
+        >>> print(f"Schema: {sample['schema']}")
+    """
+    # Input validation
+    validate_sampling_inputs(dataset_id, split, num_samples, config_name)
+
+    logger.info(f"Sampling {num_samples} rows from dataset: {dataset_id}, "
+                f"split: {split}" + (f", config: {config_name}" if config_name else ""))
+
+    try:
+        # Get dataset service and load sample
+        service = get_dataset_service()
+        sample_data = service.load_dataset_sample(
+            dataset_id=dataset_id,
+            split=split,
+            num_samples=num_samples,
+            config_name=config_name,
+            streaming=streaming
+        )
+
+        # Enhance the response with additional metadata
+        sample_data['sample_info'] = {
+            'streaming_used': streaming,
+            'sampling_strategy': 'sequential_head',  # We take first N samples
+            'max_sample_size': MAX_SAMPLE_SIZE,
+            'truncated': sample_data['num_samples'] < sample_data['requested_samples']
+        }
+
+        # Add data preview information
+        if sample_data['data']:
+            first_sample = sample_data['data'][0]
+            sample_data['sample_info']['preview'] = {
+                'columns': list(first_sample.keys()) if isinstance(first_sample, dict) else [],
+                'first_sample_types': {
+                    k: type(v).__name__ for k, v in first_sample.items()
+                } if isinstance(first_sample, dict) else {}
+            }
+
+        # Add summary
+        sample_data['summary'] = _generate_sample_summary(sample_data)
+
+        logger.info(f"Successfully sampled {sample_data['num_samples']} rows from {dataset_id}")
+        return sample_data
+
+    except (DatasetNotFoundError, AuthenticationError):
+        # Re-raise these specific errors as-is
+        raise
+    except Exception as e:
+        logger.error(f"Failed to sample from dataset {dataset_id}: {str(e)}")
+        raise DatasetServiceError(f"Failed to sample dataset: {str(e)}")
+
+
+def get_dataset_sample_with_indices(
+    dataset_id: str,
+    indices: List[int],
+    split: str = "train",
+    config_name: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Retrieve specific samples by their indices from a HuggingFace dataset.
+
+    This function allows for targeted sampling by specifying exact row indices.
+    Note: This requires loading the dataset in non-streaming mode.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier
+        indices: List of row indices to retrieve
+        split: Dataset split to sample from (default: 'train')
+        config_name: Optional configuration name for multi-config datasets
+
+    Returns:
+        Dictionary containing the requested samples and metadata
+
+    Raises:
+        ValueError: If inputs are invalid
+        DatasetServiceError: If sampling fails
+    """
+    # Input validation
+    if not indices or not isinstance(indices, list):
+        raise ValueError("indices must be a non-empty list")
+
+    if not all(isinstance(i, int) and i >= 0 for i in indices):
+        raise ValueError("All indices must be non-negative integers")
+
+    if len(indices) > MAX_SAMPLE_SIZE:
+        raise ValueError(f"Too many indices requested. Maximum: {MAX_SAMPLE_SIZE}")
+
+    validate_sampling_inputs(dataset_id, split, len(indices), config_name)
+
+    logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
+
+    try:
+        from datasets import load_dataset
+
+        # Load dataset without streaming to access by index
+        dataset = load_dataset(
+            dataset_id,
+            name=config_name,
+            split=split,
+            streaming=False
+        )
+
+        # Validate indices are within bounds
+        max_index = max(indices)
+        if max_index >= len(dataset):
+            raise ValueError(f"Index {max_index} is out of bounds for dataset with {len(dataset)} rows")
+
+        # Get samples by indices
+        samples = [dataset[i] for i in indices]
+
+        # Get dataset info for schema
+        service = get_dataset_service()
+        dataset_info = service.load_dataset_info(dataset_id, config_name)
+
+        # Prepare response
+        sample_data = {
+            'dataset_id': dataset_id,
+            'config_name': config_name,
+            'split': split,
+            'num_samples': len(samples),
+            'requested_indices': indices,
+            'data': samples,
+            'schema': dataset_info.get('features', {}),
+            'sample_info': {
+                'sampling_strategy': 'by_indices',
+                'streaming_used': False,
+                'indices_requested': len(indices)
+            }
+        }
+
+        sample_data['summary'] = _generate_sample_summary(sample_data)
+
+        return sample_data
+
+    except Exception as e:
+        logger.error(f"Failed to sample by indices from {dataset_id}: {str(e)}")
+        raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")
+
+
+def validate_sampling_inputs(
+    dataset_id: str,
+    split: str,
+    num_samples: int,
+    config_name: Optional[str] = None
+) -> None:
+    """
+    Validate inputs for dataset sampling.
+
+    Args:
+        dataset_id: Dataset identifier to validate
+        split: Split name to validate
+        num_samples: Number of samples to validate
+        config_name: Optional configuration name to validate
+
+    Raises:
+        ValueError: If any input is invalid
+    """
+    # Validate dataset_id
+    if not dataset_id or not isinstance(dataset_id, str):
+        raise ValueError("dataset_id must be a non-empty string")
+
+    dataset_id = dataset_id.strip()
+    if not dataset_id:
+        raise ValueError("dataset_id cannot be empty or whitespace")
+
+    # Validate split
+    if not split or not isinstance(split, str):
+        raise ValueError("split must be a non-empty string")
+
+    split = split.strip().lower()
+    if not split:
+        raise ValueError("split cannot be empty or whitespace")
+
+    # Note: We don't strictly enforce VALID_SPLITS as datasets may have custom split names
+
+    # Validate num_samples
+    if not isinstance(num_samples, int):
+        raise ValueError("num_samples must be an integer")
+
+    if num_samples <= 0:
+        raise ValueError("num_samples must be positive")
+
+    if num_samples > MAX_SAMPLE_SIZE:
+        raise ValueError(f"num_samples cannot exceed {MAX_SAMPLE_SIZE}")
+
+    # Validate config_name
+    if config_name is not None:
+        if not isinstance(config_name, str):
+            raise ValueError("config_name must be a string")
+
+        config_name = config_name.strip()
+        if not config_name:
+            raise ValueError("config_name cannot be empty or whitespace")
+
+
+def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
+    """Generate a human-readable summary of the sample data."""
+    summary_parts = []
+
+    # Basic info
+    summary_parts.append(f"Dataset: {sample_data.get('dataset_id', 'Unknown')}")
+    summary_parts.append(f"Split: {sample_data.get('split', 'Unknown')}")
+
+    if sample_data.get('config_name'):
+        summary_parts.append(f"Config: {sample_data['config_name']}")
+
+    # Sample info
+    num_samples = sample_data.get('num_samples', 0)
+    requested = sample_data.get('requested_samples', num_samples)
+
+    if num_samples == requested:
+        summary_parts.append(f"Samples: {num_samples}")
+    else:
+        summary_parts.append(f"Samples: {num_samples}/{requested} (truncated)")
+
+    # Schema info
+    schema = sample_data.get('schema', {})
+    if schema:
+        summary_parts.append(f"Columns: {len(schema)}")
+
+    # Sampling strategy
+    sample_info = sample_data.get('sample_info', {})
+    strategy = sample_info.get('sampling_strategy', 'unknown')
+    if strategy == 'by_indices':
+        summary_parts.append("Strategy: by indices")
+    elif strategy == 'sequential_head':
+        summary_parts.append("Strategy: first N rows")
+
+    return " | ".join(summary_parts)
+
+
+def get_available_splits(dataset_id: str, config_name: Optional[str] = None) -> List[str]:
+    """
+    Get available splits for a dataset.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier
+        config_name: Optional configuration name
+
+    Returns:
+        List of available split names
+
+    Raises:
+        DatasetServiceError: If unable to retrieve split information
+    """
+    try:
+        service = get_dataset_service()
+        metadata = service.load_dataset_info(dataset_id, config_name)
+        return list(metadata.get('splits', {}).keys())
+    except Exception as e:
+        logger.error(f"Failed to get splits for {dataset_id}: {str(e)}")
+        raise DatasetServiceError(f"Failed to get available splits: {str(e)}")
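A short sketch comparing the two sampling entry points defined above (hypothetical calls against a public dataset; the printed split list is illustrative):

# Hypothetical comparison of the two sampling entry points.
from hf_eda_mcp.tools.sampling import (
    get_dataset_sample,
    get_dataset_sample_with_indices,
    get_available_splits,
)

print(get_available_splits("imdb"))  # e.g. ['train', 'test', 'unsupervised']

# Streaming head sample: cheap, avoids downloading the full dataset.
head = get_dataset_sample("imdb", split="train", num_samples=3)

# Index-based sample: exact rows, but requires a non-streaming load per its docstring.
picked = get_dataset_sample_with_indices("imdb", indices=[0, 100, 2500], split="train")

print(head["sample_info"]["sampling_strategy"])    # 'sequential_head'
print(picked["sample_info"]["sampling_strategy"])  # 'by_indices'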