Spaces:
Running
Running
Commit
·
ab96cfe
1
Parent(s):
7023fcd
Server configuration added
Browse files- .gitignore +3 -0
- .kiro/specs/hf-eda-mcp-server/tasks.md +2 -2
- CONFIGURATION.md +104 -0
- MCP_USAGE.md +52 -0
- config.example.env +27 -0
- pyproject.toml +1 -1
- src/hf_eda_mcp/__main__.py +60 -23
- src/hf_eda_mcp/config.py +164 -0
- src/hf_eda_mcp/server.py +122 -48
- src/hf_eda_mcp/tools/analysis.py +14 -6
- src/hf_eda_mcp/tools/metadata.py +7 -2
- src/hf_eda_mcp/tools/sampling.py +129 -112
.gitignore
CHANGED
|
@@ -205,3 +205,6 @@ cython_debug/
|
|
| 205 |
marimo/_static/
|
| 206 |
marimo/_lsp/
|
| 207 |
__marimo__/
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
marimo/_static/
|
| 206 |
marimo/_lsp/
|
| 207 |
__marimo__/
|
| 208 |
+
|
| 209 |
+
# Cache
|
| 210 |
+
cache/
|
.kiro/specs/hf-eda-mcp-server/tasks.md
CHANGED
|
@@ -38,7 +38,7 @@
|
|
| 38 |
- Handle different data types (numerical, categorical, text) appropriately
|
| 39 |
- _Requirements: 5.1, 5.2, 5.3, 5.4_
|
| 40 |
|
| 41 |
-
- [
|
| 42 |
- [x] 4.1 Design Gradio interfaces for each EDA tool
|
| 43 |
- Create Gradio interface for metadata retrieval with appropriate input/output components
|
| 44 |
- Build interface for dataset sampling with split and sample size controls
|
|
@@ -51,7 +51,7 @@
|
|
| 51 |
- Configure proper tool descriptions and schemas for MCP exposure
|
| 52 |
- _Requirements: 3.1, 3.2, 3.3_
|
| 53 |
|
| 54 |
-
- [
|
| 55 |
- Implement server launch function with configurable parameters
|
| 56 |
- Add environment variable support for authentication and configuration
|
| 57 |
- Include proper logging and error handling for server operations
|
|
|
|
| 38 |
- Handle different data types (numerical, categorical, text) appropriately
|
| 39 |
- _Requirements: 5.1, 5.2, 5.3, 5.4_
|
| 40 |
|
| 41 |
+
- [x] 4. Create Gradio interfaces and MCP server
|
| 42 |
- [x] 4.1 Design Gradio interfaces for each EDA tool
|
| 43 |
- Create Gradio interface for metadata retrieval with appropriate input/output components
|
| 44 |
- Build interface for dataset sampling with split and sample size controls
|
|
|
|
| 51 |
- Configure proper tool descriptions and schemas for MCP exposure
|
| 52 |
- _Requirements: 3.1, 3.2, 3.3_
|
| 53 |
|
| 54 |
+
- [x] 4.3 Add server configuration and startup
|
| 55 |
- Implement server launch function with configurable parameters
|
| 56 |
- Add environment variable support for authentication and configuration
|
| 57 |
- Include proper logging and error handling for server operations
|
CONFIGURATION.md
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration Guide
|
| 2 |
+
|
| 3 |
+
The HF EDA MCP Server uses a centralized configuration system that supports both environment variables and command-line arguments.
|
| 4 |
+
|
| 5 |
+
## Configuration Module
|
| 6 |
+
|
| 7 |
+
The configuration is managed by the `src/hf_eda_mcp/config.py` module, which provides:
|
| 8 |
+
|
| 9 |
+
- `ServerConfig` dataclass with all configuration options
|
| 10 |
+
- Environment variable loading with `ServerConfig.from_env()`
|
| 11 |
+
- Global configuration management with `get_config()` and `set_config()`
|
| 12 |
+
- Logging setup and validation utilities
|
| 13 |
+
|
| 14 |
+
## Configuration Options
|
| 15 |
+
|
| 16 |
+
### Server Settings
|
| 17 |
+
- `HF_EDA_PORT` (default: 7860) - Server port
|
| 18 |
+
- `HF_EDA_HOST` (default: 127.0.0.1) - Server host
|
| 19 |
+
- `HF_EDA_MCP_ENABLED` (default: true) - Enable MCP server functionality
|
| 20 |
+
- `HF_EDA_SHARE` (default: false) - Enable public sharing via Gradio
|
| 21 |
+
|
| 22 |
+
### Authentication
|
| 23 |
+
- `HF_TOKEN` - HuggingFace access token for private datasets
|
| 24 |
+
|
| 25 |
+
### Logging
|
| 26 |
+
- `HF_EDA_LOG_LEVEL` (default: INFO) - Logging level (DEBUG, INFO, WARNING, ERROR)
|
| 27 |
+
|
| 28 |
+
### Performance and Caching
|
| 29 |
+
- `HF_EDA_CACHE_DIR` - Directory for caching datasets (optional)
|
| 30 |
+
- `HF_EDA_MAX_CACHE_SIZE` (default: 1000) - Maximum cache size in MB
|
| 31 |
+
- `HF_EDA_MAX_SAMPLE_SIZE` (default: 50000) - Maximum sample size for tools
|
| 32 |
+
- `HF_EDA_MAX_CONCURRENT` (default: 10) - Maximum concurrent requests
|
| 33 |
+
- `HF_EDA_REQUEST_TIMEOUT` (default: 300) - Request timeout in seconds
|
| 34 |
+
|
| 35 |
+
## How Configuration is Used
|
| 36 |
+
|
| 37 |
+
### Server Startup
|
| 38 |
+
The server loads configuration from environment variables and applies command-line overrides:
|
| 39 |
+
|
| 40 |
+
```python
|
| 41 |
+
from hf_eda_mcp.config import ServerConfig
|
| 42 |
+
from hf_eda_mcp.server import launch_server
|
| 43 |
+
|
| 44 |
+
config = ServerConfig.from_env()
|
| 45 |
+
launch_server(config)
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### Tools Integration
|
| 49 |
+
All EDA tools (metadata, sampling, analysis) use the global configuration:
|
| 50 |
+
|
| 51 |
+
```python
|
| 52 |
+
from hf_eda_mcp.config import get_config
|
| 53 |
+
|
| 54 |
+
config = get_config()
|
| 55 |
+
# Tools respect config.max_sample_size, config.cache_dir, config.hf_token
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### Dataset Service
|
| 59 |
+
The `DatasetService` is initialized with configuration values:
|
| 60 |
+
|
| 61 |
+
```python
|
| 62 |
+
service = DatasetService(
|
| 63 |
+
cache_dir=config.cache_dir,
|
| 64 |
+
token=config.hf_token
|
| 65 |
+
)
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
## Configuration Priority
|
| 69 |
+
|
| 70 |
+
1. Command-line arguments (highest priority)
|
| 71 |
+
2. Environment variables
|
| 72 |
+
3. Default values (lowest priority)
|
| 73 |
+
|
| 74 |
+
## Example Usage
|
| 75 |
+
|
| 76 |
+
### Environment Variables
|
| 77 |
+
```bash
|
| 78 |
+
export HF_TOKEN="your_token_here"
|
| 79 |
+
export HF_EDA_CACHE_DIR="/tmp/hf-cache"
|
| 80 |
+
export HF_EDA_MAX_SAMPLE_SIZE=25000
|
| 81 |
+
pdm run hf-eda-mcp
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
### Command Line
|
| 85 |
+
```bash
|
| 86 |
+
pdm run hf-eda-mcp --cache-dir /tmp/cache --max-sample-size 25000 --verbose
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
### Configuration File
|
| 90 |
+
Copy `config.example.env` to `.env` and modify as needed, then load with:
|
| 91 |
+
```bash
|
| 92 |
+
source .env
|
| 93 |
+
pdm run hf-eda-mcp
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
## Validation
|
| 97 |
+
|
| 98 |
+
The configuration system includes validation for:
|
| 99 |
+
- Port ranges (1024-65535)
|
| 100 |
+
- Cache directory permissions
|
| 101 |
+
- Sample size limits
|
| 102 |
+
- Timeout values
|
| 103 |
+
|
| 104 |
+
Invalid configurations will cause the server to exit with helpful error messages.
|
MCP_USAGE.md
CHANGED
|
@@ -88,6 +88,18 @@ pdm run hf-eda-mcp --verbose
|
|
| 88 |
|
| 89 |
# Start without MCP server functionality
|
| 90 |
pdm run hf-eda-mcp --no-mcp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
```
|
| 92 |
|
| 93 |
### Server Modes
|
|
@@ -96,8 +108,48 @@ The server provides both a web interface and MCP server functionality in a singl
|
|
| 96 |
|
| 97 |
### Environment Variables
|
| 98 |
|
|
|
|
|
|
|
|
|
|
| 99 |
- `HF_TOKEN`: HuggingFace access token for private datasets (optional)
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
## Example Usage
|
| 102 |
|
| 103 |
Once connected to an MCP client, you can use the tools like this:
|
|
|
|
| 88 |
|
| 89 |
# Start without MCP server functionality
|
| 90 |
pdm run hf-eda-mcp --no-mcp
|
| 91 |
+
|
| 92 |
+
# Start with custom host (listen on all interfaces)
|
| 93 |
+
pdm run hf-eda-mcp --host 0.0.0.0
|
| 94 |
+
|
| 95 |
+
# Start with public sharing enabled
|
| 96 |
+
pdm run hf-eda-mcp --share
|
| 97 |
+
|
| 98 |
+
# Start with custom cache directory
|
| 99 |
+
pdm run hf-eda-mcp --cache-dir /path/to/cache
|
| 100 |
+
|
| 101 |
+
# Start with custom maximum sample size
|
| 102 |
+
pdm run hf-eda-mcp --max-sample-size 100000
|
| 103 |
```
|
| 104 |
|
| 105 |
### Server Modes
|
|
|
|
| 108 |
|
| 109 |
### Environment Variables
|
| 110 |
|
| 111 |
+
The server supports comprehensive configuration via environment variables:
|
| 112 |
+
|
| 113 |
+
#### Authentication
|
| 114 |
- `HF_TOKEN`: HuggingFace access token for private datasets (optional)
|
| 115 |
|
| 116 |
+
#### Server Configuration
|
| 117 |
+
- `HF_EDA_PORT`: Server port (default: 7860)
|
| 118 |
+
- `HF_EDA_HOST`: Server host (default: 127.0.0.1)
|
| 119 |
+
- `HF_EDA_MCP_ENABLED`: Enable MCP server functionality (default: true)
|
| 120 |
+
- `HF_EDA_SHARE`: Enable public sharing via Gradio (default: false)
|
| 121 |
+
|
| 122 |
+
#### Logging Configuration
|
| 123 |
+
- `HF_EDA_LOG_LEVEL`: Logging level - DEBUG, INFO, WARNING, ERROR (default: INFO)
|
| 124 |
+
|
| 125 |
+
#### Performance and Caching
|
| 126 |
+
- `HF_EDA_CACHE_DIR`: Directory for caching datasets (optional)
|
| 127 |
+
- `HF_EDA_MAX_CACHE_SIZE`: Maximum cache size in MB (default: 1000)
|
| 128 |
+
- `HF_EDA_MAX_SAMPLE_SIZE`: Maximum sample size for analysis (default: 50000)
|
| 129 |
+
- `HF_EDA_MAX_CONCURRENT`: Maximum concurrent requests (default: 10)
|
| 130 |
+
- `HF_EDA_REQUEST_TIMEOUT`: Request timeout in seconds (default: 300)
|
| 131 |
+
|
| 132 |
+
### Configuration Examples
|
| 133 |
+
|
| 134 |
+
#### Production Configuration
|
| 135 |
+
```bash
|
| 136 |
+
export HF_TOKEN="your_token_here"
|
| 137 |
+
export HF_EDA_HOST="0.0.0.0"
|
| 138 |
+
export HF_EDA_PORT="8080"
|
| 139 |
+
export HF_EDA_LOG_LEVEL="WARNING"
|
| 140 |
+
export HF_EDA_CACHE_DIR="/var/cache/hf-eda"
|
| 141 |
+
export HF_EDA_MAX_CONCURRENT="20"
|
| 142 |
+
pdm run hf-eda-mcp
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
#### Development Configuration
|
| 146 |
+
```bash
|
| 147 |
+
export HF_TOKEN="your_token_here"
|
| 148 |
+
export HF_EDA_LOG_LEVEL="DEBUG"
|
| 149 |
+
export HF_EDA_CACHE_DIR="./cache"
|
| 150 |
+
pdm run hf-eda-mcp --verbose
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
## Example Usage
|
| 154 |
|
| 155 |
Once connected to an MCP client, you can use the tools like this:
|
config.example.env
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HuggingFace EDA MCP Server Configuration Example
|
| 2 |
+
# Copy this file to .env and modify as needed
|
| 3 |
+
# All settings can also be passed as command-line arguments
|
| 4 |
+
|
| 5 |
+
# Authentication
|
| 6 |
+
HF_TOKEN=your_huggingface_token_here
|
| 7 |
+
|
| 8 |
+
# Server Configuration
|
| 9 |
+
HF_EDA_PORT=7860
|
| 10 |
+
HF_EDA_HOST=127.0.0.1
|
| 11 |
+
HF_EDA_MCP_ENABLED=true
|
| 12 |
+
HF_EDA_SHARE=false
|
| 13 |
+
|
| 14 |
+
# Logging Configuration
|
| 15 |
+
HF_EDA_LOG_LEVEL=INFO
|
| 16 |
+
|
| 17 |
+
# Performance and Caching
|
| 18 |
+
# Cache directory for dataset metadata and samples
|
| 19 |
+
HF_EDA_CACHE_DIR=./cache
|
| 20 |
+
# Maximum cache size in MB
|
| 21 |
+
HF_EDA_MAX_CACHE_SIZE=1000
|
| 22 |
+
# Maximum sample size for analysis and sampling tools
|
| 23 |
+
HF_EDA_MAX_SAMPLE_SIZE=50000
|
| 24 |
+
# Maximum concurrent requests
|
| 25 |
+
HF_EDA_MAX_CONCURRENT=10
|
| 26 |
+
# Request timeout in seconds
|
| 27 |
+
HF_EDA_REQUEST_TIMEOUT=300
|
pyproject.toml
CHANGED
|
@@ -23,7 +23,7 @@ build-backend = "pdm.backend"
|
|
| 23 |
|
| 24 |
|
| 25 |
[tool.pdm.scripts]
|
| 26 |
-
hf-eda-mcp = "python -m hf_eda_mcp"
|
| 27 |
hf_client_playground = "python -m scripts.playground.hf_client_playground"
|
| 28 |
|
| 29 |
[tool.pdm]
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
[tool.pdm.scripts]
|
| 26 |
+
hf-eda-mcp = {cmd="python -m hf_eda_mcp", env_file= ".env"}
|
| 27 |
hf_client_playground = "python -m scripts.playground.hf_client_playground"
|
| 28 |
|
| 29 |
[tool.pdm]
|
src/hf_eda_mcp/__main__.py
CHANGED
|
@@ -7,6 +7,7 @@ python -m hf_eda_mcp
|
|
| 7 |
|
| 8 |
import argparse
|
| 9 |
import sys
|
|
|
|
| 10 |
from .server import launch_server
|
| 11 |
|
| 12 |
|
|
@@ -18,21 +19,36 @@ def main():
|
|
| 18 |
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 19 |
epilog="""
|
| 20 |
Examples:
|
| 21 |
-
python -m hf_eda_mcp # Start server
|
| 22 |
python -m hf_eda_mcp --port 8080 # Start server on port 8080
|
| 23 |
python -m hf_eda_mcp --no-mcp # Start without MCP server functionality
|
| 24 |
python -m hf_eda_mcp --share # Create public shareable link
|
|
|
|
| 25 |
|
| 26 |
Environment Variables:
|
| 27 |
-
HF_TOKEN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"""
|
| 29 |
)
|
| 30 |
|
| 31 |
parser.add_argument(
|
| 32 |
"--port",
|
| 33 |
type=int,
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
)
|
| 37 |
|
| 38 |
parser.add_argument(
|
|
@@ -50,32 +66,53 @@ Environment Variables:
|
|
| 50 |
parser.add_argument(
|
| 51 |
"--verbose", "-v",
|
| 52 |
action="store_true",
|
| 53 |
-
help="Enable verbose logging"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
)
|
| 55 |
|
| 56 |
args = parser.parse_args()
|
| 57 |
|
| 58 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
if args.verbose:
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
try:
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
print(f"🌐 Web interface will be available at: http://localhost:{args.port}")
|
| 68 |
-
if not args.no_mcp:
|
| 69 |
-
print(f"🔗 MCP server functionality enabled")
|
| 70 |
-
if args.share:
|
| 71 |
-
print(f"🌍 Public sharing enabled")
|
| 72 |
-
print()
|
| 73 |
-
|
| 74 |
-
launch_server(
|
| 75 |
-
port=args.port,
|
| 76 |
-
mcp_server=not args.no_mcp,
|
| 77 |
-
share=args.share
|
| 78 |
-
)
|
| 79 |
except KeyboardInterrupt:
|
| 80 |
print("\n👋 Server stopped by user")
|
| 81 |
sys.exit(0)
|
|
|
|
| 7 |
|
| 8 |
import argparse
|
| 9 |
import sys
|
| 10 |
+
from .config import ServerConfig
|
| 11 |
from .server import launch_server
|
| 12 |
|
| 13 |
|
|
|
|
| 19 |
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 20 |
epilog="""
|
| 21 |
Examples:
|
| 22 |
+
python -m hf_eda_mcp # Start server with default settings
|
| 23 |
python -m hf_eda_mcp --port 8080 # Start server on port 8080
|
| 24 |
python -m hf_eda_mcp --no-mcp # Start without MCP server functionality
|
| 25 |
python -m hf_eda_mcp --share # Create public shareable link
|
| 26 |
+
python -m hf_eda_mcp --host 0.0.0.0 # Listen on all interfaces
|
| 27 |
|
| 28 |
Environment Variables:
|
| 29 |
+
HF_TOKEN HuggingFace access token for private datasets
|
| 30 |
+
HF_EDA_PORT Server port (default: 7860)
|
| 31 |
+
HF_EDA_HOST Server host (default: 127.0.0.1)
|
| 32 |
+
HF_EDA_MCP_ENABLED Enable MCP server (default: true)
|
| 33 |
+
HF_EDA_SHARE Enable public sharing (default: false)
|
| 34 |
+
HF_EDA_LOG_LEVEL Logging level (default: INFO)
|
| 35 |
+
HF_EDA_CACHE_DIR Cache directory for datasets
|
| 36 |
+
HF_EDA_MAX_SAMPLE_SIZE Maximum sample size (default: 50000)
|
| 37 |
+
HF_EDA_MAX_CONCURRENT Max concurrent requests (default: 10)
|
| 38 |
+
HF_EDA_REQUEST_TIMEOUT Request timeout in seconds (default: 300)
|
| 39 |
"""
|
| 40 |
)
|
| 41 |
|
| 42 |
parser.add_argument(
|
| 43 |
"--port",
|
| 44 |
type=int,
|
| 45 |
+
help="Port to run the server on (overrides HF_EDA_PORT)"
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
parser.add_argument(
|
| 49 |
+
"--host",
|
| 50 |
+
type=str,
|
| 51 |
+
help="Host to bind the server to (overrides HF_EDA_HOST)"
|
| 52 |
)
|
| 53 |
|
| 54 |
parser.add_argument(
|
|
|
|
| 66 |
parser.add_argument(
|
| 67 |
"--verbose", "-v",
|
| 68 |
action="store_true",
|
| 69 |
+
help="Enable verbose logging (DEBUG level)"
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
parser.add_argument(
|
| 73 |
+
"--log-level",
|
| 74 |
+
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
| 75 |
+
help="Set logging level (overrides HF_EDA_LOG_LEVEL)"
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
parser.add_argument(
|
| 79 |
+
"--cache-dir",
|
| 80 |
+
type=str,
|
| 81 |
+
help="Directory for caching datasets (overrides HF_EDA_CACHE_DIR)"
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
parser.add_argument(
|
| 85 |
+
"--max-sample-size",
|
| 86 |
+
type=int,
|
| 87 |
+
help="Maximum sample size for analysis (overrides HF_EDA_MAX_SAMPLE_SIZE)"
|
| 88 |
)
|
| 89 |
|
| 90 |
args = parser.parse_args()
|
| 91 |
|
| 92 |
+
# Load base configuration from environment
|
| 93 |
+
config = ServerConfig.from_env()
|
| 94 |
+
|
| 95 |
+
# Override with command line arguments
|
| 96 |
+
if args.port is not None:
|
| 97 |
+
config.port = args.port
|
| 98 |
+
if args.host is not None:
|
| 99 |
+
config.host = args.host
|
| 100 |
+
if args.no_mcp:
|
| 101 |
+
config.mcp_server = False
|
| 102 |
+
if args.share:
|
| 103 |
+
config.share = True
|
| 104 |
if args.verbose:
|
| 105 |
+
config.log_level = "DEBUG"
|
| 106 |
+
if args.log_level:
|
| 107 |
+
config.log_level = args.log_level
|
| 108 |
+
if args.cache_dir:
|
| 109 |
+
config.cache_dir = args.cache_dir
|
| 110 |
+
if args.max_sample_size is not None:
|
| 111 |
+
config.max_sample_size = args.max_sample_size
|
| 112 |
|
| 113 |
try:
|
| 114 |
+
# Launch server with configuration
|
| 115 |
+
launch_server(config)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
except KeyboardInterrupt:
|
| 117 |
print("\n👋 Server stopped by user")
|
| 118 |
sys.exit(0)
|
src/hf_eda_mcp/config.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration management for the HF EDA MCP Server.
|
| 3 |
+
|
| 4 |
+
This module provides configuration classes and utilities for managing
|
| 5 |
+
server settings, authentication, caching, and performance parameters.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import logging
|
| 10 |
+
import sys
|
| 11 |
+
from typing import Optional, Dict, Any
|
| 12 |
+
from dataclasses import dataclass, field
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class ServerConfig:
    """Runtime settings for the HF EDA MCP Server.

    Instances can be constructed directly with keyword arguments, or
    populated from the process environment via :meth:`from_env`.
    """

    # --- Server ---
    port: int = 7860          # HTTP port for the Gradio app
    host: str = "127.0.0.1"   # bind address
    mcp_server: bool = True   # also expose the tools over MCP
    share: bool = False       # create a public Gradio share link

    # --- Authentication ---
    hf_token: Optional[str] = None  # HuggingFace token; None => public datasets only

    # --- Logging ---
    log_level: str = "INFO"
    log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    # --- Caching ---
    cache_dir: Optional[str] = None  # dataset cache location; None => library default
    max_cache_size: int = 1000       # MB

    # --- Performance ---
    max_sample_size: int = 50000
    max_concurrent_requests: int = 10
    request_timeout: int = 300       # seconds

    # --- Extra keyword settings forwarded to Gradio ---
    gradio_settings: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_env(cls) -> "ServerConfig":
        """Build a ServerConfig from ``HF_EDA_*`` / ``HF_TOKEN`` variables.

        Unset variables fall back to the dataclass defaults above.
        """
        env = os.getenv
        cfg = cls()

        # Server settings
        cfg.port = int(env("HF_EDA_PORT", cfg.port))
        cfg.host = env("HF_EDA_HOST", cfg.host)
        # Booleans: only the case-insensitive literal "true" enables.
        cfg.mcp_server = env("HF_EDA_MCP_ENABLED", "true").lower() == "true"
        cfg.share = env("HF_EDA_SHARE", "false").lower() == "true"

        # Authentication
        cfg.hf_token = env("HF_TOKEN")

        # Logging (normalized, e.g. "debug" -> "DEBUG")
        cfg.log_level = env("HF_EDA_LOG_LEVEL", cfg.log_level).upper()

        # Cache settings
        cfg.cache_dir = env("HF_EDA_CACHE_DIR")
        cfg.max_cache_size = int(env("HF_EDA_MAX_CACHE_SIZE", cfg.max_cache_size))

        # Performance settings
        cfg.max_sample_size = int(env("HF_EDA_MAX_SAMPLE_SIZE", cfg.max_sample_size))
        cfg.max_concurrent_requests = int(
            env("HF_EDA_MAX_CONCURRENT", cfg.max_concurrent_requests)
        )
        cfg.request_timeout = int(env("HF_EDA_REQUEST_TIMEOUT", cfg.request_timeout))

        return cfg
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def setup_logging(config: "ServerConfig") -> logging.Logger:
    """Initialize application-wide logging and return this module's logger.

    Configures the root logger to write to stdout using the level and
    format taken from *config*, then caps noisy third-party libraries
    at WARNING.

    Args:
        config: Server configuration providing ``log_level`` and
            ``log_format``.

    Returns:
        The logger for this module.
    """
    # Resolve the level name defensively: an unknown name (e.g. "TRACE",
    # or a lowercase value on a directly-constructed config) previously
    # raised AttributeError here; fall back to INFO instead of crashing.
    level = getattr(logging, str(config.log_level).upper(), None)
    if not isinstance(level, int):
        level = logging.INFO

    logging.basicConfig(
        level=level,
        format=config.log_format,
        handlers=[
            logging.StreamHandler(sys.stdout),
        ],
    )

    logger = logging.getLogger(__name__)

    # External libraries are chatty at INFO/DEBUG; keep them at WARNING.
    for noisy in ("gradio", "httpx", "urllib3"):
        logging.getLogger(noisy).setLevel(logging.WARNING)

    return logger
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def validate_config(config: "ServerConfig") -> None:
    """Validate server configuration and log warnings for potential issues.

    Hard failures (an unusable cache directory) raise; everything else
    is only reported via logging so startup can continue.

    Args:
        config: The configuration to check.

    Raises:
        OSError: If the cache directory cannot be created.
        PermissionError: If the cache directory exists but is not writable.
    """
    logger = logging.getLogger(__name__)

    # Ports below 1024 need elevated privileges; above 65535 is invalid.
    if not (1024 <= config.port <= 65535):
        logger.warning(
            f"Port {config.port} may require elevated privileges or be invalid"
        )

    # Ensure the cache directory exists and is writable.
    if config.cache_dir:
        try:
            os.makedirs(config.cache_dir, exist_ok=True)
        except OSError as e:  # narrowed from a blanket Exception catch
            logger.error(
                f"Failed to create/access cache directory {config.cache_dir}: {e}"
            )
            raise
        if not os.access(config.cache_dir, os.W_OK):
            # Raised outside the try above so it is logged exactly once
            # (previously it was re-caught and logged a second time).
            logger.error(f"Cache directory {config.cache_dir} is not writable")
            raise PermissionError(
                f"Cannot write to cache directory: {config.cache_dir}"
            )

    # Performance sanity checks (warnings only).
    if config.max_sample_size > 100000:
        logger.warning(
            f"Large max_sample_size ({config.max_sample_size}) may cause memory issues"
        )

    if config.request_timeout < 30:
        logger.warning(
            f"Short request timeout ({config.request_timeout}s) may cause failures for large datasets"
        )

    # Authentication status.
    if not config.hf_token:
        logger.warning(
            "No HuggingFace token configured - only public datasets will be accessible"
        )
        logger.info("Set HF_TOKEN environment variable to access private datasets")
    else:
        logger.info("HuggingFace token configured - private datasets accessible")
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
# Global configuration instance
|
| 150 |
+
_global_config: Optional[ServerConfig] = None
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def get_config() -> ServerConfig:
    """Return the process-wide configuration, creating it on first use.

    The instance is built lazily from environment variables the first
    time this is called and reused for the rest of the process.
    """
    global _global_config
    cfg = _global_config
    if cfg is None:
        cfg = ServerConfig.from_env()
        _global_config = cfg
    return cfg
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def set_config(config: ServerConfig) -> None:
    """Set the global configuration instance."""
    # Installs *config* as the module-level singleton so subsequent
    # get_config() calls return it instead of lazily building one
    # from environment variables.
    global _global_config
    _global_config = config
|
src/hf_eda_mcp/server.py
CHANGED
|
@@ -6,26 +6,22 @@ creating Gradio interfaces for EDA tools and enabling MCP server functionality.
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
-
import
|
| 10 |
-
import
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Import EDA tools - these will be automatically exposed as MCP tools
|
| 13 |
from hf_eda_mcp.tools.metadata import get_dataset_metadata
|
| 14 |
from hf_eda_mcp.tools.sampling import get_dataset_sample
|
| 15 |
from hf_eda_mcp.tools.analysis import analyze_dataset_features
|
| 16 |
|
| 17 |
-
# Configure logging
|
| 18 |
-
logging.basicConfig(level=logging.INFO)
|
| 19 |
-
logger = logging.getLogger(__name__)
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
|
| 25 |
# These functions will be automatically exposed as MCP tools when mcp_server=True
|
| 26 |
|
| 27 |
|
| 28 |
-
def create_gradio_app() -> gr.Blocks:
|
| 29 |
"""Create and configure the main Gradio application with MCP server."""
|
| 30 |
|
| 31 |
# Create main app with MCP tool interfaces
|
|
@@ -55,14 +51,14 @@ def create_gradio_app() -> gr.Blocks:
|
|
| 55 |
fn=get_dataset_metadata,
|
| 56 |
inputs=[
|
| 57 |
gr.Textbox(
|
| 58 |
-
label="dataset_id",
|
| 59 |
placeholder="e.g., imdb, squad, glue",
|
| 60 |
-
info="HuggingFace dataset identifier"
|
| 61 |
),
|
| 62 |
gr.Textbox(
|
| 63 |
-
label="config_name",
|
| 64 |
placeholder="e.g., cola, sst2 (optional)",
|
| 65 |
-
info="Configuration name for multi-config datasets"
|
| 66 |
),
|
| 67 |
],
|
| 68 |
outputs=gr.JSON(label="Dataset Metadata"),
|
|
@@ -81,9 +77,9 @@ def create_gradio_app() -> gr.Blocks:
|
|
| 81 |
fn=get_dataset_sample,
|
| 82 |
inputs=[
|
| 83 |
gr.Textbox(
|
| 84 |
-
label="dataset_id",
|
| 85 |
placeholder="e.g., imdb, squad, glue",
|
| 86 |
-
info="HuggingFace dataset identifier"
|
| 87 |
),
|
| 88 |
gr.Dropdown(
|
| 89 |
choices=["train", "validation", "test", "dev", "val"],
|
|
@@ -101,9 +97,9 @@ def create_gradio_app() -> gr.Blocks:
|
|
| 101 |
info="Number of samples to retrieve (max: 10000 for MCP)",
|
| 102 |
),
|
| 103 |
gr.Textbox(
|
| 104 |
-
label="config_name",
|
| 105 |
placeholder="e.g., cola, sst2 (optional)",
|
| 106 |
-
info="Configuration name for multi-config datasets"
|
| 107 |
),
|
| 108 |
],
|
| 109 |
outputs=gr.JSON(label="Dataset Sample"),
|
|
@@ -122,9 +118,9 @@ def create_gradio_app() -> gr.Blocks:
|
|
| 122 |
fn=analyze_dataset_features,
|
| 123 |
inputs=[
|
| 124 |
gr.Textbox(
|
| 125 |
-
label="dataset_id",
|
| 126 |
placeholder="e.g., imdb, squad, glue",
|
| 127 |
-
info="HuggingFace dataset identifier"
|
| 128 |
),
|
| 129 |
gr.Dropdown(
|
| 130 |
choices=["train", "validation", "test", "dev", "val"],
|
|
@@ -142,9 +138,9 @@ def create_gradio_app() -> gr.Blocks:
|
|
| 142 |
info="Number of samples to use for analysis (max: 50000 for MCP)",
|
| 143 |
),
|
| 144 |
gr.Textbox(
|
| 145 |
-
label="config_name",
|
| 146 |
placeholder="e.g., cola, sst2 (optional)",
|
| 147 |
-
info="Configuration name for multi-config datasets"
|
| 148 |
),
|
| 149 |
],
|
| 150 |
outputs=gr.JSON(label="Analysis Results"),
|
|
@@ -193,8 +189,11 @@ def create_gradio_app() -> gr.Blocks:
|
|
| 193 |
### Server Status
|
| 194 |
|
| 195 |
- **MCP Tools**: 3 tools available
|
| 196 |
-
- **Authentication**: {"✅ Token configured" if
|
| 197 |
- **MCP Schema**: Available at `/gradio_api/mcp/schema`
|
|
|
|
|
|
|
|
|
|
| 198 |
"""
|
| 199 |
)
|
| 200 |
|
|
@@ -202,56 +201,131 @@ def create_gradio_app() -> gr.Blocks:
|
|
| 202 |
|
| 203 |
|
| 204 |
def launch_server(
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
| 206 |
) -> None:
|
| 207 |
"""
|
| 208 |
Launch the Gradio app with MCP server enabled.
|
| 209 |
|
| 210 |
Args:
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
|
|
|
| 214 |
"""
|
| 215 |
-
|
| 216 |
-
|
|
|
|
| 217 |
|
| 218 |
-
#
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
# Create the Gradio app
|
| 226 |
-
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
# Configure launch parameters
|
| 230 |
launch_kwargs = {
|
| 231 |
-
"
|
| 232 |
-
"
|
|
|
|
| 233 |
"show_error": True,
|
| 234 |
"quiet": False,
|
| 235 |
"show_api": True, # Enable API documentation
|
|
|
|
| 236 |
}
|
| 237 |
|
|
|
|
|
|
|
|
|
|
| 238 |
# Add MCP server configuration
|
| 239 |
-
if mcp_server:
|
| 240 |
launch_kwargs["mcp_server"] = True
|
| 241 |
-
logger.info("MCP server functionality enabled")
|
| 242 |
-
logger.info("MCP tools
|
| 243 |
-
logger.info("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
else:
|
| 245 |
-
logger.info("Running in web-only mode (MCP disabled)")
|
| 246 |
|
| 247 |
# Launch the server
|
| 248 |
try:
|
| 249 |
-
logger.info("Launching Gradio application...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
app.launch(**launch_kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
except Exception as e:
|
| 252 |
-
logger.error(f"Failed to launch server: {e}")
|
| 253 |
-
|
|
|
|
| 254 |
|
| 255 |
|
| 256 |
if __name__ == "__main__":
|
| 257 |
-
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
+
import sys
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
# Import configuration
|
| 13 |
+
from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config
|
| 14 |
|
| 15 |
# Import EDA tools - these will be automatically exposed as MCP tools
|
| 16 |
from hf_eda_mcp.tools.metadata import get_dataset_metadata
|
| 17 |
from hf_eda_mcp.tools.sampling import get_dataset_sample
|
| 18 |
from hf_eda_mcp.tools.analysis import analyze_dataset_features
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# These functions will be automatically exposed as MCP tools when mcp_server=True
|
| 22 |
|
| 23 |
|
| 24 |
+
def create_gradio_app(config: ServerConfig) -> gr.Blocks:
|
| 25 |
"""Create and configure the main Gradio application with MCP server."""
|
| 26 |
|
| 27 |
# Create main app with MCP tool interfaces
|
|
|
|
| 51 |
fn=get_dataset_metadata,
|
| 52 |
inputs=[
|
| 53 |
gr.Textbox(
|
| 54 |
+
label="dataset_id",
|
| 55 |
placeholder="e.g., imdb, squad, glue",
|
| 56 |
+
info="HuggingFace dataset identifier",
|
| 57 |
),
|
| 58 |
gr.Textbox(
|
| 59 |
+
label="config_name",
|
| 60 |
placeholder="e.g., cola, sst2 (optional)",
|
| 61 |
+
info="Configuration name for multi-config datasets",
|
| 62 |
),
|
| 63 |
],
|
| 64 |
outputs=gr.JSON(label="Dataset Metadata"),
|
|
|
|
| 77 |
fn=get_dataset_sample,
|
| 78 |
inputs=[
|
| 79 |
gr.Textbox(
|
| 80 |
+
label="dataset_id",
|
| 81 |
placeholder="e.g., imdb, squad, glue",
|
| 82 |
+
info="HuggingFace dataset identifier",
|
| 83 |
),
|
| 84 |
gr.Dropdown(
|
| 85 |
choices=["train", "validation", "test", "dev", "val"],
|
|
|
|
| 97 |
info="Number of samples to retrieve (max: 10000 for MCP)",
|
| 98 |
),
|
| 99 |
gr.Textbox(
|
| 100 |
+
label="config_name",
|
| 101 |
placeholder="e.g., cola, sst2 (optional)",
|
| 102 |
+
info="Configuration name for multi-config datasets",
|
| 103 |
),
|
| 104 |
],
|
| 105 |
outputs=gr.JSON(label="Dataset Sample"),
|
|
|
|
| 118 |
fn=analyze_dataset_features,
|
| 119 |
inputs=[
|
| 120 |
gr.Textbox(
|
| 121 |
+
label="dataset_id",
|
| 122 |
placeholder="e.g., imdb, squad, glue",
|
| 123 |
+
info="HuggingFace dataset identifier",
|
| 124 |
),
|
| 125 |
gr.Dropdown(
|
| 126 |
choices=["train", "validation", "test", "dev", "val"],
|
|
|
|
| 138 |
info="Number of samples to use for analysis (max: 50000 for MCP)",
|
| 139 |
),
|
| 140 |
gr.Textbox(
|
| 141 |
+
label="config_name",
|
| 142 |
placeholder="e.g., cola, sst2 (optional)",
|
| 143 |
+
info="Configuration name for multi-config datasets",
|
| 144 |
),
|
| 145 |
],
|
| 146 |
outputs=gr.JSON(label="Analysis Results"),
|
|
|
|
| 189 |
### Server Status
|
| 190 |
|
| 191 |
- **MCP Tools**: 3 tools available
|
| 192 |
+
- **Authentication**: {"✅ Token configured" if config.hf_token else "⚠️ No token (public datasets only)"}
|
| 193 |
- **MCP Schema**: Available at `/gradio_api/mcp/schema`
|
| 194 |
+
- **Cache Directory**: {config.cache_dir or "Default system cache"}
|
| 195 |
+
- **Max Sample Size**: {config.max_sample_size:,}
|
| 196 |
+
- **Request Timeout**: {config.request_timeout}s
|
| 197 |
"""
|
| 198 |
)
|
| 199 |
|
|
|
|
| 201 |
|
| 202 |
|
| 203 |
def launch_server(
|
| 204 |
+
config: Optional[ServerConfig] = None,
|
| 205 |
+
port: Optional[int] = None,
|
| 206 |
+
mcp_server: Optional[bool] = None,
|
| 207 |
+
share: Optional[bool] = None,
|
| 208 |
) -> None:
|
| 209 |
"""
|
| 210 |
Launch the Gradio app with MCP server enabled.
|
| 211 |
|
| 212 |
Args:
|
| 213 |
+
config: Server configuration object. If None, loads from environment
|
| 214 |
+
port: Port to run the server on (overrides config)
|
| 215 |
+
mcp_server: Whether to enable MCP server functionality (overrides config)
|
| 216 |
+
share: Whether to create a public shareable link (overrides config)
|
| 217 |
"""
|
| 218 |
+
# Load configuration
|
| 219 |
+
if config is None:
|
| 220 |
+
config = ServerConfig.from_env()
|
| 221 |
|
| 222 |
+
# Override config with explicit parameters
|
| 223 |
+
if port is not None:
|
| 224 |
+
config.port = port
|
| 225 |
+
if mcp_server is not None:
|
| 226 |
+
config.mcp_server = mcp_server
|
| 227 |
+
if share is not None:
|
| 228 |
+
config.share = share
|
| 229 |
+
|
| 230 |
+
# Set global configuration for tools to use
|
| 231 |
+
set_config(config)
|
| 232 |
+
|
| 233 |
+
# Set up logging
|
| 234 |
+
logger = setup_logging(config)
|
| 235 |
+
|
| 236 |
+
logger.info("=" * 60)
|
| 237 |
+
logger.info("🚀 Starting HuggingFace EDA MCP Server")
|
| 238 |
+
logger.info("=" * 60)
|
| 239 |
+
|
| 240 |
+
# Validate configuration
|
| 241 |
+
try:
|
| 242 |
+
validate_config(config)
|
| 243 |
+
except Exception as e:
|
| 244 |
+
logger.error(f"Configuration validation failed: {e}")
|
| 245 |
+
sys.exit(1)
|
| 246 |
+
|
| 247 |
+
# Log configuration
|
| 248 |
+
logger.info("Server configuration:")
|
| 249 |
+
logger.info(f" - Host: {config.host}")
|
| 250 |
+
logger.info(f" - Port: {config.port}")
|
| 251 |
+
logger.info(f" - MCP server enabled: {config.mcp_server}")
|
| 252 |
+
logger.info(f" - Share enabled: {config.share}")
|
| 253 |
+
logger.info(f" - Log level: {config.log_level}")
|
| 254 |
+
logger.info(f" - Cache directory: {config.cache_dir or 'Default system cache'}")
|
| 255 |
+
logger.info(f" - Max sample size: {config.max_sample_size:,}")
|
| 256 |
+
logger.info(f" - Request timeout: {config.request_timeout}s")
|
| 257 |
+
logger.info(f" - Max concurrent requests: {config.max_concurrent_requests}")
|
| 258 |
|
| 259 |
# Create the Gradio app
|
| 260 |
+
try:
|
| 261 |
+
logger.info("Creating Gradio application with EDA tools...")
|
| 262 |
+
app = create_gradio_app(config)
|
| 263 |
+
logger.info("✅ Gradio application created successfully")
|
| 264 |
+
except Exception as e:
|
| 265 |
+
logger.error(f"Failed to create Gradio application: {e}")
|
| 266 |
+
logger.exception("Full traceback:")
|
| 267 |
+
sys.exit(1)
|
| 268 |
|
| 269 |
# Configure launch parameters
|
| 270 |
launch_kwargs = {
|
| 271 |
+
"server_name": config.host,
|
| 272 |
+
"server_port": config.port,
|
| 273 |
+
"share": config.share,
|
| 274 |
"show_error": True,
|
| 275 |
"quiet": False,
|
| 276 |
"show_api": True, # Enable API documentation
|
| 277 |
+
"max_threads": config.max_concurrent_requests,
|
| 278 |
}
|
| 279 |
|
| 280 |
+
# Add additional Gradio settings from config
|
| 281 |
+
launch_kwargs.update(config.gradio_settings)
|
| 282 |
+
|
| 283 |
# Add MCP server configuration
|
| 284 |
+
if config.mcp_server:
|
| 285 |
launch_kwargs["mcp_server"] = True
|
| 286 |
+
logger.info("🔗 MCP server functionality enabled")
|
| 287 |
+
logger.info("📊 MCP tools available:")
|
| 288 |
+
logger.info(" - get_dataset_metadata: Retrieve dataset information")
|
| 289 |
+
logger.info(" - get_dataset_sample: Sample data from datasets")
|
| 290 |
+
logger.info(" - analyze_dataset_features: Perform EDA analysis")
|
| 291 |
+
logger.info(
|
| 292 |
+
f"🌐 MCP schema available at: http://{config.host}:{config.port}/gradio_api/mcp/schema"
|
| 293 |
+
)
|
| 294 |
else:
|
| 295 |
+
logger.info("🌐 Running in web-only mode (MCP disabled)")
|
| 296 |
|
| 297 |
# Launch the server
|
| 298 |
try:
|
| 299 |
+
logger.info("🚀 Launching Gradio application...")
|
| 300 |
+
logger.info(f"🌐 Web interface: http://{config.host}:{config.port}")
|
| 301 |
+
if config.share:
|
| 302 |
+
logger.info("🌍 Public sharing enabled - shareable link will be generated")
|
| 303 |
+
|
| 304 |
+
logger.info("=" * 60)
|
| 305 |
+
logger.info("Server is starting... Press Ctrl+C to stop")
|
| 306 |
+
logger.info("=" * 60)
|
| 307 |
+
|
| 308 |
app.launch(**launch_kwargs)
|
| 309 |
+
|
| 310 |
+
except KeyboardInterrupt:
|
| 311 |
+
logger.info("👋 Server stopped by user (Ctrl+C)")
|
| 312 |
+
sys.exit(0)
|
| 313 |
+
except OSError as e:
|
| 314 |
+
if "Address already in use" in str(e):
|
| 315 |
+
logger.error(f"❌ Port {config.port} is already in use")
|
| 316 |
+
logger.info(
|
| 317 |
+
"💡 Try using a different port with --port or HF_EDA_PORT environment variable"
|
| 318 |
+
)
|
| 319 |
+
else:
|
| 320 |
+
logger.error(f"❌ Network error: {e}")
|
| 321 |
+
sys.exit(1)
|
| 322 |
except Exception as e:
|
| 323 |
+
logger.error(f"❌ Failed to launch server: {e}")
|
| 324 |
+
logger.exception("Full traceback:")
|
| 325 |
+
sys.exit(1)
|
| 326 |
|
| 327 |
|
| 328 |
if __name__ == "__main__":
|
| 329 |
+
# Load configuration from environment and launch server
|
| 330 |
+
config = ServerConfig.from_env()
|
| 331 |
+
launch_server(config)
|
src/hf_eda_mcp/tools/analysis.py
CHANGED
|
@@ -9,6 +9,7 @@ import logging
|
|
| 9 |
import statistics
|
| 10 |
from typing import Optional, Dict, Any, List
|
| 11 |
from collections import Counter
|
|
|
|
| 12 |
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 13 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
|
| 14 |
|
|
@@ -17,17 +18,20 @@ logger = logging.getLogger(__name__)
|
|
| 17 |
# Global dataset service instance
|
| 18 |
_dataset_service: Optional[DatasetService] = None
|
| 19 |
|
| 20 |
-
#
|
| 21 |
DEFAULT_ANALYSIS_SAMPLE_SIZE = 1000
|
| 22 |
-
MAX_ANALYSIS_SAMPLE_SIZE = 50000
|
| 23 |
MAX_UNIQUE_VALUES_TO_SHOW = 20
|
| 24 |
|
| 25 |
|
| 26 |
def get_dataset_service() -> DatasetService:
|
| 27 |
-
"""Get or create the global dataset service instance."""
|
| 28 |
global _dataset_service
|
| 29 |
if _dataset_service is None:
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
return _dataset_service
|
| 32 |
|
| 33 |
|
|
@@ -531,8 +535,12 @@ def validate_analysis_inputs(
|
|
| 531 |
if sample_size <= 0:
|
| 532 |
raise ValueError("sample_size must be positive")
|
| 533 |
|
| 534 |
-
|
| 535 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
|
| 537 |
# Validate config_name
|
| 538 |
if config_name is not None:
|
|
|
|
| 9 |
import statistics
|
| 10 |
from typing import Optional, Dict, Any, List
|
| 11 |
from collections import Counter
|
| 12 |
+
from hf_eda_mcp.config import get_config
|
| 13 |
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 14 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
|
| 15 |
|
|
|
|
| 18 |
# Global dataset service instance
|
| 19 |
_dataset_service: Optional[DatasetService] = None
|
| 20 |
|
| 21 |
+
# Default constants (can be overridden by config)
|
| 22 |
DEFAULT_ANALYSIS_SAMPLE_SIZE = 1000
|
|
|
|
| 23 |
MAX_UNIQUE_VALUES_TO_SHOW = 20
|
| 24 |
|
| 25 |
|
| 26 |
def get_dataset_service() -> DatasetService:
|
| 27 |
+
"""Get or create the global dataset service instance using current config."""
|
| 28 |
global _dataset_service
|
| 29 |
if _dataset_service is None:
|
| 30 |
+
config = get_config()
|
| 31 |
+
_dataset_service = DatasetService(
|
| 32 |
+
cache_dir=config.cache_dir,
|
| 33 |
+
token=config.hf_token
|
| 34 |
+
)
|
| 35 |
return _dataset_service
|
| 36 |
|
| 37 |
|
|
|
|
| 535 |
if sample_size <= 0:
|
| 536 |
raise ValueError("sample_size must be positive")
|
| 537 |
|
| 538 |
+
# Get max sample size from config
|
| 539 |
+
config = get_config()
|
| 540 |
+
max_sample_size = config.max_sample_size
|
| 541 |
+
|
| 542 |
+
if sample_size > max_sample_size:
|
| 543 |
+
raise ValueError(f"sample_size cannot exceed {max_sample_size}")
|
| 544 |
|
| 545 |
# Validate config_name
|
| 546 |
if config_name is not None:
|
src/hf_eda_mcp/tools/metadata.py
CHANGED
|
@@ -7,6 +7,7 @@ HuggingFace datasets including size, features, splits, and configuration details
|
|
| 7 |
|
| 8 |
import logging
|
| 9 |
from typing import Optional, Dict, Any
|
|
|
|
| 10 |
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 11 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
|
| 12 |
|
|
@@ -17,10 +18,14 @@ _dataset_service: Optional[DatasetService] = None
|
|
| 17 |
|
| 18 |
|
| 19 |
def get_dataset_service() -> DatasetService:
|
| 20 |
-
"""Get or create the global dataset service instance."""
|
| 21 |
global _dataset_service
|
| 22 |
if _dataset_service is None:
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
return _dataset_service
|
| 25 |
|
| 26 |
|
|
|
|
| 7 |
|
| 8 |
import logging
|
| 9 |
from typing import Optional, Dict, Any
|
| 10 |
+
from hf_eda_mcp.config import get_config
|
| 11 |
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 12 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
|
| 13 |
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def get_dataset_service() -> DatasetService:
|
| 21 |
+
"""Get or create the global dataset service instance using current config."""
|
| 22 |
global _dataset_service
|
| 23 |
if _dataset_service is None:
|
| 24 |
+
config = get_config()
|
| 25 |
+
_dataset_service = DatasetService(
|
| 26 |
+
cache_dir=config.cache_dir,
|
| 27 |
+
token=config.hf_token
|
| 28 |
+
)
|
| 29 |
return _dataset_service
|
| 30 |
|
| 31 |
|
src/hf_eda_mcp/tools/sampling.py
CHANGED
|
@@ -7,6 +7,7 @@ with support for different splits, configurable sample sizes, and streaming for
|
|
| 7 |
|
| 8 |
import logging
|
| 9 |
from typing import Optional, Dict, Any, List
|
|
|
|
| 10 |
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 11 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
|
| 12 |
|
|
@@ -15,17 +16,19 @@ logger = logging.getLogger(__name__)
|
|
| 15 |
# Global dataset service instance
|
| 16 |
_dataset_service: Optional[DatasetService] = None
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
MAX_SAMPLE_SIZE = 10000 # Maximum samples to prevent memory issues
|
| 20 |
DEFAULT_SAMPLE_SIZE = 10
|
| 21 |
-
VALID_SPLITS = {
|
| 22 |
|
| 23 |
|
| 24 |
def get_dataset_service() -> DatasetService:
|
| 25 |
-
"""Get or create the global dataset service instance."""
|
| 26 |
global _dataset_service
|
| 27 |
if _dataset_service is None:
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
| 29 |
return _dataset_service
|
| 30 |
|
| 31 |
|
|
@@ -34,22 +37,22 @@ def get_dataset_sample(
|
|
| 34 |
split: str = "train",
|
| 35 |
num_samples: int = DEFAULT_SAMPLE_SIZE,
|
| 36 |
config_name: Optional[str] = None,
|
| 37 |
-
streaming: bool = True
|
| 38 |
) -> Dict[str, Any]:
|
| 39 |
"""
|
| 40 |
Retrieve a sample of rows from a HuggingFace dataset.
|
| 41 |
-
|
| 42 |
This function efficiently samples data from datasets with support for different
|
| 43 |
splits and configurable sample sizes. It uses streaming by default for large
|
| 44 |
datasets to minimize memory usage and loading time.
|
| 45 |
-
|
| 46 |
Args:
|
| 47 |
dataset_id: HuggingFace dataset identifier (e.g., 'imdb', 'squad', 'glue')
|
| 48 |
split: Dataset split to sample from (default: 'train')
|
| 49 |
num_samples: Number of samples to retrieve (default: 10, max: 10000)
|
| 50 |
config_name: Optional configuration name for multi-config datasets
|
| 51 |
streaming: Whether to use streaming mode for efficient loading (default: True)
|
| 52 |
-
|
| 53 |
Returns:
|
| 54 |
Dictionary containing sampled data and metadata:
|
| 55 |
- dataset_id: Original dataset identifier
|
|
@@ -60,31 +63,33 @@ def get_dataset_sample(
|
|
| 60 |
- data: List of sample dictionaries
|
| 61 |
- schema: Dictionary describing the dataset features/columns
|
| 62 |
- sample_info: Additional information about the sampling process
|
| 63 |
-
|
| 64 |
Raises:
|
| 65 |
ValueError: If inputs are invalid (empty dataset_id, invalid split, etc.)
|
| 66 |
DatasetNotFoundError: If dataset or split doesn't exist
|
| 67 |
AuthenticationError: If dataset is private and authentication fails
|
| 68 |
DatasetServiceError: If sampling fails for other reasons
|
| 69 |
-
|
| 70 |
Example:
|
| 71 |
>>> # Basic sampling
|
| 72 |
>>> sample = get_dataset_sample("imdb", split="train", num_samples=5)
|
| 73 |
>>> print(f"Got {sample['num_samples']} samples from {sample['dataset_id']}")
|
| 74 |
>>> for i, row in enumerate(sample['data']):
|
| 75 |
... print(f"Sample {i+1}: {list(row.keys())}")
|
| 76 |
-
|
| 77 |
>>> # Multi-config dataset sampling
|
| 78 |
-
>>> sample = get_dataset_sample("glue", split="validation",
|
| 79 |
... num_samples=3, config_name="cola")
|
| 80 |
>>> print(f"Schema: {sample['schema']}")
|
| 81 |
"""
|
| 82 |
# Input validation
|
| 83 |
validate_sampling_inputs(dataset_id, split, num_samples, config_name)
|
| 84 |
-
|
| 85 |
-
logger.info(
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
| 88 |
try:
|
| 89 |
# Get dataset service and load sample
|
| 90 |
service = get_dataset_service()
|
|
@@ -93,33 +98,40 @@ def get_dataset_sample(
|
|
| 93 |
split=split,
|
| 94 |
num_samples=num_samples,
|
| 95 |
config_name=config_name,
|
| 96 |
-
streaming=streaming
|
| 97 |
)
|
| 98 |
-
|
| 99 |
# Enhance the response with additional metadata
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
| 105 |
}
|
| 106 |
-
|
| 107 |
# Add data preview information
|
| 108 |
-
if sample_data[
|
| 109 |
-
first_sample = sample_data[
|
| 110 |
-
sample_data[
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
| 113 |
k: type(v).__name__ for k, v in first_sample.items()
|
| 114 |
-
}
|
|
|
|
|
|
|
| 115 |
}
|
| 116 |
-
|
| 117 |
# Add summary
|
| 118 |
-
sample_data[
|
| 119 |
-
|
| 120 |
-
logger.info(
|
|
|
|
|
|
|
| 121 |
return sample_data
|
| 122 |
-
|
| 123 |
except (DatasetNotFoundError, AuthenticationError):
|
| 124 |
# Re-raise these specific errors as-is
|
| 125 |
raise
|
|
@@ -132,23 +144,23 @@ def get_dataset_sample_with_indices(
|
|
| 132 |
dataset_id: str,
|
| 133 |
indices: List[int],
|
| 134 |
split: str = "train",
|
| 135 |
-
config_name: Optional[str] = None
|
| 136 |
) -> Dict[str, Any]:
|
| 137 |
"""
|
| 138 |
Retrieve specific samples by their indices from a HuggingFace dataset.
|
| 139 |
-
|
| 140 |
This function allows for targeted sampling by specifying exact row indices.
|
| 141 |
Note: This requires loading the dataset in non-streaming mode.
|
| 142 |
-
|
| 143 |
Args:
|
| 144 |
dataset_id: HuggingFace dataset identifier
|
| 145 |
indices: List of row indices to retrieve
|
| 146 |
split: Dataset split to sample from (default: 'train')
|
| 147 |
config_name: Optional configuration name for multi-config datasets
|
| 148 |
-
|
| 149 |
Returns:
|
| 150 |
Dictionary containing the requested samples and metadata
|
| 151 |
-
|
| 152 |
Raises:
|
| 153 |
ValueError: If inputs are invalid
|
| 154 |
DatasetServiceError: If sampling fails
|
|
@@ -156,116 +168,119 @@ def get_dataset_sample_with_indices(
|
|
| 156 |
# Input validation
|
| 157 |
if not indices or not isinstance(indices, list):
|
| 158 |
raise ValueError("indices must be a non-empty list")
|
| 159 |
-
|
| 160 |
if not all(isinstance(i, int) and i >= 0 for i in indices):
|
| 161 |
raise ValueError("All indices must be non-negative integers")
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
| 166 |
validate_sampling_inputs(dataset_id, split, len(indices), config_name)
|
| 167 |
-
|
| 168 |
logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
|
| 169 |
-
|
| 170 |
try:
|
| 171 |
from datasets import load_dataset
|
| 172 |
-
|
| 173 |
# Load dataset without streaming to access by index
|
| 174 |
dataset = load_dataset(
|
| 175 |
-
dataset_id,
|
| 176 |
-
name=config_name,
|
| 177 |
-
split=split,
|
| 178 |
-
streaming=False
|
| 179 |
)
|
| 180 |
-
|
| 181 |
# Validate indices are within bounds
|
| 182 |
max_index = max(indices)
|
| 183 |
if max_index >= len(dataset):
|
| 184 |
-
raise ValueError(
|
| 185 |
-
|
|
|
|
|
|
|
| 186 |
# Get samples by indices
|
| 187 |
samples = [dataset[i] for i in indices]
|
| 188 |
-
|
| 189 |
# Get dataset info for schema
|
| 190 |
service = get_dataset_service()
|
| 191 |
dataset_info = service.load_dataset_info(dataset_id, config_name)
|
| 192 |
-
|
| 193 |
# Prepare response
|
| 194 |
sample_data = {
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
}
|
| 207 |
}
|
| 208 |
-
|
| 209 |
-
sample_data[
|
| 210 |
-
|
| 211 |
return sample_data
|
| 212 |
-
|
| 213 |
except Exception as e:
|
| 214 |
logger.error(f"Failed to sample by indices from {dataset_id}: {str(e)}")
|
| 215 |
raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")
|
| 216 |
|
| 217 |
|
| 218 |
def validate_sampling_inputs(
|
| 219 |
-
dataset_id: str,
|
| 220 |
-
split: str,
|
| 221 |
-
num_samples: int,
|
| 222 |
-
config_name: Optional[str] = None
|
| 223 |
) -> None:
|
| 224 |
"""
|
| 225 |
Validate inputs for dataset sampling.
|
| 226 |
-
|
| 227 |
Args:
|
| 228 |
dataset_id: Dataset identifier to validate
|
| 229 |
split: Split name to validate
|
| 230 |
num_samples: Number of samples to validate
|
| 231 |
config_name: Optional configuration name to validate
|
| 232 |
-
|
| 233 |
Raises:
|
| 234 |
ValueError: If any input is invalid
|
| 235 |
"""
|
| 236 |
# Validate dataset_id
|
| 237 |
if not dataset_id or not isinstance(dataset_id, str):
|
| 238 |
raise ValueError("dataset_id must be a non-empty string")
|
| 239 |
-
|
| 240 |
dataset_id = dataset_id.strip()
|
| 241 |
if not dataset_id:
|
| 242 |
raise ValueError("dataset_id cannot be empty or whitespace")
|
| 243 |
-
|
| 244 |
# Validate split
|
| 245 |
if not split or not isinstance(split, str):
|
| 246 |
raise ValueError("split must be a non-empty string")
|
| 247 |
-
|
| 248 |
split = split.strip().lower()
|
| 249 |
if not split:
|
| 250 |
raise ValueError("split cannot be empty or whitespace")
|
| 251 |
-
|
| 252 |
# Note: We don't strictly enforce VALID_SPLITS as datasets may have custom split names
|
| 253 |
-
|
| 254 |
# Validate num_samples
|
| 255 |
if not isinstance(num_samples, int):
|
| 256 |
raise ValueError("num_samples must be an integer")
|
| 257 |
-
|
| 258 |
if num_samples <= 0:
|
| 259 |
raise ValueError("num_samples must be positive")
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
# Validate config_name
|
| 265 |
if config_name is not None:
|
| 266 |
if not isinstance(config_name, str):
|
| 267 |
raise ValueError("config_name must be a string")
|
| 268 |
-
|
| 269 |
config_name = config_name.strip()
|
| 270 |
if not config_name:
|
| 271 |
raise ValueError("config_name cannot be empty or whitespace")
|
|
@@ -274,57 +289,59 @@ def validate_sampling_inputs(
|
|
| 274 |
def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
|
| 275 |
"""Generate a human-readable summary of the sample data."""
|
| 276 |
summary_parts = []
|
| 277 |
-
|
| 278 |
# Basic info
|
| 279 |
summary_parts.append(f"Dataset: {sample_data.get('dataset_id', 'Unknown')}")
|
| 280 |
summary_parts.append(f"Split: {sample_data.get('split', 'Unknown')}")
|
| 281 |
-
|
| 282 |
-
if sample_data.get(
|
| 283 |
summary_parts.append(f"Config: {sample_data['config_name']}")
|
| 284 |
-
|
| 285 |
# Sample info
|
| 286 |
-
num_samples = sample_data.get(
|
| 287 |
-
requested = sample_data.get(
|
| 288 |
-
|
| 289 |
if num_samples == requested:
|
| 290 |
summary_parts.append(f"Samples: {num_samples}")
|
| 291 |
else:
|
| 292 |
summary_parts.append(f"Samples: {num_samples}/{requested} (truncated)")
|
| 293 |
-
|
| 294 |
# Schema info
|
| 295 |
-
schema = sample_data.get(
|
| 296 |
if schema:
|
| 297 |
summary_parts.append(f"Columns: {len(schema)}")
|
| 298 |
-
|
| 299 |
# Sampling strategy
|
| 300 |
-
sample_info = sample_data.get(
|
| 301 |
-
strategy = sample_info.get(
|
| 302 |
-
if strategy ==
|
| 303 |
summary_parts.append("Strategy: by indices")
|
| 304 |
-
elif strategy ==
|
| 305 |
summary_parts.append("Strategy: first N rows")
|
| 306 |
-
|
| 307 |
return " | ".join(summary_parts)
|
| 308 |
|
| 309 |
|
| 310 |
-
def get_available_splits(
|
|
|
|
|
|
|
| 311 |
"""
|
| 312 |
Get available splits for a dataset.
|
| 313 |
-
|
| 314 |
Args:
|
| 315 |
dataset_id: HuggingFace dataset identifier
|
| 316 |
config_name: Optional configuration name
|
| 317 |
-
|
| 318 |
Returns:
|
| 319 |
List of available split names
|
| 320 |
-
|
| 321 |
Raises:
|
| 322 |
DatasetServiceError: If unable to retrieve split information
|
| 323 |
"""
|
| 324 |
try:
|
| 325 |
service = get_dataset_service()
|
| 326 |
metadata = service.load_dataset_info(dataset_id, config_name)
|
| 327 |
-
return list(metadata.get(
|
| 328 |
except Exception as e:
|
| 329 |
logger.error(f"Failed to get splits for {dataset_id}: {str(e)}")
|
| 330 |
-
raise DatasetServiceError(f"Failed to get available splits: {str(e)}")
|
|
|
|
| 7 |
|
| 8 |
import logging
|
| 9 |
from typing import Optional, Dict, Any, List
|
| 10 |
+
from hf_eda_mcp.config import get_config
|
| 11 |
from hf_eda_mcp.services.dataset_service import DatasetService, DatasetServiceError
|
| 12 |
from hf_eda_mcp.integrations.hf_client import DatasetNotFoundError, AuthenticationError
|
| 13 |
|
|
|
|
| 16 |
# Global dataset service instance
|
| 17 |
_dataset_service: Optional[DatasetService] = None
|
| 18 |
|
| 19 |
+
# Default constants (can be overridden by config)
|
|
|
|
| 20 |
DEFAULT_SAMPLE_SIZE = 10
|
| 21 |
+
VALID_SPLITS = {"train", "validation", "test", "dev", "val"}
|
| 22 |
|
| 23 |
|
| 24 |
def get_dataset_service() -> DatasetService:
|
| 25 |
+
"""Get or create the global dataset service instance using current config."""
|
| 26 |
global _dataset_service
|
| 27 |
if _dataset_service is None:
|
| 28 |
+
config = get_config()
|
| 29 |
+
_dataset_service = DatasetService(
|
| 30 |
+
cache_dir=config.cache_dir, token=config.hf_token
|
| 31 |
+
)
|
| 32 |
return _dataset_service
|
| 33 |
|
| 34 |
|
|
|
|
| 37 |
split: str = "train",
|
| 38 |
num_samples: int = DEFAULT_SAMPLE_SIZE,
|
| 39 |
config_name: Optional[str] = None,
|
| 40 |
+
streaming: bool = True,
|
| 41 |
) -> Dict[str, Any]:
|
| 42 |
"""
|
| 43 |
Retrieve a sample of rows from a HuggingFace dataset.
|
| 44 |
+
|
| 45 |
This function efficiently samples data from datasets with support for different
|
| 46 |
splits and configurable sample sizes. It uses streaming by default for large
|
| 47 |
datasets to minimize memory usage and loading time.
|
| 48 |
+
|
| 49 |
Args:
|
| 50 |
dataset_id: HuggingFace dataset identifier (e.g., 'imdb', 'squad', 'glue')
|
| 51 |
split: Dataset split to sample from (default: 'train')
|
| 52 |
num_samples: Number of samples to retrieve (default: 10, max: 10000)
|
| 53 |
config_name: Optional configuration name for multi-config datasets
|
| 54 |
streaming: Whether to use streaming mode for efficient loading (default: True)
|
| 55 |
+
|
| 56 |
Returns:
|
| 57 |
Dictionary containing sampled data and metadata:
|
| 58 |
- dataset_id: Original dataset identifier
|
|
|
|
| 63 |
- data: List of sample dictionaries
|
| 64 |
- schema: Dictionary describing the dataset features/columns
|
| 65 |
- sample_info: Additional information about the sampling process
|
| 66 |
+
|
| 67 |
Raises:
|
| 68 |
ValueError: If inputs are invalid (empty dataset_id, invalid split, etc.)
|
| 69 |
DatasetNotFoundError: If dataset or split doesn't exist
|
| 70 |
AuthenticationError: If dataset is private and authentication fails
|
| 71 |
DatasetServiceError: If sampling fails for other reasons
|
| 72 |
+
|
| 73 |
Example:
|
| 74 |
>>> # Basic sampling
|
| 75 |
>>> sample = get_dataset_sample("imdb", split="train", num_samples=5)
|
| 76 |
>>> print(f"Got {sample['num_samples']} samples from {sample['dataset_id']}")
|
| 77 |
>>> for i, row in enumerate(sample['data']):
|
| 78 |
... print(f"Sample {i+1}: {list(row.keys())}")
|
| 79 |
+
|
| 80 |
>>> # Multi-config dataset sampling
|
| 81 |
+
>>> sample = get_dataset_sample("glue", split="validation",
|
| 82 |
... num_samples=3, config_name="cola")
|
| 83 |
>>> print(f"Schema: {sample['schema']}")
|
| 84 |
"""
|
| 85 |
# Input validation
|
| 86 |
validate_sampling_inputs(dataset_id, split, num_samples, config_name)
|
| 87 |
+
|
| 88 |
+
logger.info(
|
| 89 |
+
f"Sampling {num_samples} rows from dataset: {dataset_id}, "
|
| 90 |
+
f"split: {split}" + (f", config: {config_name}" if config_name else "")
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
try:
|
| 94 |
# Get dataset service and load sample
|
| 95 |
service = get_dataset_service()
|
|
|
|
| 98 |
split=split,
|
| 99 |
num_samples=num_samples,
|
| 100 |
config_name=config_name,
|
| 101 |
+
streaming=streaming,
|
| 102 |
)
|
| 103 |
+
|
| 104 |
# Enhance the response with additional metadata
|
| 105 |
+
config = get_config()
|
| 106 |
+
sample_data["sample_info"] = {
|
| 107 |
+
"streaming_used": streaming,
|
| 108 |
+
"sampling_strategy": "sequential_head", # We take first N samples
|
| 109 |
+
"max_sample_size": config.max_sample_size,
|
| 110 |
+
"truncated": sample_data["num_samples"] < sample_data["requested_samples"],
|
| 111 |
}
|
| 112 |
+
|
| 113 |
# Add data preview information
|
| 114 |
+
if sample_data["data"]:
|
| 115 |
+
first_sample = sample_data["data"][0]
|
| 116 |
+
sample_data["sample_info"]["preview"] = {
|
| 117 |
+
"columns": list(first_sample.keys())
|
| 118 |
+
if isinstance(first_sample, dict)
|
| 119 |
+
else [],
|
| 120 |
+
"first_sample_types": {
|
| 121 |
k: type(v).__name__ for k, v in first_sample.items()
|
| 122 |
+
}
|
| 123 |
+
if isinstance(first_sample, dict)
|
| 124 |
+
else {},
|
| 125 |
}
|
| 126 |
+
|
| 127 |
# Add summary
|
| 128 |
+
sample_data["summary"] = _generate_sample_summary(sample_data)
|
| 129 |
+
|
| 130 |
+
logger.info(
|
| 131 |
+
f"Successfully sampled {sample_data['num_samples']} rows from {dataset_id}"
|
| 132 |
+
)
|
| 133 |
return sample_data
|
| 134 |
+
|
| 135 |
except (DatasetNotFoundError, AuthenticationError):
|
| 136 |
# Re-raise these specific errors as-is
|
| 137 |
raise
|
|
|
|
| 144 |
dataset_id: str,
|
| 145 |
indices: List[int],
|
| 146 |
split: str = "train",
|
| 147 |
+
config_name: Optional[str] = None,
|
| 148 |
) -> Dict[str, Any]:
|
| 149 |
"""
|
| 150 |
Retrieve specific samples by their indices from a HuggingFace dataset.
|
| 151 |
+
|
| 152 |
This function allows for targeted sampling by specifying exact row indices.
|
| 153 |
Note: This requires loading the dataset in non-streaming mode.
|
| 154 |
+
|
| 155 |
Args:
|
| 156 |
dataset_id: HuggingFace dataset identifier
|
| 157 |
indices: List of row indices to retrieve
|
| 158 |
split: Dataset split to sample from (default: 'train')
|
| 159 |
config_name: Optional configuration name for multi-config datasets
|
| 160 |
+
|
| 161 |
Returns:
|
| 162 |
Dictionary containing the requested samples and metadata
|
| 163 |
+
|
| 164 |
Raises:
|
| 165 |
ValueError: If inputs are invalid
|
| 166 |
DatasetServiceError: If sampling fails
|
|
|
|
| 168 |
# Input validation
|
| 169 |
if not indices or not isinstance(indices, list):
|
| 170 |
raise ValueError("indices must be a non-empty list")
|
| 171 |
+
|
| 172 |
if not all(isinstance(i, int) and i >= 0 for i in indices):
|
| 173 |
raise ValueError("All indices must be non-negative integers")
|
| 174 |
+
|
| 175 |
+
config = get_config()
|
| 176 |
+
if len(indices) > config.max_sample_size:
|
| 177 |
+
raise ValueError(
|
| 178 |
+
f"Too many indices requested. Maximum: {config.max_sample_size}"
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
validate_sampling_inputs(dataset_id, split, len(indices), config_name)
|
| 182 |
+
|
| 183 |
logger.info(f"Sampling {len(indices)} specific indices from dataset: {dataset_id}")
|
| 184 |
+
|
| 185 |
try:
|
| 186 |
from datasets import load_dataset
|
| 187 |
+
|
| 188 |
# Load dataset without streaming to access by index
|
| 189 |
dataset = load_dataset(
|
| 190 |
+
dataset_id, name=config_name, split=split, streaming=False
|
|
|
|
|
|
|
|
|
|
| 191 |
)
|
| 192 |
+
|
| 193 |
# Validate indices are within bounds
|
| 194 |
max_index = max(indices)
|
| 195 |
if max_index >= len(dataset):
|
| 196 |
+
raise ValueError(
|
| 197 |
+
f"Index {max_index} is out of bounds for dataset with {len(dataset)} rows"
|
| 198 |
+
)
|
| 199 |
+
|
| 200 |
# Get samples by indices
|
| 201 |
samples = [dataset[i] for i in indices]
|
| 202 |
+
|
| 203 |
# Get dataset info for schema
|
| 204 |
service = get_dataset_service()
|
| 205 |
dataset_info = service.load_dataset_info(dataset_id, config_name)
|
| 206 |
+
|
| 207 |
# Prepare response
|
| 208 |
sample_data = {
|
| 209 |
+
"dataset_id": dataset_id,
|
| 210 |
+
"config_name": config_name,
|
| 211 |
+
"split": split,
|
| 212 |
+
"num_samples": len(samples),
|
| 213 |
+
"requested_indices": indices,
|
| 214 |
+
"data": samples,
|
| 215 |
+
"schema": dataset_info.get("features", {}),
|
| 216 |
+
"sample_info": {
|
| 217 |
+
"sampling_strategy": "by_indices",
|
| 218 |
+
"streaming_used": False,
|
| 219 |
+
"indices_requested": len(indices),
|
| 220 |
+
},
|
| 221 |
}
|
| 222 |
+
|
| 223 |
+
sample_data["summary"] = _generate_sample_summary(sample_data)
|
| 224 |
+
|
| 225 |
return sample_data
|
| 226 |
+
|
| 227 |
except Exception as e:
|
| 228 |
logger.error(f"Failed to sample by indices from {dataset_id}: {str(e)}")
|
| 229 |
raise DatasetServiceError(f"Failed to sample by indices: {str(e)}")
|
| 230 |
|
| 231 |
|
| 232 |
def validate_sampling_inputs(
    dataset_id: str, split: str, num_samples: int, config_name: Optional[str] = None
) -> None:
    """
    Validate inputs for dataset sampling.

    Args:
        dataset_id: Dataset identifier to validate
        split: Split name to validate
        num_samples: Number of samples to validate
        config_name: Optional configuration name to validate

    Raises:
        ValueError: If any input is invalid
    """
    # Validate dataset_id
    if not dataset_id or not isinstance(dataset_id, str):
        raise ValueError("dataset_id must be a non-empty string")

    dataset_id = dataset_id.strip()
    if not dataset_id:
        raise ValueError("dataset_id cannot be empty or whitespace")

    # Validate split
    if not split or not isinstance(split, str):
        raise ValueError("split must be a non-empty string")

    split = split.strip().lower()
    if not split:
        raise ValueError("split cannot be empty or whitespace")

    # Note: We don't strictly enforce VALID_SPLITS as datasets may have custom split names

    # Validate num_samples. Reject bool explicitly: bool is a subclass of int,
    # so isinstance(True, int) is True and True/False would otherwise slip
    # through as 1/0.
    if isinstance(num_samples, bool) or not isinstance(num_samples, int):
        raise ValueError("num_samples must be an integer")

    if num_samples <= 0:
        raise ValueError("num_samples must be positive")

    # Get max sample size from config (server-wide cap on sample requests)
    config = get_config()
    max_sample_size = config.max_sample_size

    if num_samples > max_sample_size:
        raise ValueError(f"num_samples cannot exceed {max_sample_size}")

    # Validate config_name (only when provided; None means the default config)
    if config_name is not None:
        if not isinstance(config_name, str):
            raise ValueError("config_name must be a string")

        config_name = config_name.strip()
        if not config_name:
            raise ValueError("config_name cannot be empty or whitespace")
|
|
|
|
| 289 |
def _generate_sample_summary(sample_data: Dict[str, Any]) -> str:
|
| 290 |
"""Generate a human-readable summary of the sample data."""
|
| 291 |
summary_parts = []
|
| 292 |
+
|
| 293 |
# Basic info
|
| 294 |
summary_parts.append(f"Dataset: {sample_data.get('dataset_id', 'Unknown')}")
|
| 295 |
summary_parts.append(f"Split: {sample_data.get('split', 'Unknown')}")
|
| 296 |
+
|
| 297 |
+
if sample_data.get("config_name"):
|
| 298 |
summary_parts.append(f"Config: {sample_data['config_name']}")
|
| 299 |
+
|
| 300 |
# Sample info
|
| 301 |
+
num_samples = sample_data.get("num_samples", 0)
|
| 302 |
+
requested = sample_data.get("requested_samples", num_samples)
|
| 303 |
+
|
| 304 |
if num_samples == requested:
|
| 305 |
summary_parts.append(f"Samples: {num_samples}")
|
| 306 |
else:
|
| 307 |
summary_parts.append(f"Samples: {num_samples}/{requested} (truncated)")
|
| 308 |
+
|
| 309 |
# Schema info
|
| 310 |
+
schema = sample_data.get("schema", {})
|
| 311 |
if schema:
|
| 312 |
summary_parts.append(f"Columns: {len(schema)}")
|
| 313 |
+
|
| 314 |
# Sampling strategy
|
| 315 |
+
sample_info = sample_data.get("sample_info", {})
|
| 316 |
+
strategy = sample_info.get("sampling_strategy", "unknown")
|
| 317 |
+
if strategy == "by_indices":
|
| 318 |
summary_parts.append("Strategy: by indices")
|
| 319 |
+
elif strategy == "sequential_head":
|
| 320 |
summary_parts.append("Strategy: first N rows")
|
| 321 |
+
|
| 322 |
return " | ".join(summary_parts)
|
| 323 |
|
| 324 |
|
| 325 |
+
def get_available_splits(
    dataset_id: str, config_name: Optional[str] = None
) -> List[str]:
    """
    Get available splits for a dataset.

    Args:
        dataset_id: HuggingFace dataset identifier
        config_name: Optional configuration name

    Returns:
        List of available split names

    Raises:
        DatasetServiceError: If unable to retrieve split information
    """
    try:
        service = get_dataset_service()
        metadata = service.load_dataset_info(dataset_id, config_name)
        return list(metadata.get("splits", {}).keys())
    except DatasetServiceError:
        # Already a service-level error; re-raise as-is instead of double-wrapping.
        raise
    except Exception as e:
        logger.error(f"Failed to get splits for {dataset_id}: {str(e)}")
        # Chain the original exception so the root cause survives in tracebacks.
        raise DatasetServiceError(f"Failed to get available splits: {str(e)}") from e
|