# KhalilGuetari's picture
# Add badges to the app
# ea0aee9
"""
Main Gradio application with MCP server functionality.
This module provides the main entry point for the hf-eda-mcp server,
creating Gradio interfaces for EDA tools and enabling MCP server functionality.
"""
import gradio as gr
import sys
from typing import Optional
from hf_eda_mcp.tools.metadata import get_dataset_metadata
from hf_eda_mcp.tools.sampling import get_dataset_sample
from hf_eda_mcp.tools.analysis import analyze_dataset_features
from hf_eda_mcp.tools.search import search_text_in_dataset
from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config
# Split names offered by every "split" dropdown. Custom values stay allowed,
# so this list is a convenience, not a restriction.
_SPLIT_CHOICES = ["train", "validation", "test", "dev", "val"]


def _dataset_id_input() -> gr.Textbox:
    """Build the ``dataset_id`` textbox shared by every tool form."""
    return gr.Textbox(
        label="dataset_id",
        placeholder="e.g., imdb, squad, glue",
        info="HuggingFace dataset identifier",
    )


def _config_name_input(
    placeholder: str = "e.g., cola, sst2 (optional)",
    info: str = "Configuration name for multi-config datasets",
) -> gr.Textbox:
    """Build the ``config_name`` textbox; the defaults describe an optional field."""
    return gr.Textbox(label="config_name", placeholder=placeholder, info=info)


def _split_input(info: str) -> gr.Dropdown:
    """Build the ``split`` dropdown, pre-selecting ``train``."""
    return gr.Dropdown(
        choices=list(_SPLIT_CHOICES),
        value="train",
        label="split",
        info=info,
        allow_custom_value=True,
    )


def create_gradio_app(config: ServerConfig) -> gr.Blocks:
    """Create and configure the main Gradio application with MCP server.

    The UI consists of a title header, a row of badge links, one tab per EDA
    tool (each tool function wrapped in a ``gr.Interface``), and an "About"
    tab that displays a few values read from ``config``.

    Args:
        config: Server configuration. Only ``cache_dir``, ``max_sample_size``
            and ``request_timeout`` are read here, for display in the
            "About" tab.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` application.
    """
    # Create main app with MCP tool interfaces
    with gr.Blocks(
        title="HF EDA MCP Server",
    ) as app:
        gr.Markdown(
            """
            # 📊 HuggingFace EDA MCP Server

            **MCP server for exploratory data analysis of HuggingFace datasets**

            This server provides four tools for dataset exploration that are automatically exposed as MCP tools.
            """
        )

        with gr.Row():
            gr.HTML(
                """
                <div style="display: flex; gap: 8px; justify-content: center; flex-wrap: wrap;">
                    <a href="https://www.youtube.com/watch?v=XdP7zGSb81k" target="_blank">
                        <img src="https://img.shields.io/badge/▶️_Demo_Video-FF0000?style=for-the-badge&logo=youtube&logoColor=white" alt="Demo Video">
                    </a>
                    <a href="https://www.linkedin.com/posts/khalil-guetari-00a61415a_mcp-server-for-huggingface-datasets-discovery-activity-7400587711838842880-2K8p" target="_blank">
                        <img src="https://img.shields.io/badge/LinkedIn_Post-0A66C2?style=for-the-badge&logo=linkedin&logoColor=white" alt="LinkedIn Post">
                    </a>
                    <a href="https://huggingface.co/spaces/MCP-1st-Birthday/hf-eda-mcp/blob/main/README.md" target="_blank">
                        <img src="https://img.shields.io/badge/📖_README-FFD21E?style=for-the-badge" alt="README">
                    </a>
                </div>
                """
            )

        # Create interfaces for each EDA tool - these will be automatically
        # exposed as MCP tools
        with gr.Tab("📊 Dataset Metadata"):
            gr.Interface(
                fn=get_dataset_metadata,
                inputs=[
                    _dataset_id_input(),
                    _config_name_input(),
                ],
                outputs=gr.JSON(label="Dataset Metadata"),
                title="Get Dataset Metadata",
                description="Retrieve comprehensive metadata for a HuggingFace dataset including size, features, splits, and configuration details.",
                examples=[
                    ["imdb", ""],
                    ["glue", "cola"],
                    ["squad", ""],
                    ["wikitext", "wikitext-2-raw-v1"],
                ],
            )

        with gr.Tab("🔍 Dataset Sampling"):
            gr.Interface(
                fn=get_dataset_sample,
                inputs=[
                    _dataset_id_input(),
                    _split_input("Dataset split to sample from"),
                    gr.Slider(
                        minimum=1,
                        maximum=1000,
                        value=10,
                        step=1,
                        label="num_samples",
                        info="Number of samples to retrieve (max: 10000 for MCP)",
                    ),
                    _config_name_input(),
                ],
                outputs=gr.JSON(label="Dataset Sample"),
                title="Get Dataset Sample",
                description="Retrieve a sample of rows from a HuggingFace dataset with support for different splits and configurable sample sizes.",
                examples=[
                    ["imdb", "train", 5, ""],
                    ["glue", "validation", 3, "cola"],
                    ["squad", "train", 2, ""],
                    ["wikitext", "test", 1, "wikitext-2-raw-v1"],
                ],
            )

        with gr.Tab("📈 Feature Analysis"):
            gr.Interface(
                fn=analyze_dataset_features,
                inputs=[
                    _dataset_id_input(),
                    _split_input("Dataset split to analyze"),
                    gr.Slider(
                        minimum=100,
                        maximum=10000,
                        value=1000,
                        step=100,
                        label="sample_size",
                        info="Number of samples to use for analysis (max: 50000 for MCP)",
                    ),
                    _config_name_input(),
                ],
                outputs=gr.JSON(label="Analysis Results"),
                title="Analyze Dataset Features",
                description="Perform basic exploratory analysis on dataset features including statistics, missing values, and data quality assessment.",
                examples=[
                    ["imdb", "train", 1000, ""],
                    ["glue", "train", 500, "cola"],
                    ["squad", "validation", 800, ""],
                    ["wikitext", "train", 1200, "wikitext-2-raw-v1"],
                ],
            )

        with gr.Tab("🔎 Text Search"):
            gr.Interface(
                fn=search_text_in_dataset,
                inputs=[
                    _dataset_id_input(),
                    # Unlike the other tools, search needs an explicit config.
                    _config_name_input(
                        placeholder="e.g., cola, sst2",
                        info="Configuration name (required for search)",
                    ),
                    _split_input("Dataset split to search in"),
                    gr.Textbox(
                        label="query",
                        placeholder="Enter search query...",
                        info="Text to search for in the dataset",
                    ),
                    gr.Slider(
                        minimum=0,
                        maximum=1000,
                        value=0,
                        step=10,
                        label="offset",
                        info="Offset for pagination",
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=10,
                        step=1,
                        label="length",
                        info="Number of results to return",
                    ),
                ],
                outputs=gr.JSON(label="Search Results"),
                title="Search Text in Dataset",
                description="Search for text in text columns of a dataset. Only text columns are searched and only parquet datasets are supported.",
                examples=[
                    ["stanfordnlp/imdb", "plain_text", "train", "great movie", 0, 10],
                    ["rajpurkar/squad", "plain_text", "train", "president", 0, 5],
                    ["nyu-mll/glue", "cola", "train", "friends", 0, 10],
                ],
            )

        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                f"""
                ## About HF EDA MCP Server

                This server implements the Model Context Protocol (MCP) to provide AI assistants
                with tools for exploring and analyzing HuggingFace datasets.

                ### Available MCP Tools

                1. **get_dataset_metadata**: Retrieve comprehensive dataset information
                2. **get_dataset_sample**: Sample data from datasets with configurable parameters
                3. **analyze_dataset_features**: Perform exploratory data analysis
                4. **search_text_in_dataset**: Search for text in dataset columns

                ### MCP Server Configuration

                ### Server Status

                - **MCP Tools**: 4 tools available
                - **Authentication**: To explore private or gated datasets, set `hf-api-token` in MCP configuration headers
                - **MCP Schema**: Available at `/gradio_api/mcp/schema`
                - **Cache Directory**: {config.cache_dir or "Default system cache"}
                - **Max Sample Size**: {config.max_sample_size:,}
                - **Request Timeout**: {config.request_timeout}s

                ### Documentation

                For full documentation, MCP client configuration, and local development instructions, see the [README](https://huggingface.co/spaces/MCP-1st-Birthday/hf-eda-mcp/blob/main/README.md).
                """
            )

    return app
def launch_server(
    config: Optional[ServerConfig] = None,
    port: Optional[int] = None,
    mcp_server: Optional[bool] = None,
    share: Optional[bool] = None,
) -> None:
    """
    Launch the Gradio app with MCP server enabled.

    The resolved configuration is registered globally via ``set_config`` so the
    tool functions can read it, then validated, logged, and used to build and
    launch the Gradio application. Any failure (invalid configuration, app
    construction error, launch error) terminates the process with exit code 1;
    a keyboard interrupt during launch exits with code 0.

    Args:
        config: Server configuration object. If None, loads from environment.
            Note that explicit overrides below are written onto this object.
        port: Port to run the server on (overrides config)
        mcp_server: Whether to enable MCP server functionality (overrides config)
        share: Whether to create a public shareable link (overrides config)
    """
    # Load configuration
    if config is None:
        config = ServerConfig.from_env()

    # Override config with explicit parameters
    if port is not None:
        config.port = port
    if mcp_server is not None:
        config.mcp_server = mcp_server
    if share is not None:
        config.share = share

    # Set global configuration for tools to use
    set_config(config)

    # Set up logging
    logger = setup_logging(config)
    logger.info("=" * 60)
    logger.info("🚀 Starting HuggingFace EDA MCP Server")
    logger.info("=" * 60)

    # Validate configuration
    try:
        validate_config(config)
    except Exception as e:
        logger.error(f"Configuration validation failed: {e}")
        sys.exit(1)

    # Never write the token itself to the logs: it is a credential. Only
    # report whether one is present.
    logger.info(f"HF token configured: {'yes' if config.hf_token else 'no'}")

    # Log configuration
    logger.info("Server configuration:")
    logger.info(f" - Host: {config.host}")
    logger.info(f" - Port: {config.port}")
    logger.info(f" - MCP server enabled: {config.mcp_server}")
    logger.info(f" - Share enabled: {config.share}")
    logger.info(f" - Log level: {config.log_level}")
    logger.info(f" - Cache directory: {config.cache_dir or 'Default system cache'}")
    logger.info(f" - Max sample size: {config.max_sample_size:,}")
    logger.info(f" - Request timeout: {config.request_timeout}s")
    logger.info(f" - Max concurrent requests: {config.max_concurrent_requests}")

    # Create the Gradio app
    try:
        logger.info("Creating Gradio application with EDA tools...")
        app = create_gradio_app(config)
        logger.info("✅ Gradio application created successfully")
    except Exception as e:
        logger.error(f"Failed to create Gradio application: {e}")
        logger.exception("Full traceback:")
        sys.exit(1)

    # Configure launch parameters
    launch_kwargs = {
        "server_name": config.host,
        "server_port": config.port,
        "share": config.share,
        "show_error": True,
        "quiet": False,
        "footer_links": ["api", "gradio", "settings"],
        "theme": gr.themes.Soft(),
        "css": """
        .gradio-container {
            max-width: 1200px !important;
        }
        """,
        "max_threads": config.max_concurrent_requests,
    }

    # Add additional Gradio settings from config; applied after the defaults
    # above so config-provided values take precedence.
    launch_kwargs.update(config.gradio_settings)

    # Add MCP server configuration
    if config.mcp_server:
        launch_kwargs["mcp_server"] = True
        logger.info("🔗 MCP server functionality enabled")
        logger.info("📊 MCP tools available:")
        logger.info(" - get_dataset_metadata: Retrieve dataset information")
        logger.info(" - get_dataset_sample: Sample data from datasets")
        logger.info(" - analyze_dataset_features: Perform EDA analysis")
        logger.info(" - search_text_in_dataset: Search for text in datasets")
        logger.info(
            f"🌐 MCP schema available at: http://{config.host}:{config.port}/gradio_api/mcp/schema"
        )
    else:
        logger.info("🌐 Running in web-only mode (MCP disabled)")

    # Launch the server
    try:
        logger.info("🚀 Launching Gradio application...")
        logger.info(f"🌐 Web interface: http://{config.host}:{config.port}")
        if config.share:
            logger.info("🌐 Public sharing enabled - shareable link will be generated")
        logger.info("=" * 60)
        logger.info("Server is starting... Press Ctrl+C to stop")
        logger.info("=" * 60)
        app.launch(**launch_kwargs)
    except KeyboardInterrupt:
        logger.info("👋 Server stopped by user (Ctrl+C)")
        sys.exit(0)
    except OSError as e:
        # Distinguish the common "port taken" case so the user gets an
        # actionable hint instead of a generic network error.
        if "Address already in use" in str(e):
            logger.error(f"❌ Port {config.port} is already in use")
            logger.info(
                "💡 Try using a different port with --port or HF_EDA_PORT environment variable"
            )
        else:
            logger.error(f"❌ Network error: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"❌ Failed to launch server: {e}")
        logger.exception("Full traceback:")
        sys.exit(1)