Spaces:
Running
Running
| """ | |
| Main Gradio application with MCP server functionality. | |
| This module provides the main entry point for the hf-eda-mcp server, | |
| creating Gradio interfaces for EDA tools and enabling MCP server functionality. | |
| """ | |
| import gradio as gr | |
| import sys | |
| from typing import Optional | |
| from hf_eda_mcp.tools.metadata import get_dataset_metadata | |
| from hf_eda_mcp.tools.sampling import get_dataset_sample | |
| from hf_eda_mcp.tools.analysis import analyze_dataset_features | |
| from hf_eda_mcp.tools.search import search_text_in_dataset | |
| from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config | |
def _dataset_id_box() -> gr.Textbox:
    """Build the `dataset_id` text input used by every tool form."""
    return gr.Textbox(
        label="dataset_id",
        placeholder="e.g., imdb, squad, glue",
        info="HuggingFace dataset identifier",
    )


def _split_dropdown(info: str) -> gr.Dropdown:
    """Build the `split` selector; values outside the preset list are accepted."""
    return gr.Dropdown(
        choices=["train", "validation", "test", "dev", "val"],
        value="train",
        label="split",
        info=info,
        allow_custom_value=True,
    )


def _config_name_box(
    placeholder: str = "e.g., cola, sst2 (optional)",
    info: str = "Configuration name for multi-config datasets",
) -> gr.Textbox:
    """Build the `config_name` text input; the hint texts can be overridden."""
    return gr.Textbox(label="config_name", placeholder=placeholder, info=info)


def _metadata_tab() -> None:
    """Add the tab wrapping `get_dataset_metadata`."""
    with gr.Tab("π Dataset Metadata"):
        form_inputs = [_dataset_id_box(), _config_name_box()]
        gr.Interface(
            fn=get_dataset_metadata,
            inputs=form_inputs,
            outputs=gr.JSON(label="Dataset Metadata"),
            title="Get Dataset Metadata",
            description="Retrieve comprehensive metadata for a HuggingFace dataset including size, features, splits, and configuration details.",
            examples=[
                ["imdb", ""],
                ["glue", "cola"],
                ["squad", ""],
                ["wikitext", "wikitext-2-raw-v1"],
            ],
        )


def _sampling_tab() -> None:
    """Add the tab wrapping `get_dataset_sample`."""
    with gr.Tab("π Dataset Sampling"):
        # Input order mirrors the positional order used by the example rows.
        form_inputs = [
            _dataset_id_box(),
            _split_dropdown("Dataset split to sample from"),
            gr.Slider(
                minimum=1,
                maximum=1000,
                value=10,
                step=1,
                label="num_samples",
                info="Number of samples to retrieve (max: 10000 for MCP)",
            ),
            _config_name_box(),
        ]
        gr.Interface(
            fn=get_dataset_sample,
            inputs=form_inputs,
            outputs=gr.JSON(label="Dataset Sample"),
            title="Get Dataset Sample",
            description="Retrieve a sample of rows from a HuggingFace dataset with support for different splits and configurable sample sizes.",
            examples=[
                ["imdb", "train", 5, ""],
                ["glue", "validation", 3, "cola"],
                ["squad", "train", 2, ""],
                ["wikitext", "test", 1, "wikitext-2-raw-v1"],
            ],
        )


def _analysis_tab() -> None:
    """Add the tab wrapping `analyze_dataset_features`."""
    with gr.Tab("π Feature Analysis"):
        form_inputs = [
            _dataset_id_box(),
            _split_dropdown("Dataset split to analyze"),
            gr.Slider(
                minimum=100,
                maximum=10000,
                value=1000,
                step=100,
                label="sample_size",
                info="Number of samples to use for analysis (max: 50000 for MCP)",
            ),
            _config_name_box(),
        ]
        gr.Interface(
            fn=analyze_dataset_features,
            inputs=form_inputs,
            outputs=gr.JSON(label="Analysis Results"),
            title="Analyze Dataset Features",
            description="Perform basic exploratory analysis on dataset features including statistics, missing values, and data quality assessment.",
            examples=[
                ["imdb", "train", 1000, ""],
                ["glue", "train", 500, "cola"],
                ["squad", "validation", 800, ""],
                ["wikitext", "train", 1200, "wikitext-2-raw-v1"],
            ],
        )


def _search_tab() -> None:
    """Add the tab wrapping `search_text_in_dataset`."""
    with gr.Tab("π Text Search"):
        form_inputs = [
            _dataset_id_box(),
            # This form labels the configuration name as required, so it
            # carries its own hint texts instead of the shared defaults.
            _config_name_box(
                placeholder="e.g., cola, sst2",
                info="Configuration name (required for search)",
            ),
            _split_dropdown("Dataset split to search in"),
            gr.Textbox(
                label="query",
                placeholder="Enter search query...",
                info="Text to search for in the dataset",
            ),
            gr.Slider(
                minimum=0,
                maximum=1000,
                value=0,
                step=10,
                label="offset",
                info="Offset for pagination",
            ),
            gr.Slider(
                minimum=1,
                maximum=100,
                value=10,
                step=1,
                label="length",
                info="Number of results to return",
            ),
        ]
        gr.Interface(
            fn=search_text_in_dataset,
            inputs=form_inputs,
            outputs=gr.JSON(label="Search Results"),
            title="Search Text in Dataset",
            description="Search for text in text columns of a dataset. Only text columns are searched and only parquet datasets are supported.",
            examples=[
                ["stanfordnlp/imdb", "plain_text", "train", "great movie", 0, 10],
                ["rajpurkar/squad", "plain_text", "train", "president", 0, 5],
                ["nyu-mll/glue", "cola", "train", "friends", 0, 10],
            ],
        )


def _about_tab(config: ServerConfig) -> None:
    """Add the static "About" tab, filled in with values read from `config`."""
    cache_dir = config.cache_dir or "Default system cache"
    max_sample_size = config.max_sample_size
    request_timeout = config.request_timeout
    with gr.Tab("βΉοΈ About"):
        gr.Markdown(
            f"""
            ## About HF EDA MCP Server
            This server implements the Model Context Protocol (MCP) to provide AI assistants
            with tools for exploring and analyzing HuggingFace datasets.
            ### Available MCP Tools
            1. **get_dataset_metadata**: Retrieve comprehensive dataset information
            2. **get_dataset_sample**: Sample data from datasets with configurable parameters
            3. **analyze_dataset_features**: Perform exploratory data analysis
            4. **search_text_in_dataset**: Search for text in dataset columns
            ### MCP Server Configuration
            ### Server Status
            - **MCP Tools**: 4 tools available
            - **Authentication**: To explore private or gated datasets, set `hf-api-token` in MCP configuration headers
            - **MCP Schema**: Available at `/gradio_api/mcp/schema`
            - **Cache Directory**: {cache_dir}
            - **Max Sample Size**: {max_sample_size:,}
            - **Request Timeout**: {request_timeout}s
            ### Documentation
            For full documentation, MCP client configuration, and local development instructions, see the [README](https://huggingface.co/spaces/MCP-1st-Birthday/hf-eda-mcp/blob/main/README.md).
            """
        )


def create_gradio_app(config: ServerConfig) -> gr.Blocks:
    """Assemble the Gradio Blocks UI: a header, link badges and one tab per tool.

    Args:
        config: Server configuration; its `cache_dir`, `max_sample_size` and
            `request_timeout` values are displayed in the "About" tab.

    Returns:
        The constructed (not yet launched) `gr.Blocks` application.
    """
    with gr.Blocks(title="HF EDA MCP Server") as app:
        gr.Markdown(
            """
            # π HuggingFace EDA MCP Server
            **MCP server for exploratory data analysis of HuggingFace datasets**
            This server provides four tools for dataset exploration that are automatically exposed as MCP tools.
            """
        )
        with gr.Row():
            gr.HTML(
                """
                <div style="display: flex; gap: 8px; justify-content: center; flex-wrap: wrap;">
                <a href="https://www.youtube.com/watch?v=XdP7zGSb81k" target="_blank">
                <img src="https://img.shields.io/badge/βΆοΈ_Demo_Video-FF0000?style=for-the-badge&logo=youtube&logoColor=white" alt="Demo Video">
                </a>
                <a href="https://www.linkedin.com/posts/khalil-guetari-00a61415a_mcp-server-for-huggingface-datasets-discovery-activity-7400587711838842880-2K8p" target="_blank">
                <img src="https://img.shields.io/badge/LinkedIn_Post-0A66C2?style=for-the-badge&logo=linkedin&logoColor=white" alt="LinkedIn Post">
                </a>
                <a href="https://huggingface.co/spaces/MCP-1st-Birthday/hf-eda-mcp/blob/main/README.md" target="_blank">
                <img src="https://img.shields.io/badge/π_README-FFD21E?style=for-the-badge" alt="README">
                </a>
                </div>
                """
            )
        # One gr.Interface per EDA tool; tabs are added in display order.
        _metadata_tab()
        _sampling_tab()
        _analysis_tab()
        _search_tab()
        _about_tab(config)
    return app
def launch_server(
    config: Optional[ServerConfig] = None,
    port: Optional[int] = None,
    mcp_server: Optional[bool] = None,
    share: Optional[bool] = None,
) -> None:
    """
    Launch the Gradio app with MCP server enabled.

    The configuration object is updated in place with any explicit overrides
    and then registered globally via ``set_config`` before the app is built.

    Args:
        config: Server configuration object. If None, loads from environment
        port: Port to run the server on (overrides config)
        mcp_server: Whether to enable MCP server functionality (overrides config)
        share: Whether to create a public shareable link (overrides config)

    Raises:
        SystemExit: With code 1 when configuration validation, app creation or
            the launch itself fails; with code 0 when interrupted by Ctrl+C.
    """
    # Function-scope import: only needed for the port-in-use check below.
    import errno

    # Load configuration
    if config is None:
        config = ServerConfig.from_env()

    # Explicit arguments take precedence over values already in the config.
    if port is not None:
        config.port = port
    if mcp_server is not None:
        config.mcp_server = mcp_server
    if share is not None:
        config.share = share

    # Set global configuration for tools to use
    set_config(config)

    # Set up logging
    logger = setup_logging(config)
    logger.info("=" * 60)
    logger.info("π Starting HuggingFace EDA MCP Server")
    logger.info("=" * 60)

    # Validate configuration
    try:
        validate_config(config)
    except Exception as e:
        logger.error(f"Configuration validation failed: {e}")
        sys.exit(1)

    # Never write the token itself to the logs: it is a credential. Report
    # only whether one is configured.
    logger.info(f"HF token configured: {'yes' if config.hf_token else 'no'}")

    # Log configuration
    logger.info("Server configuration:")
    logger.info(f" - Host: {config.host}")
    logger.info(f" - Port: {config.port}")
    logger.info(f" - MCP server enabled: {config.mcp_server}")
    logger.info(f" - Share enabled: {config.share}")
    logger.info(f" - Log level: {config.log_level}")
    logger.info(f" - Cache directory: {config.cache_dir or 'Default system cache'}")
    logger.info(f" - Max sample size: {config.max_sample_size:,}")
    logger.info(f" - Request timeout: {config.request_timeout}s")
    logger.info(f" - Max concurrent requests: {config.max_concurrent_requests}")

    # Create the Gradio app
    try:
        logger.info("Creating Gradio application with EDA tools...")
        app = create_gradio_app(config)
        logger.info("β Gradio application created successfully")
    except Exception as e:
        logger.error(f"Failed to create Gradio application: {e}")
        logger.exception("Full traceback:")
        sys.exit(1)

    # Configure launch parameters
    launch_kwargs = {
        "server_name": config.host,
        "server_port": config.port,
        "share": config.share,
        "show_error": True,
        "quiet": False,
        "footer_links": ["api", "gradio", "settings"],
        "theme": gr.themes.Soft(),
        "css": """
        .gradio-container {
        max-width: 1200px !important;
        }
        """,
        "max_threads": config.max_concurrent_requests,
    }

    # Settings from the config are applied last, so they override the
    # defaults assembled above.
    launch_kwargs.update(config.gradio_settings)

    # Add MCP server configuration
    if config.mcp_server:
        launch_kwargs["mcp_server"] = True
        logger.info("π MCP server functionality enabled")
        logger.info("π MCP tools available:")
        logger.info(" - get_dataset_metadata: Retrieve dataset information")
        logger.info(" - get_dataset_sample: Sample data from datasets")
        logger.info(" - analyze_dataset_features: Perform EDA analysis")
        logger.info(" - search_text_in_dataset: Search for text in datasets")
        logger.info(
            f"π MCP schema available at: http://{config.host}:{config.port}/gradio_api/mcp/schema"
        )
    else:
        logger.info("π Running in web-only mode (MCP disabled)")

    # Launch the server
    try:
        logger.info("π Launching Gradio application...")
        logger.info(f"π Web interface: http://{config.host}:{config.port}")
        if config.share:
            logger.info("π Public sharing enabled - shareable link will be generated")
        logger.info("=" * 60)
        logger.info("Server is starting... Press Ctrl+C to stop")
        logger.info("=" * 60)
        app.launch(**launch_kwargs)
    except KeyboardInterrupt:
        logger.info("π Server stopped by user (Ctrl+C)")
        sys.exit(0)
    except OSError as e:
        # Check the errno first: the message text varies by platform and
        # locale. The substring match is kept as a fallback for errors that
        # carry no errno.
        port_in_use = (
            e.errno == errno.EADDRINUSE or "Address already in use" in str(e)
        )
        if port_in_use:
            logger.error(f"β Port {config.port} is already in use")
            logger.info(
                "π‘ Try using a different port with --port or HF_EDA_PORT environment variable"
            )
        else:
            logger.error(f"β Network error: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"β Failed to launch server: {e}")
        logger.exception("Full traceback:")
        sys.exit(1)