"""
Main Gradio application with MCP server functionality.

This module provides the main entry point for the hf-eda-mcp server,
creating Gradio interfaces for EDA tools and enabling MCP server functionality.
"""

import gradio as gr
import sys
from typing import Optional
from hf_eda_mcp.tools.metadata import get_dataset_metadata
from hf_eda_mcp.tools.sampling import get_dataset_sample
from hf_eda_mcp.tools.analysis import analyze_dataset_features
from hf_eda_mcp.tools.search import search_text_in_dataset
from hf_eda_mcp.config import ServerConfig, setup_logging, validate_config, set_config


def create_gradio_app(config: ServerConfig) -> gr.Blocks:
    """Create and configure the main Gradio application with MCP server.

    Builds a ``gr.Blocks`` layout with one tab per EDA tool (metadata,
    sampling, feature analysis, text search) plus an "About" tab. Each tool
    tab wraps the corresponding function in a ``gr.Interface``.

    Args:
        config: Server configuration. Only ``cache_dir``, ``max_sample_size``
            and ``request_timeout`` are read here, to render the "About" tab.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` application.
    """
    # Create main app with MCP tool interfaces
    with gr.Blocks(
        title="HF EDA MCP Server",
    ) as app:
        gr.Markdown(
            """
            # đ HuggingFace EDA MCP Server

            **MCP server for exploratory data analysis of HuggingFace datasets**

            This server provides four tools for dataset exploration that are automatically exposed as MCP tools.
            """
        )

        with gr.Row():
            # NOTE(review): the HTML payload is whitespace-only in the reviewed
            # source -- confirm against the original file that nothing was lost.
            gr.HTML(
                """
                """
            )

        # Create interfaces for each EDA tool - these will be automatically exposed as MCP tools
        with gr.Tab("đ Dataset Metadata"):
            gr.Interface(
                fn=get_dataset_metadata,
                inputs=[
                    gr.Textbox(
                        label="dataset_id",
                        placeholder="e.g., imdb, squad, glue",
                        info="HuggingFace dataset identifier",
                    ),
                    gr.Textbox(
                        label="config_name",
                        placeholder="e.g., cola, sst2 (optional)",
                        info="Configuration name for multi-config datasets",
                    ),
                ],
                outputs=gr.JSON(label="Dataset Metadata"),
                title="Get Dataset Metadata",
                description="Retrieve comprehensive metadata for a HuggingFace dataset including size, features, splits, and configuration details.",
                examples=[
                    ["imdb", ""],
                    ["glue", "cola"],
                    ["squad", ""],
                    ["wikitext", "wikitext-2-raw-v1"],
                ],
            )

        with gr.Tab("đ Dataset Sampling"):
            gr.Interface(
                fn=get_dataset_sample,
                inputs=[
                    gr.Textbox(
                        label="dataset_id",
                        placeholder="e.g., imdb, squad, glue",
                        info="HuggingFace dataset identifier",
                    ),
                    gr.Dropdown(
                        choices=["train", "validation", "test", "dev", "val"],
                        value="train",
                        label="split",
                        info="Dataset split to sample from",
                        allow_custom_value=True,
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=1000,
                        value=10,
                        step=1,
                        label="num_samples",
                        info="Number of samples to retrieve (max: 10000 for MCP)",
                    ),
                    gr.Textbox(
                        label="config_name",
                        placeholder="e.g., cola, sst2 (optional)",
                        info="Configuration name for multi-config datasets",
                    ),
                ],
                outputs=gr.JSON(label="Dataset Sample"),
                title="Get Dataset Sample",
                description="Retrieve a sample of rows from a HuggingFace dataset with support for different splits and configurable sample sizes.",
                examples=[
                    ["imdb", "train", 5, ""],
                    ["glue", "validation", 3, "cola"],
                    ["squad", "train", 2, ""],
                    ["wikitext", "test", 1, "wikitext-2-raw-v1"],
                ],
            )

        with gr.Tab("đ Feature Analysis"):
            gr.Interface(
                fn=analyze_dataset_features,
                inputs=[
                    gr.Textbox(
                        label="dataset_id",
                        placeholder="e.g., imdb, squad, glue",
                        info="HuggingFace dataset identifier",
                    ),
                    gr.Dropdown(
                        choices=["train", "validation", "test", "dev", "val"],
                        value="train",
                        label="split",
                        info="Dataset split to analyze",
                        allow_custom_value=True,
                    ),
                    gr.Slider(
                        minimum=100,
                        maximum=10000,
                        value=1000,
                        step=100,
                        label="sample_size",
                        info="Number of samples to use for analysis (max: 50000 for MCP)",
                    ),
                    gr.Textbox(
                        label="config_name",
                        placeholder="e.g., cola, sst2 (optional)",
                        info="Configuration name for multi-config datasets",
                    ),
                ],
                outputs=gr.JSON(label="Analysis Results"),
                title="Analyze Dataset Features",
                description="Perform basic exploratory analysis on dataset features including statistics, missing values, and data quality assessment.",
                examples=[
                    ["imdb", "train", 1000, ""],
                    ["glue", "train", 500, "cola"],
                    ["squad", "validation", 800, ""],
                    ["wikitext", "train", 1200, "wikitext-2-raw-v1"],
                ],
            )

        with gr.Tab("đ Text Search"):
            gr.Interface(
                fn=search_text_in_dataset,
                inputs=[
                    gr.Textbox(
                        label="dataset_id",
                        placeholder="e.g., imdb, squad, glue",
                        info="HuggingFace dataset identifier",
                    ),
                    gr.Textbox(
                        label="config_name",
                        placeholder="e.g., cola, sst2",
                        info="Configuration name (required for search)",
                    ),
                    gr.Dropdown(
                        choices=["train", "validation", "test", "dev", "val"],
                        value="train",
                        label="split",
                        info="Dataset split to search in",
                        allow_custom_value=True,
                    ),
                    gr.Textbox(
                        label="query",
                        placeholder="Enter search query...",
                        info="Text to search for in the dataset",
                    ),
                    gr.Slider(
                        minimum=0,
                        maximum=1000,
                        value=0,
                        step=10,
                        label="offset",
                        info="Offset for pagination",
                    ),
                    gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=10,
                        step=1,
                        label="length",
                        info="Number of results to return",
                    ),
                ],
                outputs=gr.JSON(label="Search Results"),
                title="Search Text in Dataset",
                description="Search for text in text columns of a dataset. Only text columns are searched and only parquet datasets are supported.",
                examples=[
                    ["stanfordnlp/imdb", "plain_text", "train", "great movie", 0, 10],
                    ["rajpurkar/squad", "plain_text", "train", "president", 0, 5],
                    ["nyu-mll/glue", "cola", "train", "friends", 0, 10],
                ],
            )

        with gr.Tab("âšī¸ About"):
            gr.Markdown(
                f"""
                ## About HF EDA MCP Server

                This server implements the Model Context Protocol (MCP) to provide AI assistants with tools for exploring and analyzing HuggingFace datasets.

                ### Available MCP Tools

                1. **get_dataset_metadata**: Retrieve comprehensive dataset information
                2. **get_dataset_sample**: Sample data from datasets with configurable parameters
                3. **analyze_dataset_features**: Perform exploratory data analysis
                4. **search_text_in_dataset**: Search for text in dataset columns

                ### MCP Server Configuration

                ### Server Status

                - **MCP Tools**: 4 tools available
                - **Authentication**: To explore private or gated datasets, set `hf-api-token` in MCP configuration headers
                - **MCP Schema**: Available at `/gradio_api/mcp/schema`
                - **Cache Directory**: {config.cache_dir or "Default system cache"}
                - **Max Sample Size**: {config.max_sample_size:,}
                - **Request Timeout**: {config.request_timeout}s

                ### Documentation

                For full documentation, MCP client configuration, and local development instructions, see the [README](https://huggingface.co/spaces/MCP-1st-Birthday/hf-eda-mcp/blob/main/README.md).
                """
            )

    return app


def launch_server(
    config: Optional[ServerConfig] = None,
    port: Optional[int] = None,
    mcp_server: Optional[bool] = None,
    share: Optional[bool] = None,
) -> None:
    """
    Launch the Gradio app with MCP server enabled.

    The resolved configuration is registered globally via ``set_config`` so
    the tool functions can read it, then validated, logged, and used to build
    and launch the Gradio application. Explicit ``port`` / ``mcp_server`` /
    ``share`` arguments are written onto the ``config`` object itself, so a
    caller-supplied config is mutated.

    Args:
        config: Server configuration object. If None, loads from environment
        port: Port to run the server on (overrides config)
        mcp_server: Whether to enable MCP server functionality (overrides config)
        share: Whether to create a public shareable link (overrides config)

    Raises:
        SystemExit: With code 1 when configuration validation, app creation,
            or launch fails; with code 0 when interrupted by Ctrl+C.
    """
    # Load configuration
    if config is None:
        config = ServerConfig.from_env()

    # Override config with explicit parameters
    if port is not None:
        config.port = port
    if mcp_server is not None:
        config.mcp_server = mcp_server
    if share is not None:
        config.share = share

    # Set global configuration for tools to use
    set_config(config)

    # Set up logging
    logger = setup_logging(config)

    logger.info("=" * 60)
    logger.info("đ Starting HuggingFace EDA MCP Server")
    logger.info("=" * 60)

    # Validate configuration
    try:
        validate_config(config)
    except Exception as e:
        logger.error(f"Configuration validation failed: {e}")
        sys.exit(1)

    # Never write the token itself to the logs: log files are routinely
    # shared or shipped to aggregators, which would leak the credential.
    # Only report whether one is present.
    logger.info(f"HF token: {'configured' if config.hf_token else 'not set'}")

    # Log configuration
    logger.info("Server configuration:")
    logger.info(f" - Host: {config.host}")
    logger.info(f" - Port: {config.port}")
    logger.info(f" - MCP server enabled: {config.mcp_server}")
    logger.info(f" - Share enabled: {config.share}")
    logger.info(f" - Log level: {config.log_level}")
    logger.info(f" - Cache directory: {config.cache_dir or 'Default system cache'}")
    logger.info(f" - Max sample size: {config.max_sample_size:,}")
    logger.info(f" - Request timeout: {config.request_timeout}s")
    logger.info(f" - Max concurrent requests: {config.max_concurrent_requests}")

    # Create the Gradio app
    try:
        logger.info("Creating Gradio application with EDA tools...")
        app = create_gradio_app(config)
        logger.info("â Gradio application created successfully")
    except Exception as e:
        logger.error(f"Failed to create Gradio application: {e}")
        logger.exception("Full traceback:")
        sys.exit(1)

    # Configure launch parameters
    launch_kwargs = {
        "server_name": config.host,
        "server_port": config.port,
        "share": config.share,
        "show_error": True,
        "quiet": False,
        "footer_links": ["api", "gradio", "settings"],
        "theme": gr.themes.Soft(),
        "css": """
        .gradio-container {
            max-width: 1200px !important;
        }
        """,
        "max_threads": config.max_concurrent_requests,
    }

    # Add additional Gradio settings from config; applied after the defaults
    # above, so config entries win on key collisions.
    launch_kwargs.update(config.gradio_settings)

    # Add MCP server configuration
    if config.mcp_server:
        launch_kwargs["mcp_server"] = True
        logger.info("đ MCP server functionality enabled")
        logger.info("đ MCP tools available:")
        logger.info(" - get_dataset_metadata: Retrieve dataset information")
        logger.info(" - get_dataset_sample: Sample data from datasets")
        logger.info(" - analyze_dataset_features: Perform EDA analysis")
        logger.info(" - search_text_in_dataset: Search for text in datasets")
        logger.info(
            f"đ MCP schema available at: http://{config.host}:{config.port}/gradio_api/mcp/schema"
        )
    else:
        logger.info("đ Running in web-only mode (MCP disabled)")

    # Launch the server
    try:
        logger.info("đ Launching Gradio application...")
        logger.info(f"đ Web interface: http://{config.host}:{config.port}")

        if config.share:
            logger.info("đ Public sharing enabled - shareable link will be generated")

        logger.info("=" * 60)
        logger.info("Server is starting... Press Ctrl+C to stop")
        logger.info("=" * 60)

        app.launch(**launch_kwargs)

    except KeyboardInterrupt:
        logger.info("đ Server stopped by user (Ctrl+C)")
        sys.exit(0)
    except OSError as e:
        # Distinguish the common "port busy" case so the user gets a hint.
        if "Address already in use" in str(e):
            logger.error(f"â Port {config.port} is already in use")
            logger.info(
                "đĄ Try using a different port with --port or HF_EDA_PORT environment variable"
            )
        else:
            logger.error(f"â Network error: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"â Failed to launch server: {e}")
        logger.exception("Full traceback:")
        sys.exit(1)