""" DataView MCP - A comprehensive MCP server for exploring Hugging Face datasets. This MCP server provides 10 tools for searching, sampling, profiling, and discovering datasets on the Hugging Face Hub. Tools: 1. search_datasets - Find datasets by keyword, task, or domain 2. search_by_columns - Find datasets with specific column names 3. get_dataset_info - Get detailed metadata and README 4. get_schema - Get column names and data types 5. sample_rows - Get actual data samples 6. get_statistics - Compute column statistics 7. profile_quality - Assess data quality issues 8. find_similar - Find similar datasets 9. suggest_tasks - Suggest ML tasks for a dataset 10. compare_datasets - Compare two datasets side-by-side Usage: # Run locally python app.py # Or with Gradio CLI gradio app.py # Connect via MCP Add to your MCP client config: { "mcpServers": { "dataview": { "url": "http://localhost:7860/gradio_api/mcp/sse" } } } """ import gradio as gr from typing import Optional, List # Import all tools from tools.search import search_datasets, search_by_columns from tools.metadata import get_dataset_info, get_schema from tools.sampling import sample_rows from tools.profiling import get_statistics, profile_quality from tools.discovery import find_similar, suggest_tasks, compare_datasets # Create Gradio interfaces for each tool # Note: Gradio will automatically convert these to MCP tools def create_demo(): """Create the Gradio demo with all tools.""" with gr.Blocks( title="DataView MCP - HuggingFace Dataset Explorer", theme=gr.themes.Soft() ) as demo: gr.Markdown(""" # DataView MCP ## Explore Hugging Face Datasets with AI This MCP server provides tools for AI assistants to explore, analyze, and understand datasets on the Hugging Face Hub. **10 Tools Available:** - Search & Discovery: `search_datasets`, `search_by_columns`, `find_similar` - Metadata: `get_dataset_info`, `get_schema` - Data Access: `sample_rows` - Analysis: `get_statistics`, `profile_quality` - Intelligence: `suggest_tasks`, `compare_datasets` --- ### Try the tools below or connect via MCP """) with gr.Tabs(): # Search Tab with gr.Tab("Search"): with gr.Row(): with gr.Column(): search_query = gr.Textbox( label="Search Query", placeholder="e.g., sentiment analysis, medical imaging" ) search_limit = gr.Slider(1, 50, value=10, step=1, label="Max Results") search_task = gr.Dropdown( choices=[ None, "text-classification", "question-answering", "summarization", "translation", "image-classification", "object-detection", "text-generation" ], label="Filter by Task (optional)" ) search_btn = gr.Button("Search Datasets", variant="primary") with gr.Column(): search_output = gr.Markdown(label="Results") search_btn.click( search_datasets, inputs=[search_query, search_limit, search_task], outputs=search_output ) # Dataset Info Tab with gr.Tab("Dataset Info"): with gr.Row(): with gr.Column(): info_dataset_id = gr.Textbox( label="Dataset ID", placeholder="e.g., imdb, squad, huggingface/documentation-images" ) info_btn = gr.Button("Get Info", variant="primary") schema_btn = gr.Button("Get Schema") with gr.Column(): info_output = gr.Markdown(label="Dataset Info") info_btn.click(get_dataset_info, inputs=[info_dataset_id], outputs=info_output) schema_btn.click(get_schema, inputs=[info_dataset_id], outputs=info_output) # Sample Data Tab with gr.Tab("Sample Data"): with gr.Row(): with gr.Column(): sample_dataset_id = gr.Textbox( label="Dataset ID", placeholder="e.g., imdb" ) sample_n_rows = gr.Slider(1, 20, value=5, step=1, label="Number of Rows") sample_split = gr.Dropdown( choices=["train", "test", "validation"], value="train", label="Split" ) sample_btn = gr.Button("Get Sample", variant="primary") with gr.Column(): sample_output = gr.Markdown(label="Sample Data") sample_btn.click( sample_rows, inputs=[sample_dataset_id, sample_n_rows, gr.State(None), sample_split], outputs=sample_output ) # Analysis Tab with gr.Tab("Analysis"): with gr.Row(): with gr.Column(): analysis_dataset_id = gr.Textbox( label="Dataset ID", placeholder="e.g., imdb" ) analysis_sample_size = gr.Slider( 100, 2000, value=500, step=100, label="Sample Size for Analysis" ) stats_btn = gr.Button("Get Statistics", variant="primary") quality_btn = gr.Button("Profile Quality") with gr.Column(): analysis_output = gr.Markdown(label="Analysis Results") stats_btn.click( get_statistics, inputs=[analysis_dataset_id, gr.State(None), gr.State("train"), analysis_sample_size], outputs=analysis_output ) quality_btn.click( profile_quality, inputs=[analysis_dataset_id, gr.State(None), gr.State("train"), analysis_sample_size], outputs=analysis_output ) # Discovery Tab with gr.Tab("Discovery"): with gr.Row(): with gr.Column(): discovery_dataset_id = gr.Textbox( label="Dataset ID", placeholder="e.g., imdb" ) discovery_top_k = gr.Slider(1, 10, value=5, step=1, label="Number of Results") similar_btn = gr.Button("Find Similar", variant="primary") suggest_btn = gr.Button("Suggest Tasks") with gr.Column(): discovery_output = gr.Markdown(label="Discovery Results") similar_btn.click( find_similar, inputs=[discovery_dataset_id, discovery_top_k], outputs=discovery_output ) suggest_btn.click( suggest_tasks, inputs=[discovery_dataset_id], outputs=discovery_output ) # Compare Tab with gr.Tab("Compare"): with gr.Row(): with gr.Column(): compare_dataset_a = gr.Textbox( label="Dataset A", placeholder="e.g., imdb" ) compare_dataset_b = gr.Textbox( label="Dataset B", placeholder="e.g., rotten_tomatoes" ) compare_btn = gr.Button("Compare Datasets", variant="primary") with gr.Column(): compare_output = gr.Markdown(label="Comparison Results") compare_btn.click( compare_datasets, inputs=[compare_dataset_a, compare_dataset_b], outputs=compare_output ) gr.Markdown(""" --- ### MCP Connection To use with Claude or other MCP clients, add this to your config: ```json { "mcpServers": { "dataview": { "url": "https://YOUR-SPACE.hf.space/gradio_api/mcp/sse" } } } ``` --- Built with Gradio MCP """) return demo # Create the demo demo = create_demo() # Launch with MCP server enabled if __name__ == "__main__": demo.launch( mcp_server=True, share=False, server_name="0.0.0.0", server_port=7860 )