# dataview-mcp / app.py
# efecelik's picture
# Initial release: DataView MCP - HuggingFace Dataset Explorer
# b67578f
"""
DataView MCP - A comprehensive MCP server for exploring Hugging Face datasets.
This MCP server provides 10 tools for searching, sampling, profiling, and
discovering datasets on the Hugging Face Hub.
Tools:
1. search_datasets - Find datasets by keyword, task, or domain
2. search_by_columns - Find datasets with specific column names
3. get_dataset_info - Get detailed metadata and README
4. get_schema - Get column names and data types
5. sample_rows - Get actual data samples
6. get_statistics - Compute column statistics
7. profile_quality - Assess data quality issues
8. find_similar - Find similar datasets
9. suggest_tasks - Suggest ML tasks for a dataset
10. compare_datasets - Compare two datasets side-by-side
Usage:
# Run locally
python app.py
# Or with Gradio CLI
gradio app.py
# Connect via MCP
Add to your MCP client config:
{
"mcpServers": {
"dataview": {
"url": "http://localhost:7860/gradio_api/mcp/sse"
}
}
}
"""
import gradio as gr
from typing import Optional, List
# Import all tools
from tools.search import search_datasets, search_by_columns
from tools.metadata import get_dataset_info, get_schema
from tools.sampling import sample_rows
from tools.profiling import get_statistics, profile_quality
from tools.discovery import find_similar, suggest_tasks, compare_datasets
# Build the Gradio UI that wraps the tool functions imported above.
# Note: when launched with mcp_server=True, Gradio exposes the function behind
# each event listener as an MCP tool.
def create_demo() -> gr.Blocks:
    """Build the Gradio Blocks app that fronts the DataView tool functions.

    The UI has one tab per tool family (Search, Dataset Info, Sample Data,
    Analysis, Discovery, Compare). Every button is bound to one of the tool
    functions imported at the top of this file; the result is rendered into
    the Markdown panel on the right-hand side of the tab.

    ``search_by_columns`` has no tab of its own, so it is registered through
    ``gr.api`` as a UI-less endpoint. Without that registration only nine of
    the ten advertised tools would be reachable.

    Returns:
        The assembled ``gr.Blocks`` instance. It is not launched here.
    """
    with gr.Blocks(
        title="DataView MCP - HuggingFace Dataset Explorer",
        theme=gr.themes.Soft()
    ) as demo:
        gr.Markdown("""
        # DataView MCP
        ## Explore Hugging Face Datasets with AI
        This MCP server provides tools for AI assistants to explore, analyze, and
        understand datasets on the Hugging Face Hub.
        **10 Tools Available:**
        - Search & Discovery: `search_datasets`, `search_by_columns`, `find_similar`
        - Metadata: `get_dataset_info`, `get_schema`
        - Data Access: `sample_rows`
        - Analysis: `get_statistics`, `profile_quality`
        - Intelligence: `suggest_tasks`, `compare_datasets`
        ---
        ### Try the tools below or connect via MCP
        """)

        with gr.Tabs():
            # Search Tab
            with gr.Tab("Search"):
                with gr.Row():
                    with gr.Column():
                        search_query = gr.Textbox(
                            label="Search Query",
                            placeholder="e.g., sentiment analysis, medical imaging"
                        )
                        search_limit = gr.Slider(1, 50, value=10, step=1, label="Max Results")
                        # The leading None entry lets the user leave the task
                        # filter unset.
                        search_task = gr.Dropdown(
                            choices=[
                                None, "text-classification", "question-answering",
                                "summarization", "translation", "image-classification",
                                "object-detection", "text-generation"
                            ],
                            label="Filter by Task (optional)"
                        )
                        search_btn = gr.Button("Search Datasets", variant="primary")
                    with gr.Column():
                        search_output = gr.Markdown(label="Results")

                search_btn.click(
                    search_datasets,
                    inputs=[search_query, search_limit, search_task],
                    outputs=search_output
                )

            # Dataset Info Tab
            with gr.Tab("Dataset Info"):
                with gr.Row():
                    with gr.Column():
                        info_dataset_id = gr.Textbox(
                            label="Dataset ID",
                            placeholder="e.g., imdb, squad, huggingface/documentation-images"
                        )
                        info_btn = gr.Button("Get Info", variant="primary")
                        schema_btn = gr.Button("Get Schema")
                    with gr.Column():
                        info_output = gr.Markdown(label="Dataset Info")

                # Both buttons share the same output panel; the latest click wins.
                info_btn.click(get_dataset_info, inputs=[info_dataset_id], outputs=info_output)
                schema_btn.click(get_schema, inputs=[info_dataset_id], outputs=info_output)

            # Sample Data Tab
            with gr.Tab("Sample Data"):
                with gr.Row():
                    with gr.Column():
                        sample_dataset_id = gr.Textbox(
                            label="Dataset ID",
                            placeholder="e.g., imdb"
                        )
                        sample_n_rows = gr.Slider(1, 20, value=5, step=1, label="Number of Rows")
                        sample_split = gr.Dropdown(
                            choices=["train", "test", "validation"],
                            value="train",
                            label="Split"
                        )
                        sample_btn = gr.Button("Get Sample", variant="primary")
                    with gr.Column():
                        sample_output = gr.Markdown(label="Sample Data")

                # The third positional argument of sample_rows is pinned to None
                # from the UI -- presumably the dataset config; confirm against
                # tools.sampling.
                sample_btn.click(
                    sample_rows,
                    inputs=[sample_dataset_id, sample_n_rows, gr.State(None), sample_split],
                    outputs=sample_output
                )

            # Analysis Tab
            with gr.Tab("Analysis"):
                with gr.Row():
                    with gr.Column():
                        analysis_dataset_id = gr.Textbox(
                            label="Dataset ID",
                            placeholder="e.g., imdb"
                        )
                        analysis_sample_size = gr.Slider(
                            100, 2000, value=500, step=100,
                            label="Sample Size for Analysis"
                        )
                        stats_btn = gr.Button("Get Statistics", variant="primary")
                        quality_btn = gr.Button("Profile Quality")
                    with gr.Column():
                        analysis_output = gr.Markdown(label="Analysis Results")

                # The UI always passes None and "train" as the second and third
                # positional arguments of both analysis tools -- presumably
                # config and split; confirm against tools.profiling.
                stats_btn.click(
                    get_statistics,
                    inputs=[
                        analysis_dataset_id, gr.State(None), gr.State("train"),
                        analysis_sample_size
                    ],
                    outputs=analysis_output
                )
                quality_btn.click(
                    profile_quality,
                    inputs=[
                        analysis_dataset_id, gr.State(None), gr.State("train"),
                        analysis_sample_size
                    ],
                    outputs=analysis_output
                )

            # Discovery Tab
            with gr.Tab("Discovery"):
                with gr.Row():
                    with gr.Column():
                        discovery_dataset_id = gr.Textbox(
                            label="Dataset ID",
                            placeholder="e.g., imdb"
                        )
                        discovery_top_k = gr.Slider(1, 10, value=5, step=1, label="Number of Results")
                        similar_btn = gr.Button("Find Similar", variant="primary")
                        suggest_btn = gr.Button("Suggest Tasks")
                    with gr.Column():
                        discovery_output = gr.Markdown(label="Discovery Results")

                similar_btn.click(
                    find_similar,
                    inputs=[discovery_dataset_id, discovery_top_k],
                    outputs=discovery_output
                )
                suggest_btn.click(
                    suggest_tasks,
                    inputs=[discovery_dataset_id],
                    outputs=discovery_output
                )

            # Compare Tab
            with gr.Tab("Compare"):
                with gr.Row():
                    with gr.Column():
                        compare_dataset_a = gr.Textbox(
                            label="Dataset A",
                            placeholder="e.g., imdb"
                        )
                        compare_dataset_b = gr.Textbox(
                            label="Dataset B",
                            placeholder="e.g., rotten_tomatoes"
                        )
                        compare_btn = gr.Button("Compare Datasets", variant="primary")
                    with gr.Column():
                        compare_output = gr.Markdown(label="Comparison Results")

                compare_btn.click(
                    compare_datasets,
                    inputs=[compare_dataset_a, compare_dataset_b],
                    outputs=compare_output
                )

        # search_by_columns is advertised as one of the 10 tools but is not
        # bound to any button above. Register it as a UI-less endpoint so it
        # is exposed alongside the other nine; gr.api takes the parameter
        # schema from the function's own signature.
        gr.api(search_by_columns)

        gr.Markdown("""
        ---
        ### MCP Connection
        To use with Claude or other MCP clients, add this to your config:
        ```json
        {
        "mcpServers": {
        "dataview": {
        "url": "https://YOUR-SPACE.hf.space/gradio_api/mcp/sse"
        }
        }
        }
        ```
        ---
        Built with Gradio MCP
        """)

    return demo
# Build the app once at import time so that `demo` exists as a module-level
# attribute whether this file is executed directly or merely imported.
demo = create_demo()

# Start the server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on every network interface
        server_port=7860,
        share=False,  # no public share link
        mcp_server=True,  # serve the MCP endpoint alongside the web UI
    )