Spaces:
Running
Running
Initial release: DataView MCP - HuggingFace Dataset Explorer
Browse filesFeatures 10 MCP tools:
- search_datasets, search_by_columns
- get_dataset_info, get_schema
- sample_rows
- get_statistics, profile_quality
- find_similar, suggest_tasks, compare_datasets
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- .env.example +3 -0
- .gitignore +18 -0
- README.md +183 -6
- app.py +252 -0
- requirements.txt +7 -0
- tools/__init__.py +20 -0
- tools/discovery.py +384 -0
- tools/metadata.py +99 -0
- tools/profiling.py +283 -0
- tools/sampling.py +51 -0
- tools/search.py +116 -0
- utils/__init__.py +28 -0
- utils/formatting.py +199 -0
- utils/hf_client.py +164 -0
.env.example
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face API Token (optional, but recommended for higher rate limits)
|
| 2 |
+
# Get yours at: https://huggingface.co/settings/tokens
|
| 3 |
+
HF_TOKEN=your_token_here
|
.gitignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment
|
| 2 |
+
.env
|
| 3 |
+
.venv/
|
| 4 |
+
venv/
|
| 5 |
+
__pycache__/
|
| 6 |
+
*.pyc
|
| 7 |
+
|
| 8 |
+
# IDE
|
| 9 |
+
.idea/
|
| 10 |
+
.vscode/
|
| 11 |
+
*.swp
|
| 12 |
+
|
| 13 |
+
# OS
|
| 14 |
+
.DS_Store
|
| 15 |
+
Thumbs.db
|
| 16 |
+
|
| 17 |
+
# Gradio
|
| 18 |
+
flagged/
|
README.md
CHANGED
|
@@ -1,12 +1,189 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: DataView MCP
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.0.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
tags:
|
| 12 |
+
- mcp
|
| 13 |
+
- datasets
|
| 14 |
+
- huggingface
|
| 15 |
+
- exploration
|
| 16 |
+
- gradio
|
| 17 |
---
|
| 18 |
|
| 19 |
+
# DataView MCP 🔍
|
| 20 |
+
|
| 21 |
+
A comprehensive **Model Context Protocol (MCP) server** for exploring Hugging Face datasets. Give your AI assistant the power to search, sample, analyze, and discover datasets on the Hub.
|
| 22 |
+
|
| 23 |
+
## Features
|
| 24 |
+
|
| 25 |
+
| Tool | Description |
|
| 26 |
+
|------|-------------|
|
| 27 |
+
| `search_datasets` | Find datasets by keyword, task, or domain |
|
| 28 |
+
| `search_by_columns` | Find datasets with specific column names |
|
| 29 |
+
| `get_dataset_info` | Get detailed metadata and README |
|
| 30 |
+
| `get_schema` | Get column names and data types |
|
| 31 |
+
| `sample_rows` | Get actual data samples |
|
| 32 |
+
| `get_statistics` | Compute column statistics |
|
| 33 |
+
| `profile_quality` | Assess data quality issues |
|
| 34 |
+
| `find_similar` | Find similar datasets |
|
| 35 |
+
| `suggest_tasks` | Suggest ML tasks for a dataset |
|
| 36 |
+
| `compare_datasets` | Compare two datasets side-by-side |
|
| 37 |
+
|
| 38 |
+
## Quick Start
|
| 39 |
+
|
| 40 |
+
### Use with Claude Desktop
|
| 41 |
+
|
| 42 |
+
Add to your `claude_desktop_config.json`:
|
| 43 |
+
|
| 44 |
+
```json
|
| 45 |
+
{
|
| 46 |
+
"mcpServers": {
|
| 47 |
+
"dataview": {
|
| 48 |
+
"url": "https://efecelik-dataview-mcp.hf.space/gradio_api/mcp/sse"
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
}
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
### Use with Claude Code
|
| 55 |
+
|
| 56 |
+
Add to your MCP settings:
|
| 57 |
+
|
| 58 |
+
```json
|
| 59 |
+
{
|
| 60 |
+
"mcpServers": {
|
| 61 |
+
"dataview": {
|
| 62 |
+
"command": "npx",
|
| 63 |
+
"args": ["mcp-remote", "https://efecelik-dataview-mcp.hf.space/gradio_api/mcp/sse"]
|
| 64 |
+
}
|
| 65 |
+
}
|
| 66 |
+
}
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
### Run Locally
|
| 70 |
+
|
| 71 |
+
```bash
|
| 72 |
+
# Clone the repository
|
| 73 |
+
git clone https://huggingface.co/spaces/efecelik/dataview-mcp
|
| 74 |
+
cd dataview-mcp
|
| 75 |
+
|
| 76 |
+
# Install dependencies
|
| 77 |
+
pip install -r requirements.txt
|
| 78 |
+
|
| 79 |
+
# Optional: Set HF token for higher rate limits
|
| 80 |
+
export HF_TOKEN=your_token_here
|
| 81 |
+
|
| 82 |
+
# Run the server
|
| 83 |
+
python app.py
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
Then connect to `http://localhost:7860/gradio_api/mcp/sse`
|
| 87 |
+
|
| 88 |
+
## Example Usage
|
| 89 |
+
|
| 90 |
+
Once connected, ask your AI assistant:
|
| 91 |
+
|
| 92 |
+
- *"Search for sentiment analysis datasets"*
|
| 93 |
+
- *"Show me 5 sample rows from the IMDB dataset"*
|
| 94 |
+
- *"What's the schema of the SQuAD dataset?"*
|
| 95 |
+
- *"Find datasets similar to IMDB"*
|
| 96 |
+
- *"What ML tasks could I do with the IMDB dataset?"*
|
| 97 |
+
- *"Compare IMDB and Rotten Tomatoes datasets"*
|
| 98 |
+
- *"Check the data quality of this dataset"*
|
| 99 |
+
|
| 100 |
+
## Tool Details
|
| 101 |
+
|
| 102 |
+
### search_datasets
|
| 103 |
+
|
| 104 |
+
Find datasets matching your criteria.
|
| 105 |
+
|
| 106 |
+
```
|
| 107 |
+
Query: "sentiment analysis"
|
| 108 |
+
Filter: text-classification
|
| 109 |
+
Limit: 10
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
### sample_rows
|
| 113 |
+
|
| 114 |
+
See actual data from a dataset.
|
| 115 |
+
|
| 116 |
+
```
|
| 117 |
+
Dataset: imdb
|
| 118 |
+
Rows: 5
|
| 119 |
+
Split: train
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
### get_statistics
|
| 123 |
+
|
| 124 |
+
Get statistical overview of columns.
|
| 125 |
+
|
| 126 |
+
```
|
| 127 |
+
Dataset: imdb
|
| 128 |
+
Sample Size: 1000
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
### profile_quality
|
| 132 |
+
|
| 133 |
+
Check for data quality issues.
|
| 134 |
+
|
| 135 |
+
```
|
| 136 |
+
Dataset: imdb
|
| 137 |
+
Sample Size: 500
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
Returns quality score, missing values, duplicates, class imbalance.
|
| 141 |
+
|
| 142 |
+
### suggest_tasks
|
| 143 |
+
|
| 144 |
+
AI-powered task suggestions based on dataset structure.
|
| 145 |
+
|
| 146 |
+
```
|
| 147 |
+
Dataset: imdb
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
Returns suggested ML tasks with confidence levels.
|
| 151 |
+
|
| 152 |
+
## Development
|
| 153 |
+
|
| 154 |
+
```bash
|
| 155 |
+
# Install dev dependencies
|
| 156 |
+
pip install -r requirements.txt
|
| 157 |
+
|
| 158 |
+
# Run in development mode
|
| 159 |
+
gradio app.py --reload
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
## Architecture
|
| 163 |
+
|
| 164 |
+
```
|
| 165 |
+
dataview-mcp/
|
| 166 |
+
├── app.py # Main Gradio MCP server
|
| 167 |
+
├── tools/
|
| 168 |
+
│ ├── search.py # search_datasets, search_by_columns
|
| 169 |
+
│ ├── metadata.py # get_dataset_info, get_schema
|
| 170 |
+
│ ├── sampling.py # sample_rows
|
| 171 |
+
│ ├── profiling.py # get_statistics, profile_quality
|
| 172 |
+
│ └── discovery.py # find_similar, suggest_tasks, compare_datasets
|
| 173 |
+
├── utils/
|
| 174 |
+
│ ├── hf_client.py # HF API wrapper
|
| 175 |
+
│ └── formatting.py # Output formatters
|
| 176 |
+
└── requirements.txt
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
## License
|
| 180 |
+
|
| 181 |
+
MIT
|
| 182 |
+
|
| 183 |
+
## Contributing
|
| 184 |
+
|
| 185 |
+
Contributions welcome! Please open an issue or PR.
|
| 186 |
+
|
| 187 |
+
---
|
| 188 |
+
|
| 189 |
+
Built with Gradio and Hugging Face Hub
|
app.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DataView MCP - A comprehensive MCP server for exploring Hugging Face datasets.
|
| 3 |
+
|
| 4 |
+
This MCP server provides 10 tools for searching, sampling, profiling, and
|
| 5 |
+
discovering datasets on the Hugging Face Hub.
|
| 6 |
+
|
| 7 |
+
Tools:
|
| 8 |
+
1. search_datasets - Find datasets by keyword, task, or domain
|
| 9 |
+
2. search_by_columns - Find datasets with specific column names
|
| 10 |
+
3. get_dataset_info - Get detailed metadata and README
|
| 11 |
+
4. get_schema - Get column names and data types
|
| 12 |
+
5. sample_rows - Get actual data samples
|
| 13 |
+
6. get_statistics - Compute column statistics
|
| 14 |
+
7. profile_quality - Assess data quality issues
|
| 15 |
+
8. find_similar - Find similar datasets
|
| 16 |
+
9. suggest_tasks - Suggest ML tasks for a dataset
|
| 17 |
+
10. compare_datasets - Compare two datasets side-by-side
|
| 18 |
+
|
| 19 |
+
Usage:
|
| 20 |
+
# Run locally
|
| 21 |
+
python app.py
|
| 22 |
+
|
| 23 |
+
# Or with Gradio CLI
|
| 24 |
+
gradio app.py
|
| 25 |
+
|
| 26 |
+
# Connect via MCP
|
| 27 |
+
Add to your MCP client config:
|
| 28 |
+
{
|
| 29 |
+
"mcpServers": {
|
| 30 |
+
"dataview": {
|
| 31 |
+
"url": "http://localhost:7860/gradio_api/mcp/sse"
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
import gradio as gr
|
| 38 |
+
from typing import Optional, List
|
| 39 |
+
|
| 40 |
+
# Import all tools
|
| 41 |
+
from tools.search import search_datasets, search_by_columns
|
| 42 |
+
from tools.metadata import get_dataset_info, get_schema
|
| 43 |
+
from tools.sampling import sample_rows
|
| 44 |
+
from tools.profiling import get_statistics, profile_quality
|
| 45 |
+
from tools.discovery import find_similar, suggest_tasks, compare_datasets
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# Create Gradio interfaces for each tool
|
| 49 |
+
# Note: Gradio will automatically convert these to MCP tools
|
| 50 |
+
|
| 51 |
+
def create_demo():
|
| 52 |
+
"""Create the Gradio demo with all tools."""
|
| 53 |
+
|
| 54 |
+
with gr.Blocks(
|
| 55 |
+
title="DataView MCP - HuggingFace Dataset Explorer",
|
| 56 |
+
theme=gr.themes.Soft()
|
| 57 |
+
) as demo:
|
| 58 |
+
gr.Markdown("""
|
| 59 |
+
# DataView MCP
|
| 60 |
+
## Explore Hugging Face Datasets with AI
|
| 61 |
+
|
| 62 |
+
This MCP server provides tools for AI assistants to explore, analyze, and
|
| 63 |
+
understand datasets on the Hugging Face Hub.
|
| 64 |
+
|
| 65 |
+
**10 Tools Available:**
|
| 66 |
+
- Search & Discovery: `search_datasets`, `search_by_columns`, `find_similar`
|
| 67 |
+
- Metadata: `get_dataset_info`, `get_schema`
|
| 68 |
+
- Data Access: `sample_rows`
|
| 69 |
+
- Analysis: `get_statistics`, `profile_quality`
|
| 70 |
+
- Intelligence: `suggest_tasks`, `compare_datasets`
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
### Try the tools below or connect via MCP
|
| 74 |
+
""")
|
| 75 |
+
|
| 76 |
+
with gr.Tabs():
|
| 77 |
+
# Search Tab
|
| 78 |
+
with gr.Tab("Search"):
|
| 79 |
+
with gr.Row():
|
| 80 |
+
with gr.Column():
|
| 81 |
+
search_query = gr.Textbox(
|
| 82 |
+
label="Search Query",
|
| 83 |
+
placeholder="e.g., sentiment analysis, medical imaging"
|
| 84 |
+
)
|
| 85 |
+
search_limit = gr.Slider(1, 50, value=10, step=1, label="Max Results")
|
| 86 |
+
search_task = gr.Dropdown(
|
| 87 |
+
choices=[
|
| 88 |
+
None, "text-classification", "question-answering",
|
| 89 |
+
"summarization", "translation", "image-classification",
|
| 90 |
+
"object-detection", "text-generation"
|
| 91 |
+
],
|
| 92 |
+
label="Filter by Task (optional)"
|
| 93 |
+
)
|
| 94 |
+
search_btn = gr.Button("Search Datasets", variant="primary")
|
| 95 |
+
with gr.Column():
|
| 96 |
+
search_output = gr.Markdown(label="Results")
|
| 97 |
+
|
| 98 |
+
search_btn.click(
|
| 99 |
+
search_datasets,
|
| 100 |
+
inputs=[search_query, search_limit, search_task],
|
| 101 |
+
outputs=search_output
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
# Dataset Info Tab
|
| 105 |
+
with gr.Tab("Dataset Info"):
|
| 106 |
+
with gr.Row():
|
| 107 |
+
with gr.Column():
|
| 108 |
+
info_dataset_id = gr.Textbox(
|
| 109 |
+
label="Dataset ID",
|
| 110 |
+
placeholder="e.g., imdb, squad, huggingface/documentation-images"
|
| 111 |
+
)
|
| 112 |
+
info_btn = gr.Button("Get Info", variant="primary")
|
| 113 |
+
schema_btn = gr.Button("Get Schema")
|
| 114 |
+
with gr.Column():
|
| 115 |
+
info_output = gr.Markdown(label="Dataset Info")
|
| 116 |
+
|
| 117 |
+
info_btn.click(get_dataset_info, inputs=[info_dataset_id], outputs=info_output)
|
| 118 |
+
schema_btn.click(get_schema, inputs=[info_dataset_id], outputs=info_output)
|
| 119 |
+
|
| 120 |
+
# Sample Data Tab
|
| 121 |
+
with gr.Tab("Sample Data"):
|
| 122 |
+
with gr.Row():
|
| 123 |
+
with gr.Column():
|
| 124 |
+
sample_dataset_id = gr.Textbox(
|
| 125 |
+
label="Dataset ID",
|
| 126 |
+
placeholder="e.g., imdb"
|
| 127 |
+
)
|
| 128 |
+
sample_n_rows = gr.Slider(1, 20, value=5, step=1, label="Number of Rows")
|
| 129 |
+
sample_split = gr.Dropdown(
|
| 130 |
+
choices=["train", "test", "validation"],
|
| 131 |
+
value="train",
|
| 132 |
+
label="Split"
|
| 133 |
+
)
|
| 134 |
+
sample_btn = gr.Button("Get Sample", variant="primary")
|
| 135 |
+
with gr.Column():
|
| 136 |
+
sample_output = gr.Markdown(label="Sample Data")
|
| 137 |
+
|
| 138 |
+
sample_btn.click(
|
| 139 |
+
sample_rows,
|
| 140 |
+
inputs=[sample_dataset_id, sample_n_rows, gr.State(None), sample_split],
|
| 141 |
+
outputs=sample_output
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
# Analysis Tab
|
| 145 |
+
with gr.Tab("Analysis"):
|
| 146 |
+
with gr.Row():
|
| 147 |
+
with gr.Column():
|
| 148 |
+
analysis_dataset_id = gr.Textbox(
|
| 149 |
+
label="Dataset ID",
|
| 150 |
+
placeholder="e.g., imdb"
|
| 151 |
+
)
|
| 152 |
+
analysis_sample_size = gr.Slider(
|
| 153 |
+
100, 2000, value=500, step=100,
|
| 154 |
+
label="Sample Size for Analysis"
|
| 155 |
+
)
|
| 156 |
+
stats_btn = gr.Button("Get Statistics", variant="primary")
|
| 157 |
+
quality_btn = gr.Button("Profile Quality")
|
| 158 |
+
with gr.Column():
|
| 159 |
+
analysis_output = gr.Markdown(label="Analysis Results")
|
| 160 |
+
|
| 161 |
+
stats_btn.click(
|
| 162 |
+
get_statistics,
|
| 163 |
+
inputs=[analysis_dataset_id, gr.State(None), gr.State("train"), analysis_sample_size],
|
| 164 |
+
outputs=analysis_output
|
| 165 |
+
)
|
| 166 |
+
quality_btn.click(
|
| 167 |
+
profile_quality,
|
| 168 |
+
inputs=[analysis_dataset_id, gr.State(None), gr.State("train"), analysis_sample_size],
|
| 169 |
+
outputs=analysis_output
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
# Discovery Tab
|
| 173 |
+
with gr.Tab("Discovery"):
|
| 174 |
+
with gr.Row():
|
| 175 |
+
with gr.Column():
|
| 176 |
+
discovery_dataset_id = gr.Textbox(
|
| 177 |
+
label="Dataset ID",
|
| 178 |
+
placeholder="e.g., imdb"
|
| 179 |
+
)
|
| 180 |
+
discovery_top_k = gr.Slider(1, 10, value=5, step=1, label="Number of Results")
|
| 181 |
+
similar_btn = gr.Button("Find Similar", variant="primary")
|
| 182 |
+
suggest_btn = gr.Button("Suggest Tasks")
|
| 183 |
+
with gr.Column():
|
| 184 |
+
discovery_output = gr.Markdown(label="Discovery Results")
|
| 185 |
+
|
| 186 |
+
similar_btn.click(
|
| 187 |
+
find_similar,
|
| 188 |
+
inputs=[discovery_dataset_id, discovery_top_k],
|
| 189 |
+
outputs=discovery_output
|
| 190 |
+
)
|
| 191 |
+
suggest_btn.click(
|
| 192 |
+
suggest_tasks,
|
| 193 |
+
inputs=[discovery_dataset_id],
|
| 194 |
+
outputs=discovery_output
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
# Compare Tab
|
| 198 |
+
with gr.Tab("Compare"):
|
| 199 |
+
with gr.Row():
|
| 200 |
+
with gr.Column():
|
| 201 |
+
compare_dataset_a = gr.Textbox(
|
| 202 |
+
label="Dataset A",
|
| 203 |
+
placeholder="e.g., imdb"
|
| 204 |
+
)
|
| 205 |
+
compare_dataset_b = gr.Textbox(
|
| 206 |
+
label="Dataset B",
|
| 207 |
+
placeholder="e.g., rotten_tomatoes"
|
| 208 |
+
)
|
| 209 |
+
compare_btn = gr.Button("Compare Datasets", variant="primary")
|
| 210 |
+
with gr.Column():
|
| 211 |
+
compare_output = gr.Markdown(label="Comparison Results")
|
| 212 |
+
|
| 213 |
+
compare_btn.click(
|
| 214 |
+
compare_datasets,
|
| 215 |
+
inputs=[compare_dataset_a, compare_dataset_b],
|
| 216 |
+
outputs=compare_output
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
gr.Markdown("""
|
| 220 |
+
---
|
| 221 |
+
### MCP Connection
|
| 222 |
+
|
| 223 |
+
To use with Claude or other MCP clients, add this to your config:
|
| 224 |
+
|
| 225 |
+
```json
|
| 226 |
+
{
|
| 227 |
+
"mcpServers": {
|
| 228 |
+
"dataview": {
|
| 229 |
+
"url": "https://YOUR-SPACE.hf.space/gradio_api/mcp/sse"
|
| 230 |
+
}
|
| 231 |
+
}
|
| 232 |
+
}
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
---
|
| 236 |
+
Built with Gradio MCP
|
| 237 |
+
""")
|
| 238 |
+
|
| 239 |
+
return demo
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
# Create the demo
|
| 243 |
+
demo = create_demo()
|
| 244 |
+
|
| 245 |
+
# Launch with MCP server enabled
|
| 246 |
+
if __name__ == "__main__":
|
| 247 |
+
demo.launch(
|
| 248 |
+
mcp_server=True,
|
| 249 |
+
share=False,
|
| 250 |
+
server_name="0.0.0.0",
|
| 251 |
+
server_port=7860
|
| 252 |
+
)
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.0.0
|
| 2 |
+
huggingface_hub>=0.25.0
|
| 3 |
+
datasets>=3.0.0
|
| 4 |
+
pandas>=2.0.0
|
| 5 |
+
numpy>=1.24.0
|
| 6 |
+
sentence-transformers>=3.0.0
|
| 7 |
+
python-dotenv>=1.0.0
|
tools/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MCP Tools for dataset exploration."""
|
| 2 |
+
|
| 3 |
+
from .search import search_datasets, search_by_columns
|
| 4 |
+
from .metadata import get_dataset_info, get_schema
|
| 5 |
+
from .sampling import sample_rows
|
| 6 |
+
from .profiling import get_statistics, profile_quality
|
| 7 |
+
from .discovery import find_similar, suggest_tasks, compare_datasets
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"search_datasets",
|
| 11 |
+
"search_by_columns",
|
| 12 |
+
"get_dataset_info",
|
| 13 |
+
"get_schema",
|
| 14 |
+
"sample_rows",
|
| 15 |
+
"get_statistics",
|
| 16 |
+
"profile_quality",
|
| 17 |
+
"find_similar",
|
| 18 |
+
"suggest_tasks",
|
| 19 |
+
"compare_datasets",
|
| 20 |
+
]
|
tools/discovery.py
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Discovery tools for finding similar datasets and suggesting ML tasks."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional, List, Dict, Any
|
| 4 |
+
from utils.hf_client import get_client
|
| 5 |
+
from utils.formatting import format_similar_datasets, format_task_suggestions, format_comparison
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Common ML task patterns based on column names and types
|
| 9 |
+
TASK_PATTERNS = {
|
| 10 |
+
"text-classification": {
|
| 11 |
+
"columns": ["text", "label", "sentence", "review", "comment", "content"],
|
| 12 |
+
"name": "Text Classification",
|
| 13 |
+
"target_hints": ["label", "class", "category", "sentiment", "target"]
|
| 14 |
+
},
|
| 15 |
+
"question-answering": {
|
| 16 |
+
"columns": ["question", "answer", "context", "response"],
|
| 17 |
+
"name": "Question Answering",
|
| 18 |
+
"target_hints": ["answer", "response"]
|
| 19 |
+
},
|
| 20 |
+
"summarization": {
|
| 21 |
+
"columns": ["article", "summary", "document", "highlights", "abstract"],
|
| 22 |
+
"name": "Text Summarization",
|
| 23 |
+
"target_hints": ["summary", "highlights", "abstract"]
|
| 24 |
+
},
|
| 25 |
+
"translation": {
|
| 26 |
+
"columns": ["source", "target", "en", "de", "fr", "es", "translation"],
|
| 27 |
+
"name": "Machine Translation",
|
| 28 |
+
"target_hints": ["target", "translation"]
|
| 29 |
+
},
|
| 30 |
+
"image-classification": {
|
| 31 |
+
"columns": ["image", "label", "img", "photo"],
|
| 32 |
+
"name": "Image Classification",
|
| 33 |
+
"target_hints": ["label", "class", "category"]
|
| 34 |
+
},
|
| 35 |
+
"named-entity-recognition": {
|
| 36 |
+
"columns": ["tokens", "ner_tags", "tags", "entities"],
|
| 37 |
+
"name": "Named Entity Recognition",
|
| 38 |
+
"target_hints": ["ner_tags", "tags", "entities", "labels"]
|
| 39 |
+
},
|
| 40 |
+
"token-classification": {
|
| 41 |
+
"columns": ["tokens", "labels", "tags", "pos_tags"],
|
| 42 |
+
"name": "Token Classification",
|
| 43 |
+
"target_hints": ["labels", "tags"]
|
| 44 |
+
},
|
| 45 |
+
"text-generation": {
|
| 46 |
+
"columns": ["prompt", "completion", "input", "output", "instruction"],
|
| 47 |
+
"name": "Text Generation / Instruction Following",
|
| 48 |
+
"target_hints": ["completion", "output", "response"]
|
| 49 |
+
},
|
| 50 |
+
"tabular-classification": {
|
| 51 |
+
"columns": ["target", "label", "class"],
|
| 52 |
+
"name": "Tabular Classification",
|
| 53 |
+
"target_hints": ["target", "label", "class", "y"]
|
| 54 |
+
},
|
| 55 |
+
"tabular-regression": {
|
| 56 |
+
"columns": ["target", "price", "value", "score", "rating"],
|
| 57 |
+
"name": "Tabular Regression",
|
| 58 |
+
"target_hints": ["target", "price", "value", "score", "rating"]
|
| 59 |
+
}
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def find_similar(
|
| 64 |
+
dataset_id: str,
|
| 65 |
+
top_k: int = 5
|
| 66 |
+
) -> str:
|
| 67 |
+
"""
|
| 68 |
+
Find datasets similar to a given dataset based on tags, domain, and structure.
|
| 69 |
+
|
| 70 |
+
Use this tool to discover alternative or complementary datasets for your task.
|
| 71 |
+
Similarity is based on shared tags, similar column structures, and domain overlap.
|
| 72 |
+
|
| 73 |
+
Args:
|
| 74 |
+
dataset_id: The dataset to find similar datasets for (e.g., "imdb", "squad")
|
| 75 |
+
top_k: Number of similar datasets to return (1-10, default: 5)
|
| 76 |
+
|
| 77 |
+
Returns:
|
| 78 |
+
List of similar datasets with:
|
| 79 |
+
- Dataset ID and download count
|
| 80 |
+
- Similarity score (0-1)
|
| 81 |
+
- Reason for similarity (shared tags, similar structure, etc.)
|
| 82 |
+
|
| 83 |
+
How similarity is computed:
|
| 84 |
+
- Tag overlap (same task categories, languages, domains)
|
| 85 |
+
- Similar column names and structures
|
| 86 |
+
- Same author/organization
|
| 87 |
+
- Related task types
|
| 88 |
+
"""
|
| 89 |
+
top_k = max(1, min(10, top_k))
|
| 90 |
+
|
| 91 |
+
client = get_client()
|
| 92 |
+
|
| 93 |
+
# Get info about the source dataset
|
| 94 |
+
source_info = client.get_dataset_info(dataset_id)
|
| 95 |
+
if "error" in source_info:
|
| 96 |
+
return f"Error: Could not fetch info for dataset '{dataset_id}': {source_info.get('error')}"
|
| 97 |
+
|
| 98 |
+
source_tags = set(source_info.get('tags', []))
|
| 99 |
+
|
| 100 |
+
# Get schema for structure comparison
|
| 101 |
+
source_schema = client.get_schema(dataset_id)
|
| 102 |
+
source_columns = set(source_schema.get('columns', [])) if "error" not in source_schema else set()
|
| 103 |
+
|
| 104 |
+
# Extract key tags for search
|
| 105 |
+
search_terms = []
|
| 106 |
+
for tag in source_tags:
|
| 107 |
+
if ':' in tag:
|
| 108 |
+
# Task category tags like "task_categories:text-classification"
|
| 109 |
+
if tag.startswith('task_categories:'):
|
| 110 |
+
search_terms.append(tag.split(':')[1])
|
| 111 |
+
elif tag.startswith('language:'):
|
| 112 |
+
search_terms.append(tag.split(':')[1])
|
| 113 |
+
elif len(tag) > 2:
|
| 114 |
+
search_terms.append(tag)
|
| 115 |
+
|
| 116 |
+
# Search for candidates
|
| 117 |
+
candidates = []
|
| 118 |
+
for term in search_terms[:3]: # Use top 3 terms
|
| 119 |
+
results = client.search_datasets(term, limit=20)
|
| 120 |
+
candidates.extend(results)
|
| 121 |
+
|
| 122 |
+
# Remove duplicates and source dataset
|
| 123 |
+
seen = {dataset_id}
|
| 124 |
+
unique_candidates = []
|
| 125 |
+
for ds in candidates:
|
| 126 |
+
if ds['id'] not in seen:
|
| 127 |
+
seen.add(ds['id'])
|
| 128 |
+
unique_candidates.append(ds)
|
| 129 |
+
|
| 130 |
+
# Score candidates
|
| 131 |
+
scored = []
|
| 132 |
+
for ds in unique_candidates[:30]: # Limit processing
|
| 133 |
+
try:
|
| 134 |
+
ds_info = client.get_dataset_info(ds['id'])
|
| 135 |
+
ds_tags = set(ds_info.get('tags', []))
|
| 136 |
+
|
| 137 |
+
# Compute similarity score
|
| 138 |
+
tag_overlap = len(source_tags & ds_tags)
|
| 139 |
+
tag_score = tag_overlap / max(len(source_tags), 1)
|
| 140 |
+
|
| 141 |
+
# Check column similarity
|
| 142 |
+
ds_schema = client.get_schema(ds['id'])
|
| 143 |
+
ds_columns = set(ds_schema.get('columns', [])) if "error" not in ds_schema else set()
|
| 144 |
+
col_overlap = len(source_columns & ds_columns)
|
| 145 |
+
col_score = col_overlap / max(len(source_columns), 1) if source_columns else 0
|
| 146 |
+
|
| 147 |
+
# Combined score
|
| 148 |
+
similarity = (tag_score * 0.6) + (col_score * 0.4)
|
| 149 |
+
|
| 150 |
+
# Determine reason
|
| 151 |
+
reasons = []
|
| 152 |
+
if tag_overlap > 0:
|
| 153 |
+
common_tags = list(source_tags & ds_tags)[:3]
|
| 154 |
+
reasons.append(f"Shared tags: {', '.join(common_tags)}")
|
| 155 |
+
if col_overlap > 0:
|
| 156 |
+
common_cols = list(source_columns & ds_columns)[:3]
|
| 157 |
+
reasons.append(f"Similar columns: {', '.join(common_cols)}")
|
| 158 |
+
if ds_info.get('author') == source_info.get('author'):
|
| 159 |
+
reasons.append("Same author")
|
| 160 |
+
similarity += 0.1
|
| 161 |
+
|
| 162 |
+
if similarity > 0.1:
|
| 163 |
+
scored.append({
|
| 164 |
+
"id": ds['id'],
|
| 165 |
+
"downloads": ds.get('downloads', 0),
|
| 166 |
+
"similarity_score": min(1.0, similarity),
|
| 167 |
+
"reason": "; ".join(reasons) if reasons else "Related domain"
|
| 168 |
+
})
|
| 169 |
+
except Exception:
|
| 170 |
+
continue
|
| 171 |
+
|
| 172 |
+
# Sort by similarity and return top_k
|
| 173 |
+
scored.sort(key=lambda x: x['similarity_score'], reverse=True)
|
| 174 |
+
return format_similar_datasets(scored[:top_k])
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def suggest_tasks(dataset_id: str) -> str:
    """
    Analyze a dataset and suggest suitable machine learning tasks.

    Matches the dataset's column names and Hub tags against known task
    patterns (``TASK_PATTERNS``) and ranks the matches by confidence.

    Args:
        dataset_id: The dataset to analyze (e.g., "imdb", "squad", "cnn_dailymail").

    Returns:
        Formatted list of suggested ML tasks, each with:
        - Task name and confidence level (high/medium/low)
        - Reasoning for the suggestion
        - Recommended target column
        - Recommended feature columns

        If no pattern matches, falls back to generic text/numeric suggestions
        derived from the schema's feature types. At most five suggestions are
        returned.
    """
    client = get_client()

    schema = client.get_schema(dataset_id)
    if "error" in schema:
        return format_task_suggestions({"error": f"Could not load schema: {schema['error']}"})

    lowered_cols = [name.lower() for name in schema.get('columns', [])]
    feature_types = schema.get('features', {})

    info = client.get_dataset_info(dataset_id)
    lowered_tags = [] if "error" in info else [tag.lower() for tag in info.get('tags', [])]

    found: List[Dict[str, Any]] = []

    for task_id, pattern in TASK_PATTERNS.items():
        # Substring match of pattern column hints against the actual columns.
        hint_cols = [hint.lower() for hint in pattern['columns']]
        col_hits = [name for name in lowered_cols if any(hint in name for hint in hint_cols)]
        # A tag counts if the task id appears anywhere inside it.
        tag_hit = any(task_id in tag for tag in lowered_tags)

        if not (col_hits or tag_hit):
            continue

        # Confidence: both signals -> high; either strong signal -> medium.
        if tag_hit and len(col_hits) >= 2:
            confidence = "high"
        elif tag_hit or len(col_hits) >= 2:
            confidence = "medium"
        else:
            confidence = "low"

        # First column whose name contains one of the pattern's target hints.
        target_hints = [hint.lower() for hint in pattern['target_hints']]
        target = next(
            (name for name in lowered_cols if any(hint in name for hint in target_hints)),
            None,
        )

        why = []
        if col_hits:
            why.append(f"Found columns: {', '.join(col_hits[:3])}")
        if tag_hit:
            why.append("Dataset tags indicate this task")

        found.append({
            "name": pattern['name'],
            "confidence": confidence,
            "reason": "; ".join(why),
            "target_column": target,
            # Everything except the target, capped at five columns.
            "feature_columns": [name for name in lowered_cols if name != target][:5],
        })

    # Stable sort keeps TASK_PATTERNS order within the same confidence tier.
    rank = {"high": 0, "medium": 1, "low": 2}
    found.sort(key=lambda item: rank.get(item['confidence'], 3))

    if not found:
        # Fallback: infer generic suggestions from the declared feature types.
        raw_cols = schema.get('columns', [])

        def type_text(col_name):
            # Stringified HF feature type, lowercased for substring checks.
            return str(feature_types.get(col_name, '')).lower()

        if any('string' in type_text(c) for c in raw_cols):
            found.append({
                "name": "Text Analysis (Generic)",
                "confidence": "low",
                "reason": "Dataset contains text columns",
                "target_column": None,
                "feature_columns": lowered_cols[:5]
            })
        if any('int' in type_text(c) or 'float' in type_text(c) for c in raw_cols):
            found.append({
                "name": "Regression/Classification (Generic)",
                "confidence": "low",
                # Heuristic: last column is often the label in tabular data.
                "reason": "Dataset contains numeric columns",
                "target_column": lowered_cols[-1] if lowered_cols else None,
                "feature_columns": lowered_cols[:-1] if len(lowered_cols) > 1 else lowered_cols
            })

    return format_task_suggestions({
        "dataset_id": dataset_id,
        "tasks": found[:5]  # cap at the five best suggestions
    })
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def compare_datasets(
    dataset_a: str,
    dataset_b: str
) -> str:
    """
    Compare two datasets side-by-side to understand their differences.

    Use this tool when deciding between similar datasets or understanding
    how datasets differ in structure, size, and content.

    Args:
        dataset_a: First dataset ID to compare (e.g., "imdb")
        dataset_b: Second dataset ID to compare (e.g., "rotten_tomatoes")

    Returns:
        Comparison table covering downloads, likes, license, column counts,
        author, shared tags, plus lists of common and dataset-specific columns.

    Use cases:
        - Choosing between similar datasets for a task
        - Understanding dataset versions or variants
        - Finding complementary datasets
    """
    client = get_client()

    meta_a = client.get_dataset_info(dataset_a)
    meta_b = client.get_dataset_info(dataset_b)

    # Bail out early with a plain-text error if either lookup failed.
    if "error" in meta_a:
        return f"Error loading dataset A ({dataset_a}): {meta_a.get('error')}"
    if "error" in meta_b:
        return f"Error loading dataset B ({dataset_b}): {meta_b.get('error')}"

    def column_set(ds_id):
        # Empty set when the schema cannot be loaded — comparison still works.
        sch = client.get_schema(ds_id)
        return set() if "error" in sch else set(sch.get('columns', []))

    cols_a = column_set(dataset_a)
    cols_b = column_set(dataset_b)

    def pair(value_a, value_b):
        return {"a": value_a, "b": value_b}

    side_by_side = {
        "Downloads": pair(f"{meta_a.get('downloads', 0):,}", f"{meta_b.get('downloads', 0):,}"),
        "Likes": pair(str(meta_a.get('likes', 0)), str(meta_b.get('likes', 0))),
        "License": pair(meta_a.get('license') or "N/A", meta_b.get('license') or "N/A"),
        "Columns": pair(str(len(cols_a)), str(len(cols_b))),
        "Author": pair(meta_a.get('author') or "N/A", meta_b.get('author') or "N/A"),
    }

    comparison = {
        "dataset_a": dataset_a,
        "dataset_b": dataset_b,
        "comparison": side_by_side,
        "common_columns": list(cols_a & cols_b),
        "unique_to_a": list(cols_a - cols_b),
        "unique_to_b": list(cols_b - cols_a),
    }

    shared_tags = set(meta_a.get('tags', [])) & set(meta_b.get('tags', []))
    if shared_tags:
        # NOTE(review): asymmetric cell — 'a' holds the shared-tag count while
        # 'b' holds up to three tag names; confirm this is the intended layout.
        side_by_side["Common Tags"] = pair(
            str(len(shared_tags)),
            ", ".join(list(shared_tags)[:3])
        )

    return format_comparison(comparison)
|
tools/metadata.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Metadata tools for getting dataset information and schemas."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from utils.hf_client import get_client
|
| 5 |
+
from utils.formatting import format_dataset_info, format_schema
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_dataset_info(dataset_id: str) -> str:
    """
    Get detailed information about a specific dataset on Hugging Face Hub.

    Use this tool to learn about a dataset's metadata, including its author,
    download count, license, tags, and a summary of its dataset card/README.

    Args:
        dataset_id: The full dataset identifier (e.g., "squad", "imdb",
            "huggingface/documentation-images", "username/dataset-name")

    Returns:
        Formatted dataset information (author, creation date, downloads,
        likes, license, tags, and a card summary), or a plain-text error
        message when the dataset cannot be fetched.

    Example dataset IDs:
        - "squad" - Stanford Question Answering Dataset
        - "imdb" - IMDB movie reviews for sentiment
        - "cnn_dailymail" - News summarization
        - "imagenet-1k" - Image classification benchmark
    """
    info = get_client().get_dataset_info(dataset_id)

    # Success path first; failures produce a human-readable hint.
    if "error" not in info:
        return format_dataset_info(info)
    return f"Error fetching dataset info: {info['error']}\n\nMake sure the dataset ID is correct and the dataset exists."
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def get_schema(
    dataset_id: str,
    config: Optional[str] = None,
    split: str = "train"
) -> str:
    """
    Get the schema (columns and data types) of a dataset.

    Use this tool to understand the structure of a dataset before loading
    samples or performing analysis. Shows all column names and their types,
    plus the dataset's available configurations and splits.

    Args:
        dataset_id: The full dataset identifier (e.g., "squad", "imdb")
        config: Optional dataset configuration name; many datasets ship
            several configs. Leave empty for the default.
        split: The dataset split to examine ("train", "test", "validation").
            Default: "train"

    Returns:
        Formatted schema (column count, names, Hugging Face feature types)
        followed by the list of available configurations. On failure, an
        error message that also lists valid config/split combinations when
        they are known.

    Common feature types:
        - Value(dtype='string') - Text data
        - Value(dtype='int64') - Integer numbers
        - Value(dtype='float32') - Decimal numbers
        - ClassLabel - Categorical labels with names
        - Image / Audio - Media data
        - Sequence - Lists/arrays of values
    """
    client = get_client()

    # Fetched up front so both the error and success paths can list what exists.
    available = client.get_configs_and_splits(dataset_id)

    schema = client.get_schema(dataset_id, config, split)

    if "error" in schema:
        # Build a helpful error that enumerates valid config/split pairs.
        parts = [f"Error getting schema: {schema['error']}\n\n"]
        if available:
            parts.append("Available configurations and splits:\n")
            parts.extend(
                f"- Config '{cfg}': {', '.join(splits)}\n"
                for cfg, splits in available.items()
            )
            parts.append("\nTry specifying a valid config and split.")
        return "".join(parts)

    rendered = format_schema(schema)

    if available:
        sections = [rendered, "\n\n### Available Configurations\n"]
        sections.extend(
            f"- **{cfg}**: {', '.join(splits)}\n"
            for cfg, splits in available.items()
        )
        rendered = "".join(sections)

    return rendered
|
tools/profiling.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Profiling tools for analyzing dataset statistics and quality."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional, Dict, Any, List
|
| 4 |
+
from utils.hf_client import get_client
|
| 5 |
+
from utils.formatting import format_statistics, format_quality_report
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def get_statistics(
    dataset_id: str,
    config: Optional[str] = None,
    split: str = "train",
    sample_size: int = 1000
) -> str:
    """
    Compute basic statistics for each column in a dataset sample.

    Loads up to ``sample_size`` rows and summarizes every column according to
    the Python type of its first non-null value.

    Args:
        dataset_id: The full dataset identifier (e.g., "squad", "imdb")
        config: Optional dataset configuration name. Leave empty for default.
        split: The dataset split to analyze ("train", "test", "validation"). Default: "train"
        sample_size: Number of rows to sample (clamped to 100-5000, default: 1000).
            Larger samples are more accurate but slower.

    Returns:
        Formatted statistics including:
        - Sampled row count
        - Per-column statistics:
          - Numeric: count, min, max, mean, unique count
          - Text: count, avg/min/max length, unique count, sample values
          - Boolean: true/false counts and true percentage
          - Lists: count and element-count range
          - Nested objects: count and sample keys
          - Other/binary types: type name only

    Notes:
        - Statistics are computed on a sample for efficiency
        - Column type is inferred from the first non-null value observed
        - Binary data columns (images, audio) show type info only
    """
    sample_size = min(5000, max(100, sample_size))  # enforce documented bounds

    client = get_client()
    samples = client.load_sample(
        dataset_id=dataset_id,
        config=config,
        split=split,
        n_rows=sample_size,
    )

    if not samples or "error" in samples[0]:
        failure = samples[0].get('error', 'Unknown error') if samples else 'No data'
        return f"Error loading data for statistics: {failure}"

    def summarize(values):
        """Stats dict for one column's non-null values; None means omit the column."""
        first = values[0]  # type is inferred from the first non-null value

        # bool checked before int/float because bool subclasses int in Python.
        if isinstance(first, bool):
            positives = sum(1 for v in values if v is True)
            return {
                "type": "boolean",
                "count": len(values),
                "true_count": positives,
                "false_count": len(values) - positives,
                "true_pct": (positives / len(values)) * 100
            }

        if isinstance(first, (int, float)):
            nums = [v for v in values if isinstance(v, (int, float))]
            if not nums:
                return None  # defensive guard; values[0] is numeric so this is rare
            return {
                "type": "numeric",
                "count": len(nums),
                "min": min(nums),
                "max": max(nums),
                "mean": sum(nums) / len(nums),
                "unique": len(set(nums))
            }

        if isinstance(first, str):
            sizes = [len(v) for v in values if isinstance(v, str)]
            distinct = set(values)
            return {
                "type": "text",
                "count": len(values),
                "avg_length": sum(sizes) / len(sizes) if sizes else 0,
                "min_length": min(sizes) if sizes else 0,
                "max_length": max(sizes) if sizes else 0,
                "unique": len(distinct),
                "sample_values": list(distinct)[:3]
            }

        if isinstance(first, list):
            sizes = [len(v) for v in values if isinstance(v, list)]
            return {
                "type": "list/sequence",
                "count": len(values),
                "avg_length": sum(sizes) / len(sizes) if sizes else 0,
                "min_length": min(sizes) if sizes else 0,
                "max_length": max(sizes) if sizes else 0
            }

        if isinstance(first, dict):
            return {
                "type": "object/nested",
                "count": len(values),
                "sample_keys": list(first.keys())[:5] if first else []
            }

        # Anything else (PIL images, raw bytes, ...) — report the type only.
        return {
            "type": str(type(first).__name__),
            "count": len(values),
            "note": "Binary/special data type"
        }

    stats = {
        "total_rows": f"~{len(samples):,}+ (sampled)",
        "column_stats": {}
    }

    for col in samples[0].keys():
        present = [row.get(col) for row in samples if row.get(col) is not None]
        if not present:
            stats["column_stats"][col] = {"status": "all null"}
            continue
        summary = summarize(present)
        if summary is not None:
            stats["column_stats"][col] = summary

    return format_statistics(stats)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def profile_quality(
    dataset_id: str,
    config: Optional[str] = None,
    split: str = "train",
    sample_size: int = 500
) -> str:
    """
    Assess the data quality of a dataset and identify potential issues.

    Use this tool to check for common data quality problems like missing values,
    duplicates, imbalanced classes, and constant columns before using a dataset.

    Args:
        dataset_id: The full dataset identifier (e.g., "squad", "imdb")
        config: Optional dataset configuration name. Leave empty for default.
        split: The dataset split to analyze ("train", "test", "validation"). Default: "train"
        sample_size: Number of rows to sample for the quality check
            (clamped to 100-2000, default: 500).

    Returns:
        Data quality report including:
        - Overall quality score (0-100)
        - List of identified issues
        - Per-column quality metrics:
          - Missing value percentage
          - Unique value percentage
          - Specific issues (constant values, high cardinality, etc.)

    Quality checks performed:
        - Missing/null values
        - Duplicate rows
        - Constant columns (single value)
        - Probable ID columns / high-cardinality text columns
        - Class imbalance for categorical columns
    """
    sample_size = max(100, min(2000, sample_size))

    client = get_client()

    # All checks below run on this bounded sample only.
    samples = client.load_sample(
        dataset_id=dataset_id,
        config=config,
        split=split,
        n_rows=sample_size
    )

    if not samples or "error" in samples[0]:
        error_msg = samples[0].get('error', 'Unknown error') if samples else 'No data'
        return format_quality_report({"error": error_msg})

    report: Dict[str, Any] = {
        "dataset_id": dataset_id,
        "sample_size": len(samples),
        "issues": [],
        "column_quality": {},
        "overall_score": 100
    }

    # Duplicate-row check. Deliberately best-effort: rows containing values that
    # json can't serialize just skip the whole check instead of failing the report.
    try:
        row_strings = [json.dumps(row, sort_keys=True, default=str) for row in samples]
        unique_rows = len(set(row_strings))
        duplicate_pct = ((len(samples) - unique_rows) / len(samples)) * 100
        if duplicate_pct > 5:
            report["issues"].append(f"High duplicate rate: {duplicate_pct:.1f}% duplicate rows")
            report["overall_score"] -= min(20, duplicate_pct)
    except Exception:
        pass

    for col in samples[0].keys():
        col_values = [row.get(col) for row in samples]
        # Empty strings are treated as missing alongside None.
        non_null_values = [v for v in col_values if v is not None and v != ""]

        col_quality: Dict[str, Any] = {
            "missing_pct": ((len(samples) - len(non_null_values)) / len(samples)) * 100,
            "issues": []
        }

        # Missing-value checks. BUG FIX: the original tested `> 20` before
        # `> 50`, so the severe (>50%) branch was unreachable; the severe
        # threshold must be checked first.
        if col_quality["missing_pct"] > 50:
            col_quality["issues"].append("High missing rate")
            report["issues"].append(f"Column '{col}' has {col_quality['missing_pct']:.0f}% missing values")
            report["overall_score"] -= 10
        elif col_quality["missing_pct"] > 20:
            col_quality["issues"].append("High missing rate")
            report["overall_score"] -= 5

        if non_null_values:
            # Uniqueness is measured on stringified values so unhashable
            # types (lists, dicts) don't crash the check.
            unique_count = len(set(str(v) for v in non_null_values))
            col_quality["unique_pct"] = (unique_count / len(non_null_values)) * 100

            if unique_count == 1:
                col_quality["issues"].append("Constant value")
                report["issues"].append(f"Column '{col}' has only one unique value")
                report["overall_score"] -= 5
            # Nearly-all-unique columns are probably identifiers, not features.
            elif col_quality["unique_pct"] > 99 and len(non_null_values) > 10:
                col_quality["issues"].append("Possibly ID column")
            elif isinstance(non_null_values[0], str) and unique_count > len(samples) * 0.8:
                col_quality["issues"].append("High cardinality")

            # Class-imbalance check for small categorical domains (<= 20 classes).
            sample_val = non_null_values[0]
            if isinstance(sample_val, (str, int, bool)) and unique_count <= 20:
                value_counts: Dict[str, int] = {}
                for v in non_null_values:
                    key = str(v)
                    value_counts[key] = value_counts.get(key, 0) + 1

                if value_counts:
                    max_count = max(value_counts.values())
                    min_count = min(value_counts.values())
                    # Majority class more than 10x the minority class.
                    if max_count > min_count * 10:
                        col_quality["issues"].append("Class imbalance")
                        # Only penalize the score when the column looks like a label.
                        if "label" in col.lower() or "class" in col.lower():
                            report["issues"].append(f"Significant class imbalance in '{col}'")
                            report["overall_score"] -= 10
        else:
            col_quality["unique_pct"] = 0

        # Collapse the per-column issue list into a display string ("-" when clean).
        col_quality["issues"] = ", ".join(col_quality["issues"]) if col_quality["issues"] else "-"
        report["column_quality"][col] = col_quality

    # Clamp the score to the documented 0-100 range.
    report["overall_score"] = max(0, min(100, report["overall_score"]))

    if not report["issues"]:
        report["issues"].append("No major issues detected")

    return format_quality_report(report)
|
tools/sampling.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Sampling tools for loading actual data from datasets."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from utils.hf_client import get_client
|
| 5 |
+
from utils.formatting import format_sample
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def sample_rows(
    dataset_id: str,
    n_rows: int = 5,
    config: Optional[str] = None,
    split: str = "train",
    random_seed: Optional[int] = None
) -> str:
    """
    Get a sample of actual rows from a dataset to inspect the data.

    Use this tool to see real examples from a dataset: what the data looks
    like, the format of each column, and typical values.

    Args:
        dataset_id: The full dataset identifier (e.g., "squad", "imdb")
        n_rows: Number of rows to sample (clamped to 1-20, default: 5).
            Keep small for large datasets.
        config: Optional dataset configuration name. Leave empty for default config.
        split: The dataset split to sample from ("train", "test", "validation"). Default: "train"
        random_seed: Accepted for interface compatibility; see the note below.

    Returns:
        Formatted sample showing actual data rows in JSON format, with each
        row numbered and clearly separated.

    Notes:
        - Large binary data (images, audio) is shown as placeholders
        - Very long text is truncated for readability
        - Use get_schema first to understand column types before sampling
    """
    # Clamp to the supported window of 1..20 rows.
    n_rows = min(20, max(1, n_rows))

    # NOTE(review): random_seed is accepted but never forwarded to
    # load_sample, so the result is always the first n_rows of the split —
    # confirm whether the client is meant to support seeded random sampling.
    fetched = get_client().load_sample(
        dataset_id=dataset_id,
        config=config,
        split=split,
        n_rows=n_rows,
    )

    return format_sample(fetched, dataset_id)
|
tools/search.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Search tools for finding datasets on Hugging Face Hub."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional, List
|
| 4 |
+
from utils.hf_client import get_client
|
| 5 |
+
from utils.formatting import format_dataset_list
|
| 6 |
+
from huggingface_hub import list_datasets
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def search_datasets(
    query: str,
    limit: int = 10,
    filter_task: Optional[str] = None,
    sort_by: str = "downloads"
) -> str:
    """
    Search for datasets on Hugging Face Hub by keyword, task, or domain.

    Use this tool to find datasets matching specific criteria. You can search
    by name or description, and optionally filter by ML task category.

    Args:
        query: Search query string (e.g., "sentiment analysis", "medical")
        limit: Maximum number of results to return (clamped to 1-50, default: 10)
        filter_task: Optional ML task filter (e.g., "text-classification",
            "image-classification", "question-answering", "summarization")
        sort_by: Sort results by "downloads", "likes", or "created"
            (default: "downloads")

    Returns:
        Formatted list of matching datasets with IDs, download counts, and tags.

    Example queries:
        - "sentiment" - Find sentiment analysis datasets
        - "medical imaging" - Find medical image datasets
        - "french translation" - Find French translation datasets
    """
    limit = min(50, max(1, limit))  # keep within the 1..50 window

    results = get_client().search_datasets(
        query=query,
        limit=limit,
        filter_task=filter_task,
        sort=sort_by,
    )

    return format_dataset_list(results)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def search_by_columns(
    column_names: List[str],
    data_types: Optional[List[str]] = None,
    limit: int = 10
) -> str:
    """
    Find datasets that contain specific column names or data types.

    Use this tool when you need datasets with particular features or columns,
    such as finding all datasets with a "label" column or "image" type.

    Args:
        column_names: List of column names to search for (e.g., ["text", "label"], ["image", "caption"])
        data_types: Optional list of data types to filter by (e.g., ["Image", "Audio", "ClassLabel"])
        limit: Maximum number of results to return (1-30, default: 10)

    Returns:
        Formatted list of datasets containing the specified columns/types.

    Common column patterns:
        - Text classification: ["text", "label"]
        - Image classification: ["image", "label"]
        - Question answering: ["question", "answer", "context"]
        - Translation: ["source", "target"] or ["en", "fr"]
    """
    limit = max(1, min(30, limit))

    # Build search query from column names
    search_query = " ".join(column_names)

    # Over-fetch so that schema-based filtering can still fill `limit` slots.
    client = get_client()
    all_datasets = client.search_datasets(query=search_query, limit=limit * 3)

    # Filter by actually checking schemas (best effort)
    matching_datasets = []
    for ds in all_datasets:
        if len(matching_datasets) >= limit:
            break

        try:
            schema = client.get_schema(ds['id'])
        except Exception:
            # Schema lookup is best-effort; skip datasets that fail to load.
            continue
        if "error" in schema:
            continue

        columns = schema.get('columns', [])
        columns_lower = [c.lower() for c in columns]

        # Check if any requested columns match (case-insensitive)
        matches = sum(1 for col in column_names if col.lower() in columns_lower)
        if matches == 0:
            continue

        # Apply the optional dtype filter: keep the dataset only if at least
        # one feature type mentions a requested type. (Previously the
        # `data_types` argument was accepted but silently ignored.)
        if data_types:
            feature_types = " ".join(
                str(t) for t in schema.get('features', {}).values()
            ).lower()
            if not any(dt.lower() in feature_types for dt in data_types):
                continue

        ds['matched_columns'] = matches
        ds['total_columns'] = len(columns)
        matching_datasets.append(ds)

    if not matching_datasets:
        return f"No datasets found with columns matching: {', '.join(column_names)}\n\nTry broader search terms or check column naming conventions."

    # Format results
    lines = [f"## Datasets with columns: {', '.join(column_names)}\n"]
    for i, ds in enumerate(matching_datasets, 1):
        downloads = ds.get('downloads')
        # The ',' format spec raises on the 'N/A' fallback (and on None);
        # only apply it to real numbers.
        downloads_str = f"{downloads:,}" if isinstance(downloads, int) else "N/A"
        lines.append(f"### {i}. {ds['id']}")
        lines.append(f"- Matched columns: {ds.get('matched_columns', 'N/A')}/{len(column_names)}")
        lines.append(f"- Total columns: {ds.get('total_columns', 'N/A')}")
        lines.append(f"- Downloads: {downloads_str}")
        lines.append("")

    return "\n".join(lines)
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utility modules for dataview-mcp."""
|
| 2 |
+
|
| 3 |
+
from .hf_client import get_client, HFDatasetClient
|
| 4 |
+
from .formatting import (
|
| 5 |
+
format_dataset_list,
|
| 6 |
+
format_dataset_info,
|
| 7 |
+
format_schema,
|
| 8 |
+
format_sample,
|
| 9 |
+
format_statistics,
|
| 10 |
+
format_quality_report,
|
| 11 |
+
format_comparison,
|
| 12 |
+
format_similar_datasets,
|
| 13 |
+
format_task_suggestions,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
"get_client",
|
| 18 |
+
"HFDatasetClient",
|
| 19 |
+
"format_dataset_list",
|
| 20 |
+
"format_dataset_info",
|
| 21 |
+
"format_schema",
|
| 22 |
+
"format_sample",
|
| 23 |
+
"format_statistics",
|
| 24 |
+
"format_quality_report",
|
| 25 |
+
"format_comparison",
|
| 26 |
+
"format_similar_datasets",
|
| 27 |
+
"format_task_suggestions",
|
| 28 |
+
]
|
utils/formatting.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Formatting utilities for MCP tool outputs."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from typing import Any, Dict, List
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def format_dataset_list(datasets: List[Dict[str, Any]]) -> str:
    """Format a list of datasets for display.

    Args:
        datasets: Dataset dicts with at least an 'id' key; 'downloads',
            'likes' and 'tags' are optional and may be None.

    Returns:
        Markdown summary, or "No datasets found." for an empty list.
    """
    if not datasets:
        return "No datasets found."

    lines = ["## Datasets Found\n"]
    for i, ds in enumerate(datasets, 1):
        downloads = ds.get('downloads')
        # The ',' format spec raises ValueError on the 'N/A' fallback string
        # (and TypeError on None); only apply it to real numbers.
        downloads_str = f"{downloads:,}" if isinstance(downloads, int) else "N/A"
        lines.append(f"### {i}. {ds['id']}")
        lines.append(f"- Downloads: {downloads_str}")
        lines.append(f"- Likes: {ds.get('likes', 'N/A')}")
        if ds.get('tags'):
            lines.append(f"- Tags: {', '.join(ds['tags'][:5])}")
        lines.append("")

    return "\n".join(lines)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def format_dataset_info(info: Dict[str, Any]) -> str:
    """Render dataset metadata (and an optional card summary) as Markdown."""
    out = [
        f"## Dataset: {info['id']}\n",
        f"- **Author**: {info.get('author', 'N/A')}",
        f"- **Downloads**: {info.get('downloads', 0):,}",
        f"- **Likes**: {info.get('likes', 0)}",
        f"- **License**: {info.get('license', 'N/A')}",
    ]

    tags = info.get('tags')
    if tags:
        out.append(f"- **Tags**: {', '.join(tags[:10])}")

    summary = info.get('card_summary')
    if summary:
        out.append("\n### Dataset Card (Summary)")
        # Keep long cards readable by truncating to 1500 characters.
        out.append(summary[:1500] + "..." if len(summary) > 1500 else summary)

    return "\n".join(out)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def format_schema(schema: Dict[str, Any]) -> str:
    """Render a schema dict as a Markdown table of column names and types."""
    if "error" in schema:
        return f"Error: {schema['error']}"

    header = [
        "## Dataset Schema\n",
        f"**Number of columns**: {schema.get('num_columns', 'N/A')}\n",
        "### Columns\n",
        "| Column | Type |",
        "|--------|------|",
    ]
    rows = [
        f"| `{name}` | {dtype} |"
        for name, dtype in schema.get('features', {}).items()
    ]
    return "\n".join(header + rows)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def format_sample(samples: List[Dict[str, Any]], dataset_id: str) -> str:
    """Render sample rows as numbered, fenced JSON blocks."""
    if not samples:
        return "No samples available."

    if "error" in samples[0]:
        return f"Error loading samples: {samples[0]['error']}"

    parts = [
        f"## Sample from `{dataset_id}`\n",
        f"Showing {len(samples)} row(s):\n",
    ]
    for idx, record in enumerate(samples, 1):
        # Cap each rendered row at 1000 characters to keep output compact.
        body = json.dumps(record, indent=2, default=str, ensure_ascii=False)[:1000]
        parts.extend([f"### Row {idx}", "```json", body, "```\n"])

    return "\n".join(parts)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def format_statistics(stats: Dict[str, Any]) -> str:
    """Format statistics for display.

    Args:
        stats: Dict with optional 'total_rows' (int) and 'column_stats'
            (column name -> {stat name: value}), or an 'error' key
            describing a failure.

    Returns:
        Markdown statistics report, or an error message.
    """
    if "error" in stats:
        return f"Error: {stats['error']}"

    total = stats.get('total_rows')
    # The ',' format spec raises on the 'N/A' fallback string (and on None);
    # only apply it to a real row count.
    total_str = f"{total:,}" if isinstance(total, int) else "N/A"

    lines = ["## Dataset Statistics\n"]
    lines.append(f"**Total rows**: {total_str}\n")

    if stats.get('column_stats'):
        lines.append("### Column Statistics\n")
        for col, col_stats in stats['column_stats'].items():
            lines.append(f"#### `{col}`")
            for key, value in col_stats.items():
                # Round floats for readability; pass everything else through.
                if isinstance(value, float):
                    lines.append(f"- {key}: {value:.2f}")
                else:
                    lines.append(f"- {key}: {value}")
            lines.append("")

    return "\n".join(lines)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def format_quality_report(report: Dict[str, Any]) -> str:
    """Format data quality report for display.

    Args:
        report: Dict with optional 'overall_score' (0-100), 'issues'
            (list of strings) and 'column_quality' (column ->
            {missing_pct, unique_pct, issues}), or an 'error' key.

    Returns:
        Markdown quality report, or an error message.
    """
    if "error" in report:
        return f"Error: {report['error']}"

    lines = ["## Data Quality Report\n"]

    # Overall score with a traffic-light marker. The original ternary
    # compared against three identical empty strings (the emoji characters
    # had been lost in encoding), making the branch dead code.
    if "overall_score" in report:
        score = report['overall_score']
        emoji = "🟢" if score >= 80 else "🟡" if score >= 60 else "🔴"
        lines.append(f"**Overall Quality Score**: {emoji} {score}/100\n")

    # Issues
    if report.get('issues'):
        lines.append("### Issues Found\n")
        for issue in report['issues']:
            lines.append(f"- {issue}")
        lines.append("")

    # Column-level quality
    if report.get('column_quality'):
        lines.append("### Column Quality\n")
        lines.append("| Column | Missing % | Unique % | Issues |")
        lines.append("|--------|-----------|----------|--------|")
        for col, quality in report['column_quality'].items():
            missing = quality.get('missing_pct', 0)
            unique = quality.get('unique_pct', 0)
            issues = quality.get('issues', '-')
            lines.append(f"| `{col}` | {missing:.1f}% | {unique:.1f}% | {issues} |")

    return "\n".join(lines)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def format_comparison(comparison: Dict[str, Any]) -> str:
    """Render a two-dataset comparison as a Markdown table plus column notes."""
    if "error" in comparison:
        return f"Error: {comparison['error']}"

    out = [
        "## Dataset Comparison\n",
        f"Comparing **{comparison['dataset_a']}** vs **{comparison['dataset_b']}**\n",
        "| Aspect | Dataset A | Dataset B |",
        "|--------|-----------|-----------|",
    ]

    for aspect, values in comparison.get('comparison', {}).items():
        out.append(f"| {aspect} | {values.get('a', 'N/A')} | {values.get('b', 'N/A')} |")

    common = comparison.get('common_columns')
    if common:
        out.append(f"\n**Common columns**: {', '.join(common)}")

    only_a = comparison.get('unique_to_a')
    if only_a:
        out.append(f"**Unique to A**: {', '.join(only_a)}")

    only_b = comparison.get('unique_to_b')
    if only_b:
        out.append(f"**Unique to B**: {', '.join(only_b)}")

    return "\n".join(out)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def format_similar_datasets(similar: List[Dict[str, Any]]) -> str:
    """Format similar datasets list for display.

    Args:
        similar: Dataset dicts with 'id' plus optional
            'similarity_score', 'downloads' and 'reason'.

    Returns:
        Markdown list, or "No similar datasets found." for empty input.
    """
    if not similar:
        return "No similar datasets found."

    lines = ["## Similar Datasets\n"]

    for i, ds in enumerate(similar, 1):
        score = ds.get('similarity_score', 0)
        downloads = ds.get('downloads')
        # The ',' format spec raises on the 'N/A' fallback (and on None);
        # only apply it to real numbers.
        downloads_str = f"{downloads:,}" if isinstance(downloads, int) else "N/A"
        lines.append(f"### {i}. {ds['id']} (similarity: {score:.2f})")
        lines.append(f"- Downloads: {downloads_str}")
        if ds.get('reason'):
            lines.append(f"- Why similar: {ds['reason']}")
        lines.append("")

    return "\n".join(lines)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def format_task_suggestions(suggestions: Dict[str, Any]) -> str:
    """Format ML task suggestions for display.

    Args:
        suggestions: Dict with 'dataset_id' and 'tasks' (each task has
            'name' plus optional confidence/reason/target_column/
            feature_columns), or an 'error' key.

    Returns:
        Markdown list of suggested tasks, or an error message.
    """
    if "error" in suggestions:
        return f"Error: {suggestions['error']}"

    lines = [f"## Suggested ML Tasks for `{suggestions.get('dataset_id', 'dataset')}`\n"]

    if suggestions.get('tasks'):
        for i, task in enumerate(suggestions['tasks'], 1):
            confidence = task.get('confidence', 'medium')
            # Traffic-light marker per confidence level. The original ternary
            # compared three identical empty strings (the emoji characters
            # had been lost in encoding), making the branch dead code.
            emoji = "🟢" if confidence == 'high' else "🟡" if confidence == 'medium' else "🔴"
            lines.append(f"### {i}. {task['name']} {emoji}")
            lines.append(f"- **Confidence**: {confidence}")
            lines.append(f"- **Reason**: {task.get('reason', 'Based on dataset structure')}")
            if task.get('target_column'):
                lines.append(f"- **Target column**: `{task['target_column']}`")
            if task.get('feature_columns'):
                lines.append(f"- **Feature columns**: {', '.join(f'`{c}`' for c in task['feature_columns'][:5])}")
            lines.append("")

    return "\n".join(lines)
|
utils/hf_client.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hugging Face API client wrapper for dataset operations."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Optional, List, Dict, Any
|
| 5 |
+
from huggingface_hub import HfApi, list_datasets, DatasetCard
|
| 6 |
+
from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class HFDatasetClient:
    """Client for interacting with Hugging Face datasets.

    Thin wrapper around ``huggingface_hub`` and ``datasets`` that returns
    plain, JSON-serializable dicts suitable for MCP tool output. All
    network failures in the helper methods are converted to fallback
    values or ``{"error": ...}`` payloads rather than raised.
    """

    def __init__(self, token: Optional[str] = None):
        # Explicit token wins; otherwise fall back to the HF_TOKEN env var
        # (loaded from .env via load_dotenv at module import). A token is
        # optional but raises API rate limits.
        self.token = token or os.getenv("HF_TOKEN")
        self.api = HfApi(token=self.token)

    def search_datasets(
        self,
        query: str,
        limit: int = 10,
        filter_task: Optional[str] = None,
        sort: str = "downloads"
    ) -> List[Dict[str, Any]]:
        """Search for datasets on Hugging Face Hub.

        Args:
            query: Free-text search string passed to the Hub.
            limit: Maximum number of results to fetch.
            filter_task: Optional task-category filter (Hub taxonomy).
            sort: Hub sort key, e.g. "downloads" or "likes".

        Returns:
            List of dicts with id/downloads/likes/tags/created_at.
        """
        datasets = list(list_datasets(
            search=query,
            limit=limit,
            sort=sort,
            task_categories=filter_task if filter_task else None
        ))

        return [
            {
                "id": ds.id,
                "downloads": ds.downloads,
                "likes": ds.likes,
                # Keep only the first few tags to keep payloads small.
                "tags": ds.tags[:5] if ds.tags else [],
                # datetimes are stringified so results stay JSON-serializable.
                "created_at": str(ds.created_at) if ds.created_at else None,
            }
            for ds in datasets
        ]

    def get_dataset_info(self, dataset_id: str) -> Dict[str, Any]:
        """Get detailed information about a dataset.

        Returns Hub metadata plus a truncated dataset-card summary when a
        card can be loaded.
        """
        info = self.api.dataset_info(dataset_id)

        # Try to get the dataset card; it is optional, so any failure
        # (missing card, network error) is deliberately ignored.
        card_content = None
        try:
            card = DatasetCard.load(dataset_id)
            card_content = card.text[:2000] if card.text else None  # Limit card size
        except Exception:
            pass

        return {
            "id": info.id,
            "author": info.author,
            "downloads": info.downloads,
            "likes": info.likes,
            "tags": info.tags,
            # Not every hub_hub version exposes .license; default to None.
            "license": getattr(info, 'license', None),
            "created_at": str(info.created_at) if info.created_at else None,
            "last_modified": str(info.last_modified) if info.last_modified else None,
            "card_summary": card_content,
        }

    def get_configs_and_splits(self, dataset_id: str) -> Dict[str, List[str]]:
        """Get available configs and splits for a dataset.

        Falls back to ["default"] / ["train"] when metadata cannot be
        resolved, so callers always get a usable mapping.

        NOTE(review): trust_remote_code=True runs Hub-provided loading
        scripts; acceptable only in a sandboxed Space.
        """
        try:
            configs = get_dataset_config_names(dataset_id, trust_remote_code=True)
        except Exception:
            configs = ["default"]

        result = {}
        for config in configs[:3]:  # Limit to first 3 configs
            try:
                splits = get_dataset_split_names(dataset_id, config, trust_remote_code=True)
                result[config] = splits
            except Exception:
                result[config] = ["train"]

        return result

    def load_sample(
        self,
        dataset_id: str,
        config: Optional[str] = None,
        split: str = "train",
        n_rows: int = 5,
        streaming: bool = True
    ) -> List[Dict[str, Any]]:
        """Load a sample of rows from a dataset.

        Streaming (the default) avoids downloading the full dataset. On
        any failure a single-element list ``[{"error": message}]`` is
        returned instead of raising.
        """
        try:
            ds = load_dataset(
                dataset_id,
                config,
                split=split,
                streaming=streaming,
                trust_remote_code=True
            )

            if streaming:
                samples = []
                for i, row in enumerate(ds):
                    if i >= n_rows:
                        break
                    # Convert row to serializable format
                    samples.append(self._serialize_row(row))
                return samples
            else:
                return [self._serialize_row(row) for row in ds.select(range(min(n_rows, len(ds))))]
        except Exception as e:
            return [{"error": str(e)}]

    def get_schema(self, dataset_id: str, config: Optional[str] = None, split: str = "train") -> Dict[str, Any]:
        """Get the schema/features of a dataset.

        Returns {"columns", "features", "num_columns"} on success or
        {"error": message} on failure.
        """
        try:
            ds = load_dataset(
                dataset_id,
                config,
                split=split,
                streaming=True,
                trust_remote_code=True
            )

            features = ds.features
            schema = {}
            for name, feature in features.items():
                # Feature objects are not JSON-serializable; keep their
                # string representation (e.g. "Value(dtype='string')").
                schema[name] = str(feature)

            return {
                "columns": list(features.keys()),
                "features": schema,
                "num_columns": len(features)
            }
        except Exception as e:
            return {"error": str(e)}

    def _serialize_row(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Convert a row to JSON-serializable format.

        Values that cannot be serialized directly are replaced with short
        placeholder strings describing their type/size.
        """
        result = {}
        for key, value in row.items():
            if hasattr(value, 'tolist'):  # numpy array
                result[key] = value.tolist()
            elif hasattr(value, '__dict__'):  # PIL Image or similar
                result[key] = f"<{type(value).__name__}>"
            elif isinstance(value, bytes):
                result[key] = f"<bytes: {len(value)} bytes>"
            else:
                result[key] = value
        return result
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# Singleton instance shared by all MCP tools so the HF token and HfApi
# are initialized once per process.
_client = None


def get_client() -> HFDatasetClient:
    """Get or create the HF client singleton.

    Lazily constructs HFDatasetClient on first call; later calls return
    the same instance. Not thread-safe, which is fine for the single
    Gradio worker this Space runs in.
    """
    global _client
    if _client is None:
        _client = HFDatasetClient()
    return _client
|