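"""Gradio Space for exploring the multimodal-ai-taxonomy dataset.

Loads the flat dataset rows, reconstructs the nested taxonomy structure,
and renders it as a tabbed HTML explorer organized by output modality
and operation type (creation vs. editing).
"""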
import json

import gradio as gr
import pandas as pd
from datasets import load_dataset
# Load the dataset
dataset = load_dataset("danielrosehill/multimodal-ai-taxonomy", split="train")
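# Each record is a flat row; the fields read below mirror the nested JSON
# structure ('characteristics' and 'relationships' are stored as JSON strings):
#   id, name, output_modality, operation_type,
#   input_primary, input_secondary,
#   output_primary, output_audio, output_audio_type,
#   characteristics, relationships,
#   metadata_maturity_level, metadata_common_use_cases,
#   metadata_platforms, metadata_example_models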
# Extract taxonomy data and reconstruct the nested structure
taxonomy_data = {}

# Map output_modality values to the keys used in MODALITY_INFO
modality_key_map = {
    "video": "video_generation",
    "audio": "audio_generation",
    "image": "image_generation",
    "text": "text_generation",
    "3d": "3d_generation",
    "3d-model": "3d_generation",
}

for record in dataset:
    # Get modality info
    output_modality = record['output_modality']
    operation_type = record['operation_type']
    modality_key = modality_key_map.get(output_modality, f"{output_modality}_generation")

    # Initialize nested structure
    if modality_key not in taxonomy_data:
        taxonomy_data[modality_key] = {}
    if operation_type not in taxonomy_data[modality_key]:
        taxonomy_data[modality_key][operation_type] = {
            "description": f"{output_modality.title()} {operation_type} modalities",
            "outputModality": output_modality,
            "operationType": operation_type,
            "modalities": []
        }

    # Reconstruct the nested modality object
    modality_obj = {
        "id": record['id'],
        "name": record['name'],
        "input": {
            "primary": record['input_primary'],
            "secondary": record['input_secondary']
        },
        "output": {
            "primary": record['output_primary'],
            "audio": record['output_audio']
        },
        "characteristics": json.loads(record['characteristics']) if record['characteristics'] else {},
        "metadata": {
            "maturityLevel": record['metadata_maturity_level'],
            "commonUseCases": record['metadata_common_use_cases'],
            "platforms": record['metadata_platforms'],
            "exampleModels": record['metadata_example_models']
        },
        "relationships": json.loads(record['relationships']) if record['relationships'] else {}
    }

    # Add audio type if present
    if record['output_audio'] and record.get('output_audio_type'):
        modality_obj["output"]["audioType"] = record['output_audio_type']

    # Add to taxonomy data
    taxonomy_data[modality_key][operation_type]["modalities"].append(modality_obj)
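# After the loop, taxonomy_data has the shape:
#   {modality_key: {operation_type: {"description": ..., "outputModality": ...,
#                                    "operationType": ..., "modalities": [...]}}}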
# Define modality display names and accent colors
MODALITY_INFO = {
    "video_generation": {"name": "Video Generation", "color": "#FF6B6B"},
    "audio_generation": {"name": "Audio Generation", "color": "#4ECDC4"},
    "image_generation": {"name": "Image Generation", "color": "#95E1D3"},
    "text_generation": {"name": "Text Generation", "color": "#F38181"},
    "3d_generation": {"name": "3D Generation", "color": "#AA96DA"},
}
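# Note: these keys must stay in sync with the values of modality_key_map above;
# unknown modalities fall back to f"{output_modality}_generation" and receive a
# default name/color in the page builders below.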
# CSS for styling
custom_css = """
.modality-card {
    border: 2px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    margin: 10px 0;
    background: white;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.modality-header {
    font-size: 1.5em;
    font-weight: bold;
    margin-bottom: 10px;
    color: #333;
}
.modality-meta {
    background: #f5f5f5;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
}
.badge {
    display: inline-block;
    padding: 4px 12px;
    border-radius: 12px;
    margin: 2px;
    font-size: 0.85em;
    font-weight: 500;
}
.badge-mature { background: #4CAF50; color: white; }
.badge-emerging { background: #FF9800; color: white; }
.badge-experimental { background: #9C27B0; color: white; }
.index-card {
    border: 2px solid #ddd;
    border-radius: 15px;
    padding: 30px;
    margin: 15px;
    text-align: center;
    cursor: pointer;
    transition: all 0.3s;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
}
.index-card:hover {
    transform: translateY(-5px);
    box-shadow: 0 10px 20px rgba(0,0,0,0.2);
}
.stat-box {
    background: #f8f9fa;
    border-radius: 10px;
    padding: 15px;
    margin: 10px;
    text-align: center;
}
"""
def create_modality_card(modality_obj):
    """Create an HTML card for a single modality"""
    # Maturity badge
    maturity = modality_obj['metadata']['maturityLevel']
    badge_class = f"badge badge-{maturity}"

    # Input/Output info
    input_primary = modality_obj['input']['primary']
    input_secondary = modality_obj['input'].get('secondary', [])
    output_primary = modality_obj['output']['primary']

    # Build input string (use HTML tags: markdown is not rendered inside gr.HTML)
    input_str = f"<strong>Primary:</strong> {input_primary}"
    if input_secondary:
        input_str += f"<br><strong>Secondary:</strong> {', '.join(input_secondary)}"

    # Audio info for output
    audio_info = ""
    if modality_obj['output'].get('audio'):
        audio_type = modality_obj['output'].get('audioType', 'N/A')
        audio_info = f"<br><strong>Audio:</strong> {audio_type}"

    # Characteristics
    chars = modality_obj.get('characteristics', {})
    char_items = [f"<strong>{k}:</strong> {v}" for k, v in chars.items()]
    char_str = "<br>".join(char_items) if char_items else "N/A"

    # Use cases
    use_cases = modality_obj['metadata'].get('commonUseCases', [])
    use_case_str = "<br>• " + "<br>• ".join(use_cases) if use_cases else "N/A"

    # Platforms
    platforms = modality_obj['metadata'].get('platforms', [])
    platform_str = ", ".join(platforms) if platforms else "N/A"

    # Example models
    models = modality_obj['metadata'].get('exampleModels', [])
    model_str = ", ".join(models) if models else "N/A"

    html = f"""
    <div class="modality-card">
        <div class="modality-header">
            {modality_obj['name']}
            <span class="{badge_class}">{maturity}</span>
        </div>
        <div class="modality-meta">
            <p><strong>Input</strong><br>{input_str}</p>
            <p><strong>Output</strong><br><strong>Primary:</strong> {output_primary}{audio_info}</p>
        </div>
        <details>
            <summary><strong>Characteristics</strong></summary>
            <div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
                {char_str}
            </div>
        </details>
        <details>
            <summary><strong>Common Use Cases</strong></summary>
            <div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
                {use_case_str}
            </div>
        </details>
        <details>
            <summary><strong>Platforms & Models</strong></summary>
            <div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
                <p><strong>Platforms:</strong> {platform_str}</p>
                <p><strong>Example Models:</strong> {model_str}</p>
            </div>
        </details>
    </div>
    """
    return html
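# Cards built here are embedded via gr.HTML() in the per-modality pages
# assembled by create_modality_page() below.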
def create_overview_page():
    """Create the main overview/index page"""
    stats_html = "<div style='display: flex; flex-wrap: wrap; justify-content: space-around;'>"
    total_modalities = 0

    for modality_key, operations in taxonomy_data.items():
        info = MODALITY_INFO.get(modality_key, {"name": modality_key, "color": "#666"})
        creation_count = len(operations.get('creation', {}).get('modalities', []))
        editing_count = len(operations.get('editing', {}).get('modalities', []))
        total_count = creation_count + editing_count
        total_modalities += total_count

        stats_html += f"""
        <div class="stat-box" style="border-left: 4px solid {info['color']};">
            <div style="font-size: 1.2em; font-weight: bold; margin: 10px 0;">{info['name']}</div>
            <div style="font-size: 0.9em; color: #666;">
                Creation: {creation_count} | Editing: {editing_count}
            </div>
            <div style="font-size: 1.5em; font-weight: bold; color: {info['color']}; margin-top: 10px;">
                {total_count} modalities
            </div>
        </div>
        """

    stats_html += "</div>"

    overview_html = f"""
    <div style="text-align: center; padding: 30px;">
        <h1>Multimodal AI Taxonomy</h1>
        <p style="font-size: 1.2em; color: #666; max-width: 800px; margin: 20px auto;">
            An attempt to define a structured taxonomy for multimodal generative AI capabilities, organized by output modality and operation type.
        </p>
        <p style="font-size: 1em; color: #666; max-width: 800px; margin: 20px auto;">
            Dataset repository: <a href="https://huggingface.co/datasets/danielrosehill/multimodal-ai-taxonomy" target="_blank">danielrosehill/multimodal-ai-taxonomy</a>
        </p>
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px auto; max-width: 300px;">
            <div style="font-size: 3em; font-weight: bold;">{total_modalities}</div>
            <div style="font-size: 1.2em;">Total Modalities Defined</div>
        </div>
    </div>
    {stats_html}
    <div style="margin: 30px; padding: 20px; background: #f0f7ff; border-radius: 10px; border-left: 4px solid #2196F3;">
        <h3>How to Use This Space</h3>
        <p>Navigate through the tabs above to explore different output modalities (Video, Audio, Image, Text, 3D).</p>
        <p>Each modality is organized into <strong>Creation</strong> (generating new content) and <strong>Editing</strong> (modifying existing content) operations.</p>
        <p>Click on the details sections to expand and see characteristics, use cases, platforms, and example models.</p>
    </div>
    """
    return overview_html
def create_modality_page(modality_key, operation_type):
    """Create a page for a specific modality and operation type"""
    if modality_key not in taxonomy_data:
        return f"<p>No data found for {modality_key}</p>"
    if operation_type not in taxonomy_data[modality_key]:
        return f"<p>No {operation_type} data found for {modality_key}</p>"

    data = taxonomy_data[modality_key][operation_type]
    modalities = data.get('modalities', [])
    info = MODALITY_INFO.get(modality_key, {"name": modality_key, "color": "#666"})

    html = f"""
    <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, {info['color']}22 0%, {info['color']}44 100%); border-radius: 15px; margin-bottom: 20px;">
        <h2>{info['name']} - {operation_type.title()}</h2>
        <p style="color: #666;">{data.get('description', '')}</p>
        <div style="font-size: 1.5em; font-weight: bold; color: {info['color']}; margin-top: 10px;">
            {len(modalities)} modalities defined
        </div>
    </div>
    """
    for modality in modalities:
        html += create_modality_card(modality)
    return html
def create_comparison_table(modality_key):
    """Create a comparison table for creation vs. editing"""
    if modality_key not in taxonomy_data:
        return pd.DataFrame()

    rows = []
    for operation_type in ['creation', 'editing']:
        if operation_type not in taxonomy_data[modality_key]:
            continue
        modalities = taxonomy_data[modality_key][operation_type].get('modalities', [])
        for mod in modalities:
            rows.append({
                'Operation': operation_type.title(),
                'Name': mod['name'],
                'Primary Input': mod['input']['primary'],
                'Primary Output': mod['output']['primary'],
                'Maturity': mod['metadata']['maturityLevel'],
                'Platforms': len(mod['metadata'].get('platforms', [])),
            })
    return pd.DataFrame(rows)
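# Example: create_comparison_table("video_generation") yields one row per
# modality with columns Operation, Name, Primary Input, Primary Output,
# Maturity, and Platforms (a platform count).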
# Create the Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Multimodal AI Taxonomy Explorer")

    with gr.Tabs():
        # Overview tab
        with gr.Tab("Overview"):
            gr.HTML(create_overview_page())

        # One tab per output modality, each with Creation / Editing / Comparison sub-tabs
        for tab_label, modality_key in [
            ("Video", "video_generation"),
            ("Audio", "audio_generation"),
            ("Image", "image_generation"),
            ("Text", "text_generation"),
            ("3D", "3d_generation"),
        ]:
            with gr.Tab(tab_label):
                with gr.Tabs():
                    with gr.Tab("Creation"):
                        gr.HTML(create_modality_page(modality_key, "creation"))
                    with gr.Tab("Editing"):
                        gr.HTML(create_modality_page(modality_key, "editing"))
                    with gr.Tab("Comparison"):
                        gr.Dataframe(create_comparison_table(modality_key), wrap=True)

        # About tab
        with gr.Tab("About"):
            gr.Markdown("""
            ## About This Taxonomy

            This is an attempt to define a structured taxonomy for multimodal AI capabilities, organized by:

            - **Output Modality**: The primary type of content being generated (video, audio, image, text, 3D)
            - **Operation Type**: Whether the task involves creation (from scratch) or editing (modifying existing content)

            ### Key Features

            - **Structured Metadata**: Each modality includes input/output specs, characteristics, maturity level, use cases, platforms, and example models
            - **Fine-grained Classification**: Goes beyond simple input/output categorization to capture nuanced differences

            ### Data Schema

            Each modality entry includes:

            - A unique identifier and human-readable name
            - Input specifications (primary and secondary modalities)
            - Output specifications (with audio metadata for video outputs)
            - Characteristics (process type, audio handling, motion type, etc.)
            - Metadata (maturity level, use cases, platforms, example models)

            ### Dataset

            This visualization is powered by the [multimodal-ai-taxonomy](https://huggingface.co/datasets/danielrosehill/multimodal-ai-taxonomy) dataset on Hugging Face.

            ### Maturity Levels

            - **Mature**: Well-established, widely available, production-ready
            - **Emerging**: Growing adoption, increasingly stable
            - **Experimental**: Cutting-edge, limited availability, proof-of-concept
            """)
if __name__ == "__main__":
    demo.launch()