|
|
import gradio as gr |
|
|
from datasets import load_dataset |
|
|
import json |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
# Load the flattened taxonomy records from the Hugging Face Hub.
dataset = load_dataset("danielrosehill/multimodal-ai-taxonomy", split="train")

# Canonical top-level group key for each raw ``output_modality`` value.
# "3d" and "3d-model" records collapse into the single "3d_generation"
# group; any unknown modality falls back to "<modality>_generation".
# Hoisted out of the record loop — it never changes per record.
_MODALITY_KEY_MAP = {
    "video": "video_generation",
    "audio": "audio_generation",
    "image": "image_generation",
    "text": "text_generation",
    "3d": "3d_generation",
    "3d-model": "3d_generation",
}

# Nested index over the dataset, consumed by the page-building helpers below:
#   taxonomy_data[modality_key][operation_type] -> {
#       "description", "outputModality", "operationType", "modalities": [...]
#   }
taxonomy_data = {}

for record in dataset:
    output_modality = record['output_modality']
    operation_type = record['operation_type']
    modality_key = _MODALITY_KEY_MAP.get(output_modality, f"{output_modality}_generation")

    # Create the (modality, operation) bucket on first sight.
    bucket = taxonomy_data.setdefault(modality_key, {}).setdefault(operation_type, {
        "description": f"{output_modality.title()} {operation_type} modalities",
        "outputModality": output_modality,
        "operationType": operation_type,
        "modalities": [],
    })

    # Re-nest the flat dataset row into the structure the UI helpers expect.
    modality_obj = {
        "id": record['id'],
        "name": record['name'],
        "input": {
            "primary": record['input_primary'],
            "secondary": record['input_secondary'],
        },
        "output": {
            "primary": record['output_primary'],
            "audio": record['output_audio'],
        },
        # characteristics / relationships are stored as JSON strings in the
        # dataset; an empty / None value means "no data".
        "characteristics": json.loads(record['characteristics']) if record['characteristics'] else {},
        "metadata": {
            "maturityLevel": record['metadata_maturity_level'],
            "commonUseCases": record['metadata_common_use_cases'],
            "platforms": record['metadata_platforms'],
            "exampleModels": record['metadata_example_models'],
        },
        "relationships": json.loads(record['relationships']) if record['relationships'] else {},
    }

    # Audio sub-type is only meaningful when the output actually has audio.
    if record['output_audio'] and record.get('output_audio_type'):
        modality_obj["output"]["audioType"] = record['output_audio_type']

    bucket["modalities"].append(modality_obj)
|
|
|
|
|
|
|
|
# Display metadata (human-readable label + accent colour) for each
# top-level taxonomy group.
MODALITY_INFO = {
    key: {"name": label, "color": color}
    for key, label, color in (
        ("video_generation", "Video Generation", "#FF6B6B"),
        ("audio_generation", "Audio Generation", "#4ECDC4"),
        ("image_generation", "Image Generation", "#95E1D3"),
        ("text_generation", "Text Generation", "#F38181"),
        ("3d_generation", "3D Generation", "#AA96DA"),
    )
}
|
|
|
|
|
|
|
|
# Custom CSS injected into the Gradio Blocks app: card layout for individual
# modalities, coloured maturity badges (mature/emerging/experimental),
# gradient index cards, and the overview stat boxes.
custom_css = """
.modality-card {
    border: 2px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    margin: 10px 0;
    background: white;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.modality-header {
    font-size: 1.5em;
    font-weight: bold;
    margin-bottom: 10px;
    color: #333;
}
.modality-meta {
    background: #f5f5f5;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
}
.badge {
    display: inline-block;
    padding: 4px 12px;
    border-radius: 12px;
    margin: 2px;
    font-size: 0.85em;
    font-weight: 500;
}
.badge-mature { background: #4CAF50; color: white; }
.badge-emerging { background: #FF9800; color: white; }
.badge-experimental { background: #9C27B0; color: white; }
.index-card {
    border: 2px solid #ddd;
    border-radius: 15px;
    padding: 30px;
    margin: 15px;
    text-align: center;
    cursor: pointer;
    transition: all 0.3s;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
}
.index-card:hover {
    transform: translateY(-5px);
    box-shadow: 0 10px 20px rgba(0,0,0,0.2);
}
.stat-box {
    background: #f8f9fa;
    border-radius: 10px;
    padding: 15px;
    margin: 10px;
    text-align: center;
}
"""
|
|
|
|
|
def create_modality_card(modality_obj):
    """Render one taxonomy entry as a self-contained HTML card.

    Args:
        modality_obj: re-nested record dict with 'name', 'input', 'output',
            'characteristics' and 'metadata' keys (as built from the dataset).

    Returns:
        An HTML fragment (str) suitable for rendering with ``gr.HTML``.
    """
    maturity = modality_obj['metadata']['maturityLevel']
    # The maturity value doubles as the CSS badge modifier (badge-mature, …).
    badge_class = f"badge badge-{maturity}"

    input_primary = modality_obj['input']['primary']
    input_secondary = modality_obj['input'].get('secondary', [])
    output_primary = modality_obj['output']['primary']

    # NOTE: this fragment is rendered via gr.HTML, which does NOT interpret
    # Markdown — emphasis must use <strong> tags, not **...** (the previous
    # Markdown syntax showed up as literal asterisks in the UI).
    input_str = f"<strong>Primary:</strong> {input_primary}"
    if input_secondary:
        input_str += f"<br><strong>Secondary:</strong> {', '.join(input_secondary)}"

    # Audio details only apply when the output carries an audio track.
    audio_info = ""
    if modality_obj['output'].get('audio'):
        audio_type = modality_obj['output'].get('audioType', 'N/A')
        audio_info = f"<br><strong>Audio:</strong> {audio_type}"

    chars = modality_obj.get('characteristics', {})
    char_items = [f"<strong>{k}:</strong> {v}" for k, v in chars.items()]
    char_str = "<br>".join(char_items) if char_items else "N/A"

    use_cases = modality_obj['metadata'].get('commonUseCases', [])
    use_case_str = "<br>• " + "<br>• ".join(use_cases) if use_cases else "N/A"

    platforms = modality_obj['metadata'].get('platforms', [])
    platform_str = ", ".join(platforms) if platforms else "N/A"

    models = modality_obj['metadata'].get('exampleModels', [])
    model_str = ", ".join(models) if models else "N/A"

    html = f"""
    <div class="modality-card">
        <div class="modality-header">
            {modality_obj['name']}
            <span class="{badge_class}">{maturity}</span>
        </div>

        <div class="modality-meta">
            <p><strong>Input</strong><br>{input_str}</p>
            <p><strong>Output</strong><br><strong>Primary:</strong> {output_primary}{audio_info}</p>
        </div>

        <details>
            <summary><strong>Characteristics</strong></summary>
            <div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
                {char_str}
            </div>
        </details>

        <details>
            <summary><strong>Common Use Cases</strong></summary>
            <div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
                {use_case_str}
            </div>
        </details>

        <details>
            <summary><strong>Platforms & Models</strong></summary>
            <div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
                <p><strong>Platforms:</strong> {platform_str}</p>
                <p><strong>Example Models:</strong> {model_str}</p>
            </div>
        </details>
    </div>
    """
    return html
|
|
|
|
|
def create_overview_page():
    """Build the landing-page HTML: intro text, grand total, per-group stats."""
    # One stat box per top-level taxonomy group; collect them and join once
    # instead of repeated string concatenation.
    boxes = []
    grand_total = 0

    for key, ops in taxonomy_data.items():
        meta = MODALITY_INFO.get(key, {"name": key, "color": "#666"})

        n_creation = len(ops.get('creation', {}).get('modalities', []))
        n_editing = len(ops.get('editing', {}).get('modalities', []))
        subtotal = n_creation + n_editing
        grand_total += subtotal

        boxes.append(f"""
        <div class="stat-box" style="border-left: 4px solid {meta['color']};">
            <div style="font-size: 1.2em; font-weight: bold; margin: 10px 0;">{meta['name']}</div>
            <div style="font-size: 0.9em; color: #666;">
                Creation: {n_creation} | Editing: {n_editing}
            </div>
            <div style="font-size: 1.5em; font-weight: bold; color: {meta['color']}; margin-top: 10px;">
                {subtotal} modalities
            </div>
        </div>
        """)

    stats_html = (
        "<div style='display: flex; flex-wrap: wrap; justify-content: space-around;'>"
        + "".join(boxes)
        + "</div>"
    )

    return f"""
    <div style="text-align: center; padding: 30px;">
        <h1>Multimodal AI Taxonomy</h1>
        <p style="font-size: 1.2em; color: #666; max-width: 800px; margin: 20px auto;">
            An attempt to define a structured taxonomy for multimodal generative AI capabilities, organized by output modality and operation type.
        </p>
        <p style="font-size: 1em; color: #666; max-width: 800px; margin: 20px auto;">
            Dataset repository: <a href="https://huggingface.co/datasets/danielrosehill/multimodal-ai-taxonomy" target="_blank">danielrosehill/multimodal-ai-taxonomy</a>
        </p>
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px auto; max-width: 300px;">
            <div style="font-size: 3em; font-weight: bold;">{grand_total}</div>
            <div style="font-size: 1.2em;">Total Modalities Defined</div>
        </div>
    </div>

    {stats_html}

    <div style="margin: 30px; padding: 20px; background: #f0f7ff; border-radius: 10px; border-left: 4px solid #2196F3;">
        <h3>How to Use This Space</h3>
        <p>Navigate through the tabs above to explore different output modalities (Video, Audio, Image, Text, 3D).</p>
        <p>Each modality is organized into <strong>Creation</strong> (generating new content) and <strong>Editing</strong> (modifying existing content) operations.</p>
        <p>Click on the details sections to expand and see characteristics, use cases, platforms, and example models.</p>
    </div>
    """
|
|
|
|
|
def create_modality_page(modality_key, operation_type):
    """Render the HTML page for one (modality group, operation type) pair."""
    # Guard clauses: fall back to a short message when the requested slice
    # of the taxonomy does not exist.
    group = taxonomy_data.get(modality_key)
    if group is None:
        return f"<p>No data found for {modality_key}</p>"

    section = group.get(operation_type)
    if section is None:
        return f"<p>No {operation_type} data found for {modality_key}</p>"

    entries = section.get('modalities', [])
    display = MODALITY_INFO.get(modality_key, {"name": modality_key, "color": "#666"})

    # Header banner tinted with the group's accent colour ("22"/"44" are
    # hex alpha suffixes for the gradient stops).
    header = f"""
    <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, {display['color']}22 0%, {display['color']}44 100%); border-radius: 15px; margin-bottom: 20px;">
        <h2>{display['name']} - {operation_type.title()}</h2>
        <p style="color: #666;">{section.get('description', '')}</p>
        <div style="font-size: 1.5em; font-weight: bold; color: {display['color']}; margin-top: 10px;">
            {len(entries)} modalities defined
        </div>
    </div>
    """

    # One card per taxonomy entry, appended after the banner.
    return header + "".join(create_modality_card(entry) for entry in entries)
|
|
|
|
|
def create_comparison_table(modality_key):
    """Build a creation-vs-editing comparison DataFrame for one modality group."""
    group = taxonomy_data.get(modality_key)
    if group is None:
        # Unknown group: an empty frame renders as an empty table.
        return pd.DataFrame()

    records = []
    # Fixed order: creation rows first, then editing rows.
    for op in ('creation', 'editing'):
        section = group.get(op)
        if section is None:
            continue
        for entry in section.get('modalities', []):
            meta = entry['metadata']
            records.append({
                'Operation': op.title(),
                'Name': entry['name'],
                'Primary Input': entry['input']['primary'],
                'Primary Output': entry['output']['primary'],
                'Maturity': meta['maturityLevel'],
                # Platform count, not the list itself, to keep the table compact.
                'Platforms': len(meta.get('platforms', [])),
            })

    return pd.DataFrame(records)
|
|
|
|
|
|
|
|
# (tab label, taxonomy_data key) pairs driving the per-modality tabs.
# Each modality gets the identical Creation / Editing / Comparison layout,
# so the five previously copy-pasted tab sub-trees are built in one loop.
_MODALITY_TABS = [
    ("Video", "video_generation"),
    ("Audio", "audio_generation"),
    ("Image", "image_generation"),
    ("Text", "text_generation"),
    ("3D", "3d_generation"),
]

with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:

    gr.Markdown("# Multimodal AI Taxonomy Explorer")

    with gr.Tabs():

        with gr.Tab("Overview"):
            gr.HTML(create_overview_page())

        # One tab per output modality: static HTML card pages for Creation
        # and Editing, plus a tabular Creation-vs-Editing comparison.
        for tab_label, modality_key in _MODALITY_TABS:
            with gr.Tab(tab_label):
                with gr.Tabs():
                    with gr.Tab("Creation"):
                        gr.HTML(create_modality_page(modality_key, "creation"))
                    with gr.Tab("Editing"):
                        gr.HTML(create_modality_page(modality_key, "editing"))
                    with gr.Tab("Comparison"):
                        gr.Dataframe(create_comparison_table(modality_key), wrap=True)

        with gr.Tab("About"):
            gr.Markdown("""
        ## About This Taxonomy

        This is an attempt to define a structured taxonomy for multimodal AI capabilities, organized by:

        - **Output Modality**: The primary type of content being generated (video, audio, image, text, 3D)
        - **Operation Type**: Whether the task involves creation (from scratch) or editing (modifying existing content)

        ### Key Features

        - **Structured Metadata**: Each modality includes input/output specs, characteristics, maturity level, use cases, platforms, and example models
        - **Fine-grained Classification**: Goes beyond simple input/output categorization to capture nuanced differences

        ### Data Schema

        Each modality entry includes:
        - Unique identifier and human-readable name
        - Input specifications (primary and secondary modalities)
        - Output specifications (with audio metadata for video outputs)
        - Characteristics (process type, audio handling, motion type, etc.)
        - Metadata (maturity level, use cases, platforms, example models)

        ### Dataset

        This visualization is powered by the [multimodal-ai-taxonomy](https://huggingface.co/datasets/danielrosehill/multimodal-ai-taxonomy) dataset on Hugging Face.

        ### Maturity Levels

        - **Mature**: Well-established, widely available, production-ready
        - **Emerging**: Growing adoption, increasingly stable
        - **Experimental**: Cutting-edge, limited availability, proof-of-concept
        """)

if __name__ == "__main__":
    demo.launch()
|
|
|