File size: 15,880 Bytes
62cd7ca 80cfd1e 62cd7ca 80cfd1e 62cd7ca 80cfd1e 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca 597e3a5 62cd7ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 |
import gradio as gr
from datasets import load_dataset
import json
import pandas as pd
# Load the dataset (one flattened record per modality entry)
dataset = load_dataset("danielrosehill/multimodal-ai-taxonomy", split="train")

# Map output_modality to the keys used in MODALITY_INFO.
# Built once, outside the loop: it is the same for every record.
modality_key_map = {
    "video": "video_generation",
    "audio": "audio_generation",
    "image": "image_generation",
    "text": "text_generation",
    "3d": "3d_generation",
    "3d-model": "3d_generation",
}

# Extract taxonomy data and reconstruct nested structure:
#   taxonomy_data[modality_key][operation_type] -> group dict whose
#   "modalities" list holds one nested modality object per record.
taxonomy_data = {}
for record in dataset:
    # Get modality info
    output_modality = record['output_modality']
    operation_type = record['operation_type']

    # Unknown output modalities fall back to "<modality>_generation".
    modality_key = modality_key_map.get(output_modality, f"{output_modality}_generation")

    # Initialize nested structure on first sight of this (modality, operation) pair
    operations = taxonomy_data.setdefault(modality_key, {})
    if operation_type not in operations:
        operations[operation_type] = {
            "description": f"{output_modality.title()} {operation_type} modalities",
            "outputModality": output_modality,
            "operationType": operation_type,
            "modalities": [],
        }

    # Reconstruct the nested modality object.
    # 'characteristics' and 'relationships' are decoded with json.loads;
    # an empty/None value becomes an empty dict.
    modality_obj = {
        "id": record['id'],
        "name": record['name'],
        "input": {
            "primary": record['input_primary'],
            "secondary": record['input_secondary'],
        },
        "output": {
            "primary": record['output_primary'],
            "audio": record['output_audio'],
        },
        "characteristics": json.loads(record['characteristics']) if record['characteristics'] else {},
        "metadata": {
            "maturityLevel": record['metadata_maturity_level'],
            "commonUseCases": record['metadata_common_use_cases'],
            "platforms": record['metadata_platforms'],
            "exampleModels": record['metadata_example_models'],
        },
        "relationships": json.loads(record['relationships']) if record['relationships'] else {},
    }

    # Add audio type if present (only meaningful when the output carries audio)
    if record['output_audio'] and record.get('output_audio_type'):
        modality_obj["output"]["audioType"] = record['output_audio_type']

    # Add to taxonomy data
    operations[operation_type]["modalities"].append(modality_obj)
# Display metadata for each modality key: the human-readable name shown in
# headings and the accent colour used for that modality's styling.
MODALITY_INFO = {
    key: {"name": display_name, "color": accent}
    for key, display_name, accent in (
        ("video_generation", "Video Generation", "#FF6B6B"),
        ("audio_generation", "Audio Generation", "#4ECDC4"),
        ("image_generation", "Image Generation", "#95E1D3"),
        ("text_generation", "Text Generation", "#F38181"),
        ("3d_generation", "3D Generation", "#AA96DA"),
    )
}
# CSS for styling.
# Passed to gr.Blocks(css=...) so the rules apply to the raw HTML produced by
# the page builders in this file:
#   - .modality-card / .modality-header / .modality-meta and .badge,
#     .badge-<maturity> are emitted by create_modality_card()
#   - .stat-box is emitted by create_overview_page()
#   - .index-card is not referenced by any markup in the code shown here
# The badge modifier class is built as "badge-" + maturityLevel, so only the
# three levels listed below (mature, emerging, experimental) get a colour.
custom_css = """
.modality-card {
border: 2px solid #e0e0e0;
border-radius: 10px;
padding: 20px;
margin: 10px 0;
background: white;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.modality-header {
font-size: 1.5em;
font-weight: bold;
margin-bottom: 10px;
color: #333;
}
.modality-meta {
background: #f5f5f5;
padding: 10px;
border-radius: 5px;
margin: 10px 0;
}
.badge {
display: inline-block;
padding: 4px 12px;
border-radius: 12px;
margin: 2px;
font-size: 0.85em;
font-weight: 500;
}
.badge-mature { background: #4CAF50; color: white; }
.badge-emerging { background: #FF9800; color: white; }
.badge-experimental { background: #9C27B0; color: white; }
.index-card {
border: 2px solid #ddd;
border-radius: 15px;
padding: 30px;
margin: 15px;
text-align: center;
cursor: pointer;
transition: all 0.3s;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
.index-card:hover {
transform: translateY(-5px);
box-shadow: 0 10px 20px rgba(0,0,0,0.2);
}
.stat-box {
background: #f8f9fa;
border-radius: 10px;
padding: 15px;
margin: 10px;
text-align: center;
}
"""
def create_modality_card(modality_obj):
    """Create an HTML card for a single modality.

    Args:
        modality_obj: Nested modality dict with ``name``, ``input``
            (``primary``, optional ``secondary`` list), ``output``
            (``primary``, optional ``audio`` flag and ``audioType``),
            ``metadata`` (``maturityLevel`` plus optional ``commonUseCases``,
            ``platforms``, ``exampleModels`` lists) and optional
            ``characteristics`` mapping.

    Returns:
        An HTML fragment (str) for the card. Labels are emphasised with
        ``<strong>`` because the fragment is rendered as raw HTML, where
        Markdown ``**bold**`` markers would be shown literally. Every value
        taken from ``modality_obj`` is HTML-escaped before interpolation.

    Raises:
        KeyError: if ``name``, ``input.primary``, ``output.primary`` or
            ``metadata.maturityLevel`` is missing.
    """
    # Local import keeps this block self-contained.
    from html import escape

    def _text(value):
        """Return *value* as HTML-escaped text (also safe inside attributes)."""
        return escape(str(value))

    metadata = modality_obj['metadata']
    output = modality_obj['output']

    # Maturity badge: the CSS modifier class is derived from the level name.
    maturity = _text(metadata['maturityLevel'])
    badge_class = f"badge badge-{maturity}"

    # Input/Output info
    name = _text(modality_obj['name'])
    input_primary = _text(modality_obj['input']['primary'])
    input_secondary = modality_obj['input'].get('secondary') or []
    output_primary = _text(output['primary'])

    # Build input string
    input_str = f"<strong>Primary:</strong> {input_primary}"
    if input_secondary:
        secondary = ", ".join(_text(item) for item in input_secondary)
        input_str += f"<br><strong>Secondary:</strong> {secondary}"

    # Audio info for output (only when the output is flagged as carrying audio)
    audio_info = ""
    if output.get('audio'):
        audio_type = _text(output.get('audioType', 'N/A'))
        audio_info = f"<br><strong>Audio:</strong> {audio_type}"

    # Characteristics
    chars = modality_obj.get('characteristics') or {}
    char_items = [f"<strong>{_text(k)}:</strong> {_text(v)}" for k, v in chars.items()]
    char_str = "<br>".join(char_items) if char_items else "N/A"

    # Use cases (bulleted, one per line)
    use_cases = metadata.get('commonUseCases') or []
    use_case_str = "<br>• " + "<br>• ".join(_text(u) for u in use_cases) if use_cases else "N/A"

    # Platforms
    platforms = metadata.get('platforms') or []
    platform_str = ", ".join(_text(p) for p in platforms) if platforms else "N/A"

    # Example models
    models = metadata.get('exampleModels') or []
    model_str = ", ".join(_text(m) for m in models) if models else "N/A"

    # Markup lines are kept flush-left so the emitted HTML carries no extra
    # leading whitespace.
    card_html = f"""
<div class="modality-card">
<div class="modality-header">
{name}
<span class="{badge_class}">{maturity}</span>
</div>
<div class="modality-meta">
<p><strong>Input</strong><br>{input_str}</p>
<p><strong>Output</strong><br><strong>Primary:</strong> {output_primary}{audio_info}</p>
</div>
<details>
<summary><strong>Characteristics</strong></summary>
<div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
{char_str}
</div>
</details>
<details>
<summary><strong>Common Use Cases</strong></summary>
<div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
{use_case_str}
</div>
</details>
<details>
<summary><strong>Platforms & Models</strong></summary>
<div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
<p><strong>Platforms:</strong> {platform_str}</p>
<p><strong>Example Models:</strong> {model_str}</p>
</div>
</details>
</div>
"""
    return card_html
def create_overview_page():
    """Build the HTML for the overview tab.

    Walks ``taxonomy_data`` and, for every modality key, counts the entries
    under its ``creation`` and ``editing`` operations. Emits one stat box per
    modality plus a headline figure that is the sum of those counts.

    Returns:
        The overview page as an HTML string.
    """
    # Markup lines in the templates below are kept flush-left so the emitted
    # HTML is character-for-character what the page has always produced.
    stat_boxes = []
    grand_total = 0
    for key, ops in taxonomy_data.items():
        # Unknown modality keys are shown under their raw key in neutral grey.
        display = MODALITY_INFO.get(key, {"name": key, "color": "#666"})
        n_creation = len(ops.get('creation', {}).get('modalities', []))
        n_editing = len(ops.get('editing', {}).get('modalities', []))
        subtotal = n_creation + n_editing
        grand_total += subtotal
        stat_boxes.append(f"""
<div class="stat-box" style="border-left: 4px solid {display['color']};">
<div style="font-size: 1.2em; font-weight: bold; margin: 10px 0;">{display['name']}</div>
<div style="font-size: 0.9em; color: #666;">
Creation: {n_creation} | Editing: {n_editing}
</div>
<div style="font-size: 1.5em; font-weight: bold; color: {display['color']}; margin-top: 10px;">
{subtotal} modalities
</div>
</div>
""")

    # Flex container wrapping all per-modality stat boxes.
    stats_section = "".join([
        "<div style='display: flex; flex-wrap: wrap; justify-content: space-around;'>",
        *stat_boxes,
        "</div>",
    ])

    return f"""
<div style="text-align: center; padding: 30px;">
<h1>Multimodal AI Taxonomy</h1>
<p style="font-size: 1.2em; color: #666; max-width: 800px; margin: 20px auto;">
An attempt to define a structured taxonomy for multimodal generative AI capabilities, organized by output modality and operation type.
</p>
<p style="font-size: 1em; color: #666; max-width: 800px; margin: 20px auto;">
Dataset repository: <a href="https://huggingface.co/datasets/danielrosehill/multimodal-ai-taxonomy" target="_blank">danielrosehill/multimodal-ai-taxonomy</a>
</p>
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px auto; max-width: 300px;">
<div style="font-size: 3em; font-weight: bold;">{grand_total}</div>
<div style="font-size: 1.2em;">Total Modalities Defined</div>
</div>
</div>
{stats_section}
<div style="margin: 30px; padding: 20px; background: #f0f7ff; border-radius: 10px; border-left: 4px solid #2196F3;">
<h3>How to Use This Space</h3>
<p>Navigate through the tabs above to explore different output modalities (Video, Audio, Image, Text, 3D).</p>
<p>Each modality is organized into <strong>Creation</strong> (generating new content) and <strong>Editing</strong> (modifying existing content) operations.</p>
<p>Click on the details sections to expand and see characteristics, use cases, platforms, and example models.</p>
</div>
"""
def create_modality_page(modality_key, operation_type):
    """Create a page for a specific modality and operation type.

    Args:
        modality_key: Key into ``taxonomy_data`` (e.g. ``"video_generation"``).
        operation_type: Operation group under that key (e.g. ``"creation"``).

    Returns:
        An HTML string: a coloured header with the group description and
        entry count, followed by one card per modality. If either key is
        absent from ``taxonomy_data`` a short "no data" paragraph is returned
        instead.
    """
    if modality_key not in taxonomy_data:
        return f"<p>No data found for {modality_key}</p>"
    operations = taxonomy_data[modality_key]
    if operation_type not in operations:
        return f"<p>No {operation_type} data found for {modality_key}</p>"

    data = operations[operation_type]
    modalities = data.get('modalities', [])

    # The header gradient appends a two-digit alpha ("22"/"44") to the colour,
    # which is only valid on a 6-digit hex value (#RRGGBBAA). The fallback is
    # therefore spelled out as #666666 rather than the 3-digit shorthand,
    # which would have produced the invalid colours "#66622" / "#66644".
    info = MODALITY_INFO.get(modality_key, {"name": modality_key, "color": "#666666"})

    # Markup lines are kept flush-left so the emitted HTML is unchanged.
    header = f"""
<div style="text-align: center; padding: 20px; background: linear-gradient(135deg, {info['color']}22 0%, {info['color']}44 100%); border-radius: 15px; margin-bottom: 20px;">
<h2>{info['name']} - {operation_type.title()}</h2>
<p style="color: #666;">{data.get('description', '')}</p>
<div style="font-size: 1.5em; font-weight: bold; color: {info['color']}; margin-top: 10px;">
{len(modalities)} modalities defined
</div>
</div>
"""
    # Join once instead of growing the string card by card.
    return header + "".join(create_modality_card(modality) for modality in modalities)
def create_comparison_table(modality_key):
    """Create a comparison table for creation vs editing.

    Args:
        modality_key: Key into ``taxonomy_data`` (e.g. ``"image_generation"``).

    Returns:
        A ``pandas.DataFrame`` with one row per modality found under the
        ``creation`` and ``editing`` operations (in that order) and the
        columns Operation, Name, Primary Input, Primary Output, Maturity and
        Platforms (a count). An empty DataFrame is returned when
        ``modality_key`` is not in ``taxonomy_data``.
    """
    if modality_key not in taxonomy_data:
        return pd.DataFrame()

    operations = taxonomy_data[modality_key]
    rows = []
    for operation_type in ('creation', 'editing'):
        if operation_type not in operations:
            continue
        for mod in operations[operation_type].get('modalities', []):
            metadata = mod['metadata']
            rows.append({
                'Operation': operation_type.title(),
                'Name': mod['name'],
                'Primary Input': mod['input']['primary'],
                'Primary Output': mod['output']['primary'],
                'Maturity': metadata['maturityLevel'],
                # The 'platforms' key can be present with a None value, so a
                # .get() default alone is not enough; treat None as "no
                # platforms" instead of letting len(None) raise TypeError.
                'Platforms': len(metadata.get('platforms') or []),
            })
    return pd.DataFrame(rows)
# Create the Gradio interface

# (tab label, taxonomy key) for each output-modality tab, in display order.
# Every entry gets the same Creation / Editing / Comparison sub-tabs, so the
# layout is generated from this table instead of being repeated by hand.
MODALITY_TABS = (
    ("Video", "video_generation"),
    ("Audio", "audio_generation"),
    ("Image", "image_generation"),
    ("Text", "text_generation"),
    ("3D", "3d_generation"),
)

with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Multimodal AI Taxonomy Explorer")
    with gr.Tabs():
        # Overview tab
        with gr.Tab("Overview"):
            gr.HTML(create_overview_page())

        # One tab per output modality
        for tab_label, tab_key in MODALITY_TABS:
            with gr.Tab(tab_label):
                with gr.Tabs():
                    with gr.Tab("Creation"):
                        gr.HTML(create_modality_page(tab_key, "creation"))
                    with gr.Tab("Editing"):
                        gr.HTML(create_modality_page(tab_key, "editing"))
                    with gr.Tab("Comparison"):
                        gr.Dataframe(create_comparison_table(tab_key), wrap=True)

        # About tab (Markdown text kept flush-left so the rendered source is unchanged)
        with gr.Tab("About"):
            gr.Markdown("""
## About This Taxonomy
This is an attempt to define a structured taxonomy for multimodal AI capabilities, organized by:
- **Output Modality**: The primary type of content being generated (video, audio, image, text, 3D)
- **Operation Type**: Whether the task involves creation (from scratch) or editing (modifying existing content)
### Key Features
- **Structured Metadata**: Each modality includes input/output specs, characteristics, maturity level, use cases, platforms, and example models
- **Fine-grained Classification**: Goes beyond simple input/output categorization to capture nuanced differences
### Data Schema
Each modality entry includes:
- Unique identifier and human-readable name
- Input specifications (primary and secondary modalities)
- Output specifications (with audio metadata for video outputs)
- Characteristics (process type, audio handling, motion type, etc.)
- Metadata (maturity level, use cases, platforms, example models)
### Dataset
This visualization is powered by the [multimodal-ai-taxonomy](https://huggingface.co/datasets/danielrosehill/multimodal-ai-taxonomy) dataset on Hugging Face.
### Maturity Levels
- **Mature**: Well-established, widely available, production-ready
- **Emerging**: Growing adoption, increasingly stable
- **Experimental**: Cutting-edge, limited availability, proof-of-concept
""")

# Start the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|