# casa-samples / app.py
# ameroyer's picture
# Add target="_blank" to links
# 374e67f verified
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "gradio",
# "moviepy"
# ]
# ///
import json
import os
from pathlib import Path
import gradio as gr
def get_video_duration(video_path: str) -> float:
try:
from moviepy import VideoFileClip
clip = VideoFileClip(video_path)
duration = clip.duration
clip.close()
return duration
except Exception as e: # noqa: E722
print(e)
# Fallback: estimate from file size (very rough)
size_mb = os.path.getsize(video_path) / (1024 * 1024)
return size_mb
def organize_videos_by_duration(video_folder: str | Path = "videos") -> None | dict:
if isinstance(video_folder, str):
video_folder = Path(video_folder)
if not video_folder.exists():
return None
video_extensions = (".mp4", ".avi", ".mov", ".webm", ".mkv", ".flv")
video_files = [
f for f in video_folder.iterdir() if f.as_posix().lower().endswith(video_extensions)
]
if not video_files:
return None
categories = {"Under 1min": [], "1min - 5min": [], "Over 5min": []}
metadata = {}
if (metadata_file := (video_folder / "index.json")).exists():
with open(metadata_file, "r") as mf:
metadata = json.load(mf)
for video_path in video_files:
try:
duration = get_video_duration(video_path)
obj = (video_path, metadata.get(video_path.name, {}))
if duration < 60:
categories["Under 1min"].append(obj)
elif duration < 300:
categories["1min - 5min"].append(obj)
else:
categories["Over 5min"].append(obj)
except Exception as e:
print(f"Error processing {video_path}: {e}")
# Add to first category by default if duration can't be determined
categories["Under 1min"].append(obj)
return categories
# Custom stylesheet for the gallery page; it is handed to Gradio via
# `demo.launch(css=css)` in the `__main__` guard.
# It styles the `.header`, `.category-title` and `.empty-state` classes used by
# the HTML emitted in `create_video_gallery`, and restyles every `<a>` link with
# an animated underline on hover.
# NOTE(review): `.instructions` is not referenced by any markup visible in this
# file -- confirm whether it is still needed.
css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.header {
padding: 2rem;
background: linear-gradient(#39F2AE 0%, rgba(255,0,0,0) 100%);
border-radius: 10px;
margin-bottom: 2rem;
}
.header h1 {
color: white;
font-size: 2.5em;
font-weight: 800;
margin: 0;
}
.category-title {
color: #667eea;
font-weight: 600;
font-size: 1.5em;
margin: 2rem 0 1rem 0;
padding-bottom: 0.5rem;
border-bottom: 2px solid #667eea;
}
a {
color: #b1b5bb;
text-decoration: none;
position: relative;
transition: color 0.3s ease;
font-weight: 500;
}
a:hover {
color: #ff8c42;
}
a::after {
content: '';
position: absolute;
width: 0;
height: 2px;
bottom: -2px;
left: 0;
background-color: #ff8c42;
transition: width 0.3s ease;
}
a:hover::after {
width: 100%;
}
.empty-state {
text-align: center;
padding: 3rem;
color: #666;
}
.instructions {
background: #f8f9fa;
padding: 1.5rem;
border-radius: 8px;
margin-top: 2rem;
}
"""
def create_video_gallery():
    """Build the Gradio page that displays the sample videos.

    The videos are discovered with ``organize_videos_by_duration()`` (default
    folder). The page consists of a static header followed by either an
    "empty state" message (no videos found) or, for every non-empty duration
    category, a title and rows of up to three video players. Each player is
    accompanied by its source line and a collapsed transcript taken from the
    per-video metadata (keys ``name``, ``origin`` and ``transcript``).

    Returns:
        The constructed ``gr.Blocks`` object; launching it is left to the
        caller.
    """
    categories = organize_videos_by_duration()
    with gr.Blocks() as demo:
        gr.HTML("""
<div class="header">
<div style="text-align: center">
<h1> 🏠 CASA Samples Gallery 🏠 </h1>
<p style="color: white; margin: 0;">This gallery contains qualitative samples of live video captions generated by our <code>CASA-Qwen2_5-VL-3B</code> model.
<br>For more information please check our <a href="https://kyutai.org/casa" target="_blank">project page</a>, <a href="https://arxiv.org/abs/2512.19535" target="_blank">preprint</a> and associated <a href="https://huggingface.co/collections/kyutai/casa" target="_blank">HuggingFace collection</a></p>
</div>
<p style="margin-top: 10px">Each video contains the following information:
<ul>
<li> Captions generated by CASA, appearing at the real time they are generated
<li> Average time to first token (<i>averaged across each frame / generation</i>)
<li> Average tokens / s (<i>averaged across all generated tokens so far</i>)
<li> Number of tokens generated so far (<i>i.e., KV-Cache size</i>)
<li> Current memory usage (<i>Note that the displayed memory includes everything present in memory including the model and the preloaded video frames</i>)
</ul>
Videos are processed at native resolution (with a maximum number of pixels of 448**2 pixels) and are then resized to a max width of 700 pixels after caption generation for display
</p>
</div>
""")
        if categories is None:
            # The folder emoji used to be mojibake ("πŸ“"): UTF-8 bytes that
            # had been decoded with the wrong code page.
            gr.HTML("""
<div class="empty-state">
<h2>📁 No videos found</h2>
<p>Upload videos to the <code>videos/</code> folder to get started!</p>
</div>
""")
        else:
            for category, videos in categories.items():
                if not videos:
                    # Do not render a title for an empty category.
                    continue
                gr.HTML(f'<div class="category-title">{category}</div>')
                # Create rows of 3 videos each
                for i in range(0, len(videos), 3):
                    with gr.Row():
                        for video_path, data in videos[i : i + 3]:
                            with gr.Column(scale=1, min_width=300):
                                gr.Video(
                                    value=video_path,
                                    label=data.get("name", video_path.stem),
                                    height=250,
                                    autoplay=False,
                                    include_audio=False,
                                )
                                # The origin is inserted as raw HTML, not escaped.
                                # NOTE(review): presumably so index.json can carry
                                # links -- confirm that file is trusted content.
                                gr.HTML(
                                    f'<span style="font-size: 12px;">Input video source: {data.get("origin", "Unknown")}</span>'
                                )
                                with gr.Accordion("Transcript", open=False):
                                    gr.Markdown(data.get("transcript", "Not available"))
    return demo
# Script entry point: build the gallery and launch it with the custom
# stylesheet defined in `css`. Nothing is built or launched on plain import.
if __name__ == "__main__":
    demo = create_video_gallery()
    demo.launch(css=css)