Spaces:

kyutai
/

casa-samples

Running

App Files Files Community

casa-samples / app.py

ameroyer

Add target="_blank" to links

374e67f verified 3 days ago

raw

history blame contribute delete

6.46 kB

	# /// script
	# requires-python = ">=3.10"
	# dependencies = [
	# "gradio",
	# "moviepy"
	# ]
	# ///

	import json
	import os
	from pathlib import Path

	import gradio as gr


	def get_video_duration(video_path: str) -> float:
	try:
	from moviepy import VideoFileClip

	clip = VideoFileClip(video_path)
	duration = clip.duration
	clip.close()
	return duration
	except Exception as e: # noqa: E722
	print(e)
	# Fallback: estimate from file size (very rough)
	size_mb = os.path.getsize(video_path) / (1024 * 1024)
	return size_mb


	def organize_videos_by_duration(video_folder: str \| Path = "videos") -> None \| dict:
	if isinstance(video_folder, str):
	video_folder = Path(video_folder)

	if not video_folder.exists():
	return None

	video_extensions = (".mp4", ".avi", ".mov", ".webm", ".mkv", ".flv")
	video_files = [
	f for f in video_folder.iterdir() if f.as_posix().lower().endswith(video_extensions)
	]

	if not video_files:
	return None

	categories = {"Under 1min": [], "1min - 5min": [], "Over 5min": []}
	metadata = {}
	if (metadata_file := (video_folder / "index.json")).exists():
	with open(metadata_file, "r") as mf:
	metadata = json.load(mf)

	for video_path in video_files:
	try:
	duration = get_video_duration(video_path)
	obj = (video_path, metadata.get(video_path.name, {}))

	if duration < 60:
	categories["Under 1min"].append(obj)
	elif duration < 300:
	categories["1min - 5min"].append(obj)
	else:
	categories["Over 5min"].append(obj)
	except Exception as e:
	print(f"Error processing {video_path}: {e}")
	# Add to first category by default if duration can't be determined
	categories["Under 1min"].append(obj)

	return categories


	# Custom CSS for sleek appearance.
	# Passed to `demo.launch(css=css)` in the `__main__` block of this file.
	# The `.header`, `.category-title` and `.empty-state` classes are referenced by
	# the HTML snippets emitted in `create_video_gallery`; the `a` rules style links
	# with an underline that grows on hover. `.instructions` is not referenced by
	# any HTML visible in this file.
	css = """
	.gradio-container {
	font-family: 'Inter', sans-serif;
	}
	.header {
	padding: 2rem;
	background: linear-gradient(#39F2AE 0%, rgba(255,0,0,0) 100%);
	border-radius: 10px;
	margin-bottom: 2rem;
	}
	.header h1 {
	color: white;
	font-size: 2.5em;
	font-weight: 800;
	margin: 0;
	}
	.category-title {
	color: #667eea;
	font-weight: 600;
	font-size: 1.5em;
	margin: 2rem 0 1rem 0;
	padding-bottom: 0.5rem;
	border-bottom: 2px solid #667eea;
	}
	a {
	color: #b1b5bb;
	text-decoration: none;
	position: relative;
	transition: color 0.3s ease;
	font-weight: 500;
	}
	a:hover {
	color: #ff8c42;
	}
	a::after {
	content: '';
	position: absolute;
	width: 0;
	height: 2px;
	bottom: -2px;
	left: 0;
	background-color: #ff8c42;
	transition: width 0.3s ease;
	}
	a:hover::after {
	width: 100%;
	}
	.empty-state {
	text-align: center;
	padding: 3rem;
	color: #666;
	}
	.instructions {
	background: #f8f9fa;
	padding: 1.5rem;
	border-radius: 8px;
	margin-top: 2rem;
	}
	"""


	def create_video_gallery():
	    """Build and return the Gradio ``Blocks`` page for the sample gallery.

	    The page always starts with the header banner. When
	    ``organize_videos_by_duration()`` returns ``None`` an empty-state message
	    is shown; otherwise every non-empty duration category gets a title
	    followed by rows of up to three videos. Each video is shown with its
	    source line and a collapsed transcript taken from its metadata dict.

	    Returns:
	        The constructed ``gr.Blocks`` instance (not yet launched).
	    """
	    header_html = """
	<div class="header">
	<div style="text-align: center">
	<h1> 🏠 CASA Samples Gallery 🏠 </h1>
	<p style="color: white; margin: 0;">This gallery contains qualitative samples of live video captions generated by our <code>CASA-Qwen2_5-VL-3B</code> model.
	<br>For more information please check our <a href="https://kyutai.org/casa" target="_blank">project page</a>, <a href="https://arxiv.org/abs/2512.19535" target="_blank">preprint</a> and associated <a href="https://huggingface.co/collections/kyutai/casa" target="_blank">HuggingFace collection</a></p>
	</div>
	<p style="margin-top: 10px">Each video contains the following information:
	<ul>
	<li> Captions generated by CASA, appearing at the real time they are generated
	<li> Average time to first token (<i>averaged across each frame / generation</i>)
	<li> Average tokens / s (<i>averaged across all generated tokens so far</i>)
	<li> Number of tokens generated so far (<i>i.e., KV-Cache size</i>)
	<li> Current memory usage (<i>Note that the displayed memory includes everything present in memory including the model and the preloaded video frames</i>)
	</ul>
	Videos are processed at native resolution (with a maximum number of pixels of 448**2 pixels) and are then resized to a max width of 700 pixels after caption generation for display
	</p>
	</div>
	"""
	    empty_state_html = """
	<div class="empty-state">
	<h2>📁 No videos found</h2>
	<p>Upload videos to the <code>videos/</code> folder to get started!</p>
	</div>
	"""
	    videos_per_row = 3
	    grouped = organize_videos_by_duration()

	    with gr.Blocks() as demo:
	        gr.HTML(header_html)

	        # Nothing to show: render the placeholder and stop building the page.
	        if grouped is None:
	            gr.HTML(empty_state_html)
	            return demo

	        for title, entries in grouped.items():
	            if not entries:
	                continue
	            gr.HTML(f'<div class="category-title">{title}</div>')

	            # Split the category into consecutive chunks, one chunk per row.
	            rows = [
	                entries[start : start + videos_per_row]
	                for start in range(0, len(entries), videos_per_row)
	            ]
	            for row in rows:
	                with gr.Row():
	                    for clip_path, info in row:
	                        with gr.Column(scale=1, min_width=300):
	                            gr.Video(
	                                value=clip_path,
	                                label=info.get("name", clip_path.stem),
	                                height=250,
	                                autoplay=False,
	                                include_audio=False,
	                            )
	                            origin = info.get("origin", "Unknown")
	                            gr.HTML(
	                                f'<span style="font-size: 12px;">Input video source: {origin}</span>'
	                            )
	                            with gr.Accordion("Transcript", open=False):
	                                transcript = info.get("transcript", "Not available")
	                                gr.Markdown(transcript)

	    return demo


	# Script entry point: build the gallery page, then start the Gradio server,
	# supplying the module-level `css` stylesheet at launch time.
	if __name__ == "__main__":
	demo = create_video_gallery()
	demo.launch(css=css)