Spaces:
Running
Running
| # /// script | |
| # requires-python = ">=3.10" | |
| # dependencies = [ | |
| # "gradio", | |
| # "moviepy" | |
| # ] | |
| # /// | |
| import json | |
| import os | |
| from pathlib import Path | |
| import gradio as gr | |
| def get_video_duration(video_path: str) -> float: | |
| try: | |
| from moviepy import VideoFileClip | |
| clip = VideoFileClip(video_path) | |
| duration = clip.duration | |
| clip.close() | |
| return duration | |
| except Exception as e: # noqa: E722 | |
| print(e) | |
| # Fallback: estimate from file size (very rough) | |
| size_mb = os.path.getsize(video_path) / (1024 * 1024) | |
| return size_mb | |
| def organize_videos_by_duration(video_folder: str | Path = "videos") -> None | dict: | |
| if isinstance(video_folder, str): | |
| video_folder = Path(video_folder) | |
| if not video_folder.exists(): | |
| return None | |
| video_extensions = (".mp4", ".avi", ".mov", ".webm", ".mkv", ".flv") | |
| video_files = [ | |
| f for f in video_folder.iterdir() if f.as_posix().lower().endswith(video_extensions) | |
| ] | |
| if not video_files: | |
| return None | |
| categories = {"Under 1min": [], "1min - 5min": [], "Over 5min": []} | |
| metadata = {} | |
| if (metadata_file := (video_folder / "index.json")).exists(): | |
| with open(metadata_file, "r") as mf: | |
| metadata = json.load(mf) | |
| for video_path in video_files: | |
| try: | |
| duration = get_video_duration(video_path) | |
| obj = (video_path, metadata.get(video_path.name, {})) | |
| if duration < 60: | |
| categories["Under 1min"].append(obj) | |
| elif duration < 300: | |
| categories["1min - 5min"].append(obj) | |
| else: | |
| categories["Over 5min"].append(obj) | |
| except Exception as e: | |
| print(f"Error processing {video_path}: {e}") | |
| # Add to first category by default if duration can't be determined | |
| categories["Under 1min"].append(obj) | |
| return categories | |
# Custom CSS for sleek appearance.
# This stylesheet string is handed to demo.launch(css=...) in the __main__
# entry point. The .header, .category-title and .empty-state classes are used
# by the HTML emitted in create_video_gallery(); .instructions is not
# referenced by any HTML visible in this file.
css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.header {
padding: 2rem;
background: linear-gradient(#39F2AE 0%, rgba(255,0,0,0) 100%);
border-radius: 10px;
margin-bottom: 2rem;
}
.header h1 {
color: white;
font-size: 2.5em;
font-weight: 800;
margin: 0;
}
.category-title {
color: #667eea;
font-weight: 600;
font-size: 1.5em;
margin: 2rem 0 1rem 0;
padding-bottom: 0.5rem;
border-bottom: 2px solid #667eea;
}
a {
color: #b1b5bb;
text-decoration: none;
position: relative;
transition: color 0.3s ease;
font-weight: 500;
}
a:hover {
color: #ff8c42;
}
a::after {
content: '';
position: absolute;
width: 0;
height: 2px;
bottom: -2px;
left: 0;
background-color: #ff8c42;
transition: width 0.3s ease;
}
a:hover::after {
width: 100%;
}
.empty-state {
text-align: center;
padding: 3rem;
color: #666;
}
.instructions {
background: #f8f9fa;
padding: 1.5rem;
border-radius: 8px;
margin-top: 2rem;
}
"""
def create_video_gallery():
    """Assemble and return the Gradio ``Blocks`` app for the sample gallery.

    The page always starts with a header banner. Below it, either an
    empty-state notice is shown (when ``organize_videos_by_duration()``
    returns ``None``) or one titled section per non-empty duration bucket,
    laid out in rows of three. Each video is followed by its source line
    and a collapsed transcript.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    header_html = """
<div class="header">
<div style="text-align: center">
<h1> π CASA Samples Gallery π </h1>
<p style="color: white; margin: 0;">This gallery contains qualitative samples of live video captions generated by our <code>CASA-Qwen2_5-VL-3B</code> model.
<br>For more information please check our <a href="https://kyutai.org/casa" target="_blank">project page</a>, <a href="https://arxiv.org/abs/2512.19535" target="_blank">preprint</a> and associated <a href="https://huggingface.co/collections/kyutai/casa" target="_blank">HuggingFace collection</a></p>
</div>
<p style="margin-top: 10px">Each video contains the following information:
<ul>
<li> Captions generated by CASA, appearing at the real time they are generated
<li> Average time to first token (<i>averaged across each frame / generation</i>)
<li> Average tokens / s (<i>averaged across all generated tokens so far</i>)
<li> Number of tokens generated so far (<i>i.e., KV-Cache size</i>)
<li> Current memory usage (<i>Note that the displayed memory includes everything present in memory including the model and the preloaded video frames</i>)
</ul>
Videos are processed at native resolution (with a maximum number of pixels of 448**2 pixels) and are then resized to a max width of 700 pixels after caption generation for display
</p>
</div>
"""
    empty_html = """
<div class="empty-state">
<h2>π No videos found</h2>
<p>Upload videos to the <code>videos/</code> folder to get started!</p>
</div>
"""
    grouped = organize_videos_by_duration()

    with gr.Blocks() as demo:
        gr.HTML(header_html)

        if grouped is None:
            gr.HTML(empty_html)
        else:
            for title, entries in grouped.items():
                # Buckets without any video get no section at all.
                if not entries:
                    continue
                gr.HTML(f'<div class="category-title">{title}</div>')

                # Slice the bucket into consecutive groups of three, one per row.
                rows = [
                    entries[start : start + 3]
                    for start in range(0, len(entries), 3)
                ]
                for row in rows:
                    with gr.Row():
                        for clip_path, info in row:
                            with gr.Column(scale=1, min_width=300):
                                gr.Video(
                                    value=clip_path,
                                    label=info.get("name", clip_path.stem),
                                    height=250,
                                    autoplay=False,
                                    include_audio=False,
                                )
                                origin = info.get("origin", "Unknown")
                                gr.HTML(
                                    f'<span style="font-size: 12px;">Input video source: {origin}</span>'
                                )
                                with gr.Accordion("Transcript", open=False):
                                    gr.Markdown(
                                        info.get("transcript", "Not available")
                                    )
    return demo
# Script entry point: build the gallery UI and start serving it with the
# module-level stylesheet applied.
if __name__ == "__main__":
    demo = create_video_gallery()
    # NOTE(review): css is passed to launch() rather than to gr.Blocks();
    # confirm the installed Gradio version accepts it there.
    demo.launch(css=css)