# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "gradio",
#     "moviepy"
# ]
# ///
"""Gradio gallery that groups local sample videos by duration."""

import json
import os
from pathlib import Path

import gradio as gr


def get_video_duration(video_path: str | Path) -> float:
    """Return the duration of a video, with a crude fallback.

    The duration is read from moviepy's ``VideoFileClip.duration``; the
    caller compares it against 60 and 300 to form minute-based buckets.

    If moviepy cannot be imported or the clip cannot be read, the error is
    printed and the file size in MiB is returned instead.  That number is
    only a very rough stand-in for a duration, not a real measurement.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        The clip duration, or the file size in MiB when probing fails.

    Raises:
        OSError: If probing fails and the file size cannot be read either.
    """
    try:
        # Imported lazily so a missing/broken moviepy only triggers the
        # fallback below instead of breaking the whole module import.
        from moviepy import VideoFileClip

        clip = VideoFileClip(os.fspath(video_path))
        try:
            return clip.duration
        finally:
            # Always release the reader, even if reading the duration fails.
            clip.close()
    except Exception as e:  # noqa: BLE001 - best effort, fall back below
        print(e)
        # Fallback: estimate from file size (very rough)
        size_mb = os.path.getsize(video_path) / (1024 * 1024)
        return size_mb


def organize_videos_by_duration(
    video_folder: str | Path = "videos",
) -> None | dict[str, list[tuple[Path, dict]]]:
    """Group the videos of a folder into duration buckets.

    Files directly inside ``video_folder`` whose name ends with a known video
    extension are probed with :func:`get_video_duration` and placed in one of
    three buckets: ``"Under 1min"``, ``"1min - 5min"`` or ``"Over 5min"``.
    If ``video_folder/index.json`` exists it is loaded as a mapping from file
    name to a metadata dict; videos without an entry get ``{}``.

    Args:
        video_folder: Directory to scan (not recursive).

    Returns:
        ``None`` if the folder is missing, is not a directory, or contains no
        video files.  Otherwise a dict mapping each bucket name to a list of
        ``(video_path, metadata)`` tuples, in sorted path order.
    """
    video_folder = Path(video_folder)
    # is_dir() also rejects a path that exists but is a plain file, which
    # would otherwise make iterdir() raise.
    if not video_folder.is_dir():
        return None

    video_extensions = (".mp4", ".avi", ".mov", ".webm", ".mkv", ".flv")
    # Sorted so the gallery order does not depend on directory listing order.
    video_files = sorted(
        f
        for f in video_folder.iterdir()
        if f.is_file() and f.name.lower().endswith(video_extensions)
    )
    if not video_files:
        return None

    categories: dict[str, list[tuple[Path, dict]]] = {
        "Under 1min": [],
        "1min - 5min": [],
        "Over 5min": [],
    }

    metadata = {}
    if (metadata_file := (video_folder / "index.json")).exists():
        with open(metadata_file, "r", encoding="utf-8") as mf:
            metadata = json.load(mf)

    for video_path in video_files:
        # Built before the try block so the error path below always files
        # *this* video, never an unbound name or the previous iteration's.
        obj = (video_path, metadata.get(video_path.name, {}))
        try:
            duration = get_video_duration(video_path)
            if duration < 60:
                category = "Under 1min"
            elif duration < 300:
                category = "1min - 5min"
            else:
                category = "Over 5min"
        except Exception as e:  # noqa: BLE001 - keep the gallery usable
            print(f"Error processing {video_path}: {e}")
            # Add to first category by default if duration can't be determined
            category = "Under 1min"
        categories[category].append(obj)

    return categories


# Custom CSS for sleek appearance
# NOTE(review): this stylesheet defines .header, .category-title,
# .empty-state and .instructions, but none of the HTML snippets below contain
# elements using those classes - confirm whether wrapper markup is missing.
css = (
    " .gradio-container { font-family: 'Inter', sans-serif; }"
    " .header { padding: 2rem;"
    " background: linear-gradient(#39F2AE 0%, rgba(255,0,0,0) 100%);"
    " border-radius: 10px; margin-bottom: 2rem; }"
    " .header h1 { color: white; font-size: 2.5em; font-weight: 800; margin: 0; }"
    " .category-title { color: #667eea; font-weight: 600; font-size: 1.5em;"
    " margin: 2rem 0 1rem 0; \n"
    "padding-bottom: 0.5rem; border-bottom: 2px solid #667eea; }"
    " a { color: #b1b5bb; text-decoration: none; position: relative;"
    " transition: color 0.3s ease; font-weight: 500; }"
    " a:hover { color: #ff8c42; }"
    " a::after { content: ''; position: absolute; width: 0; height: 2px;"
    " bottom: -2px; left: 0; background-color: #ff8c42;"
    " transition: width 0.3s ease; }"
    " a:hover::after { width: 100%; }"
    " .empty-state { text-align: center; padding: 3rem; color: #666; }"
    " .instructions { background: #f8f9fa; padding: 1.5rem; border-radius: 8px;"
    " margin-top: 2rem; } "
)

# Introductory text shown at the top of the gallery page.
_HEADER_HTML = """

🏠 CASA Samples Gallery 🏠

This gallery contains qualitative samples of live video captions generated by our CASA-Qwen2_5-VL-3B model.
For more information please check our project page, preprint and associated HuggingFace collection

Each video contains the following information:

Captions generated by CASA, appearing at the real time they are generated
Average time to first token (averaged across each frame / generation)
Average tokens / s (averaged across all generated tokens so far)
Number of tokens generated so far (i.e., KV-Cache size)
Current memory usage (Note that the displayed memory includes everything present in memory including the model and the preloaded video frames)

Videos are processed at native resolution (with a maximum number of pixels of 448**2 pixels) and are then resized to a max width of 700 pixels after caption generation for display

"""

# Placeholder shown when organize_videos_by_duration() finds nothing.
_EMPTY_STATE_HTML = """

📁 No videos found

Upload videos to the videos/ folder to get started!

"""


def create_video_gallery() -> gr.Blocks:
    """Build the gallery UI for the default ``videos`` folder.

    The page starts with the introductory text.  If no videos are found a
    placeholder message is shown; otherwise every non-empty duration bucket
    gets a title followed by rows of up to three videos, each with its
    source line and a collapsed transcript taken from the video's metadata.

    Returns:
        The assembled ``gr.Blocks`` app, not yet launched.
    """
    categories = organize_videos_by_duration()

    with gr.Blocks() as demo:
        gr.HTML(_HEADER_HTML)

        if categories is None:
            gr.HTML(_EMPTY_STATE_HTML)
            return demo

        for category, videos in categories.items():
            if not videos:
                # Empty buckets get no title at all.
                continue

            gr.HTML(f"\n\n{category}\n\n")

            # Create rows of 3 videos each
            for i in range(0, len(videos), 3):
                with gr.Row():
                    for video_path, data in videos[i : i + 3]:
                        with gr.Column(scale=1, min_width=300):
                            gr.Video(
                                value=video_path,
                                label=data.get("name", video_path.stem),
                                height=250,
                                autoplay=False,
                                include_audio=False,
                            )
                            gr.HTML(
                                f'Input video source: {data.get("origin", "Unknown")}'
                            )
                            with gr.Accordion("Transcript", open=False):
                                gr.Markdown(data.get("transcript", "Not available"))

    return demo


if __name__ == "__main__":
    demo = create_video_gallery()
    demo.launch(css=css)