Spaces:
Sleeping
Sleeping
| import os | |
| import cv2 | |
| import gradio as gr | |
| from dotenv import load_dotenv | |
| import spaces | |
| from main import ( | |
| run, | |
| detect_scenes, | |
| extract_keyframes, | |
| generate_scene_caption, | |
| generate_video_summary, | |
| generate_video_summary_groq, | |
| vqa_matches, | |
| semantic_matches, | |
| remove_scenes, | |
| ) | |
# Load environment variables (via python-dotenv) before any configuration is read.
load_dotenv()

# The Hugging Face token is mandatory: abort at import time when it is unset or empty.
if not os.environ.get("HF_TOKEN"):
    raise ValueError("❌ Error: HF_TOKEN not found in .env file")
def process_video(video_path, query, progress=gr.Progress()):
    """Scene-filtering tab: remove the scenes that match ``query``.

    Pipeline: detect scenes, extract keyframes, caption every keyframe, flag
    scenes through VQA and semantic caption matching, then cut the flagged
    scenes out into ``outputs/trimmed_video.mp4``.

    Args:
        video_path: Path of the uploaded video; falsy when nothing was uploaded.
        query: Free-text description of the scenes to remove.
        progress: Gradio progress tracker used to report each pipeline stage.

    Returns:
        A ``(output_path, message)`` tuple. ``output_path`` is ``None`` whenever
        no trimmed video was produced (bad input, no matching scene, or a
        failure); ``message`` is the user-facing status or statistics text.
    """
    # Fail fast with a readable message instead of an opaque library error.
    if not video_path:
        return None, "❌ Error: Please upload a video first"
    if not query or not query.strip():
        return None, "❌ Error: Please describe the scenes to remove"

    try:
        os.makedirs("outputs", exist_ok=True)
        output_path = os.path.join("outputs", "trimmed_video.mp4")

        # 1) Detect scenes
        progress(0.0, desc="Detecting scenes...")
        scenes = detect_scenes(video_path)

        # 2) Extract keyframes
        progress(0.2, desc="Extracting keyframes...")
        keyframes = extract_keyframes(video_path, scenes)

        # 3) Caption each keyframe
        progress(0.4, desc="Generating captions...")
        captions = [generate_scene_caption(frame) for _, frame in keyframes]

        # 4) VQA + semantic filtering
        progress(0.6, desc="Analyzing scenes...")
        vqa_mask = vqa_matches(keyframes, query)
        sem_idxs, _ = semantic_matches(captions, query)

        # 5) Build removal list: union of both matchers, de-duplicated and ordered
        flagged_by_vqa = {i for i, flag in enumerate(vqa_mask) if flag}
        to_remove = sorted(flagged_by_vqa | set(sem_idxs))

        # 6) Trim via ffmpeg
        progress(0.8, desc="Processing video...")
        if not to_remove:
            return None, "⚠️ No matching scenes found; no trimming done."

        # The output name is fixed, so drop any file left behind by an earlier
        # run; otherwise a failed trim would be masked by the stale video.
        try:
            os.remove(output_path)
        except FileNotFoundError:
            pass
        remove_scenes(video_path, scenes, to_remove, output_path)

        # Verify the output video
        if not os.path.exists(output_path):
            return None, "❌ Error: Failed to create output video"

        # Check that the result can be opened, releasing the handle either way.
        capture = cv2.VideoCapture(output_path)
        try:
            readable = capture.isOpened()
        finally:
            capture.release()
        if not readable:
            return None, "❌ Error: Generated video is invalid"

        stats = [
            "✅ Processing complete!",
            f"📊 Total scenes: {len(scenes)}",
            f"🗑️ Scenes removed: {len(to_remove)}",
            f"🎬 Scenes kept: {len(scenes)-len(to_remove)}",
            "\n🔍 Scene captions:",
            *[f"[Scene {i}]: {caption}" for i, caption in enumerate(captions)],
        ]
        return output_path, "\n".join(stats)
    except Exception as e:
        # UI boundary: surface any pipeline failure as text instead of crashing.
        return None, f"❌ Error: {e}"
def generate_video_description(video_path, progress=gr.Progress()):
    """Video-description tab: summarise the whole video from its scene captions.

    Detects the scenes, extracts their keyframes, captions each keyframe and
    hands the captions to ``generate_video_summary``. Returns the summary, or
    an "❌ Error: ..." string when any step raises.
    """
    try:
        progress(0.0, desc="Detecting scenes...")
        scene_list = detect_scenes(video_path)

        progress(0.3, desc="Extracting keyframes...")
        scene_frames = extract_keyframes(video_path, scene_list)

        progress(0.6, desc="Captioning scenes...")
        scene_captions = []
        for _, image in scene_frames:
            scene_captions.append(generate_scene_caption(image))

        # Condense the per-scene captions into the final description.
        return generate_video_summary(scene_captions)
    except Exception as err:
        return f"❌ Error: {err}"
def get_frame_description(video_path, frame_number):
    """Frame-analysis tab: caption a single frame of the video.

    Args:
        video_path: Path of the uploaded video; falsy when nothing was uploaded.
        frame_number: Index of the frame to caption; must convert with
            ``int()`` and be non-negative.

    Returns:
        ``"Frame <n>:"`` followed by the generated caption on success,
        otherwise a user-facing "❌ ..." message describing what went wrong.
    """
    # Validate the inputs before touching OpenCV so errors stay readable.
    if not video_path:
        return "❌ Error: Please upload a video first"
    try:
        frame_index = int(frame_number)
    except (TypeError, ValueError, OverflowError):
        return "❌ Invalid frame number"
    if frame_index < 0:
        return "❌ Invalid frame number"

    try:
        cap = cv2.VideoCapture(video_path)
        try:
            # Distinguish an unreadable file from an out-of-range frame.
            if not cap.isOpened():
                return "❌ Error: Could not open video"
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = cap.read()
        finally:
            # Release the handle even when seeking or reading raises.
            cap.release()
        if not ret:
            return "❌ Invalid frame number"
        return f"Frame {frame_number}:\n{generate_scene_caption(frame)}"
    except Exception as e:
        # UI boundary: report unexpected failures as text instead of crashing.
        return f"❌ Error: {e}"
# ─── Gradio UI ────────────────────────────────────────────────────────────────
# Three-tab interface (scene filtering, video description, frame analysis).
# The custom CSS hides Gradio's default footer and styles the intro blocks and
# the custom footer.
with gr.Blocks(theme=gr.themes.Soft(), css="""
    footer {visibility: hidden}
    .custom-footer {
        text-align: center;
        margin-top: 2em;
        margin-bottom: 1em;
        color: #666;
    }
    .description {
        color: #666;
        font-size: 0.9em;
        line-height: 1.5;
    }
    .tech-stack {
        background: #f5f5f5;
        padding: 1em;
        border-radius: 8px;
        margin: 1em 0;
    }
""") as demo:
    # Page header: title, short pitch and the technical stack.
    gr.Markdown("""
    # Videoxity
    A powerful playground for video analysis and manipulation using state-of-the-art Vision-Language models.
    <div class="description">
    This application demonstrates the capabilities of modern AI in video processing, offering a foundation for developers to build upon and optimize.
    Whether you're exploring scene detection, content filtering, or video summarization, Videoxity provides the tools to experiment with and enhance video understanding.
    </div>
    <div class="tech-stack">
    <strong>Technical Stack:</strong>
    - Scene Detection: PySceneDetect with ContentDetector
    - Vision Models: BLIP (Image Captioning & VQA)
    - Language Models: Groq LLM (Llama 3.1)
    - Video Processing: OpenCV & FFmpeg
    - Embeddings: BGE-Small for semantic search
    </div>
    """)
    with gr.Tabs():
        # 1) Scene Filtering
        with gr.TabItem("Frames to Cut"):
            gr.Markdown("""
            ### Remove specific scenes from your video
            Upload a video and describe which scenes you want to remove. The AI will analyze each scene and cut out the matching ones.
            Examples:
            - "Remove the part where there is a cat in the video"
            - "Cut out the scene where people are dancing"
            """)
            with gr.Row():
                with gr.Column():
                    vid1 = gr.Video(
                        label="Upload Video",
                        format="mp4",
                        interactive=True
                    )
                    qry1 = gr.Textbox(
                        label="Scenes to Remove",
                        placeholder="e.g., 'Remove the part where there is a cat in the video'",
                        lines=2
                    )
                    btn1 = gr.Button("Process Video", variant="primary")
                with gr.Column():
                    # Output-only component: keep it non-interactive so it
                    # plays the trimmed result instead of offering an upload box.
                    outVid = gr.Video(
                        label="Processed Video",
                        format="mp4",
                        interactive=False
                    )
                    outTxt = gr.Textbox(label="Results", lines=10)
            # process_video returns (video path or None, status text).
            btn1.click(
                fn=process_video,
                inputs=[vid1, qry1],
                outputs=[outVid, outTxt]
            )
        # 2) Video Description
        with gr.TabItem("Video Description"):
            gr.Markdown("""
            ### Generate a comprehensive description of your video
            Get AI-generated descriptions for all scenes in your video.
            """)
            with gr.Row():
                with gr.Column():
                    vid2 = gr.Video(label="Upload Video")
                    btn2 = gr.Button("Generate Description", variant="primary")
                with gr.Column():
                    outDesc = gr.Textbox(
                        label="Video Description",
                        lines=15,
                        show_copy_button=True
                    )
            btn2.click(
                fn=generate_video_description,
                inputs=[vid2],
                outputs=[outDesc]
            )
        # 3) Frame Analysis
        with gr.TabItem("Frame Analysis"):
            gr.Markdown("""
            ### Analyze specific frames in your video
            Get detailed descriptions for individual frames.
            """)
            with gr.Row():
                with gr.Column():
                    vid3 = gr.Video(label="Upload Video")
                    # Whole, non-negative frame index.
                    fn3 = gr.Number(
                        label="Frame Number",
                        value=0,
                        precision=0,
                        minimum=0
                    )
                    btn3 = gr.Button("Analyze Frame", variant="primary")
                with gr.Column():
                    outFrm = gr.Textbox(
                        label="Frame Description",
                        lines=5,
                        show_copy_button=True
                    )
            btn3.click(
                fn=get_frame_description,
                inputs=[vid3, fn3],
                outputs=[outFrm]
            )
    # Add custom centered footer
    gr.Markdown("""
    <div class="custom-footer">
    Made with ❤️
    </div>
    """, elem_classes=["custom-footer"])
if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script.
    launch_options = {
        "share": True,
        "show_error": True,
        "show_api": False,
    }
    demo.launch(**launch_options)