Claude committed
Commit e95686f · unverified · 1 parent: f2820bb

feat: Add vector DB and RAG chatbot

- Add ChromaDB for vector storage of video content
- Add sentence-transformers for embeddings
- Add FLAN-T5 for chat responses
- Store transcripts and visual context in vector DB
- Add 'Chat with Videos' tab with RAG-based Q&A
- Add requirements.txt for HuggingFace Spaces compatibility
- Chunk text with overlap for better retrieval (see the sketch below)
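
The pieces above compose a standard retrieval-augmented generation loop. The sketch below is not part of the diff; it is a minimal, self-contained illustration of the chunk → embed → store → retrieve → generate flow, assuming only the models this commit names (all-MiniLM-L6-v2 for embeddings, google/flan-t5-base for answers) and a placeholder transcript string.

# Minimal RAG sketch (illustration only, not code from this commit).
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import pipeline

embedder = SentenceTransformer("all-MiniLM-L6-v2")
llm = pipeline("text2text-generation", model="google/flan-t5-base")

client = chromadb.Client()  # in-memory; chromadb.PersistentClient(path=...) would persist to disk
collection = client.get_or_create_collection("demo", metadata={"hnsw:space": "cosine"})

def chunk(text: str, size: int = 500, overlap: int = 50) -> list[str]:
    # Overlapping word chunks: each chunk starts `size - overlap` words after the previous one.
    words = text.split()
    return [" ".join(words[i:i + size]) for i in range(0, len(words), size - overlap)]

transcript = "hypothetical transcript text about rockets and orbital mechanics"  # placeholder input
chunks = chunk(transcript)

# Store: embed each chunk and index it in the collection.
collection.add(
    documents=chunks,
    embeddings=embedder.encode(chunks).tolist(),
    ids=[f"chunk_{i}" for i in range(len(chunks))],
)

# Retrieve: embed the question and pull the closest chunks.
question = "What is the video about?"
hits = collection.query(
    query_embeddings=embedder.encode([question]).tolist(),
    n_results=min(3, collection.count()),
)
context = "\n\n".join(hits["documents"][0])

# Generate: let FLAN-T5 answer from the retrieved context.
prompt = f"Based on the following video content, answer the question.\n\n{context}\n\nQuestion: {question}\n\nAnswer:"
print(llm(prompt, max_length=256, do_sample=False)[0]["generated_text"])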

Files changed (4)
  1. app.py +237 -48
  2. pyproject.toml +2 -0
  3. requirements.txt +10 -0
  4. uv.lock +0 -0
app.py CHANGED
@@ -1,17 +1,38 @@
 from __future__ import annotations

 import os
+import subprocess
 import tempfile
+import uuid
 from pathlib import Path

+import chromadb
 import cv2
 import gradio as gr
 import torch
 import yt_dlp
 from huggingface_hub import whoami
 from PIL import Image
+from sentence_transformers import SentenceTransformer
 from transformers import BlipForConditionalGeneration, BlipProcessor, pipeline

+# Initialize ChromaDB client (persistent storage)
+chroma_client = chromadb.Client()
+collection = chroma_client.get_or_create_collection(
+    name="video_knowledge",
+    metadata={"hnsw:space": "cosine"}
+)
+
+# Global embedding model
+embedding_model = None
+
+
+def get_embedding_model():
+    global embedding_model
+    if embedding_model is None:
+        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+    return embedding_model
+

 def hello(profile: gr.OAuthProfile | None) -> str:
     if profile is None:
@@ -49,6 +70,14 @@ def get_vision_model():
     return processor, model


+def get_text_generation_model():
+    return pipeline(
+        "text2text-generation",
+        model="google/flan-t5-base",
+        device=get_device(),
+    )
+
+
 def download_video(url: str, output_dir: str) -> list[dict]:
     """Download video from YouTube URL (video or playlist)."""
     ydl_opts = {
@@ -83,18 +112,6 @@ def download_video(url: str, output_dir: str) -> list[dict]:
 def extract_audio(video_path: str, output_dir: str) -> str:
     """Extract audio from video file."""
     audio_path = os.path.join(output_dir, "audio.mp3")
-    ydl_opts = {
-        "format": "bestaudio/best",
-        "postprocessors": [{
-            "key": "FFmpegExtractAudio",
-            "preferredcodec": "mp3",
-            "preferredquality": "192",
-        }],
-        "outtmpl": os.path.join(output_dir, "audio"),
-        "quiet": True,
-    }
-    # Use ffmpeg directly via yt-dlp's post-processor on local file
-    import subprocess
     subprocess.run([
         "ffmpeg", "-i", video_path, "-vn", "-acodec", "libmp3lame",
         "-q:a", "2", audio_path, "-y"
@@ -112,14 +129,12 @@ def extract_frames(video_path: str, num_frames: int = 5) -> list[Image.Image]:
         cap.release()
         return frames

-    # Get evenly spaced frame indices
     indices = [int(i * total_frames / (num_frames + 1)) for i in range(1, num_frames + 1)]

     for idx in indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
         if ret:
-            # Convert BGR to RGB
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             frames.append(Image.fromarray(frame_rgb))

@@ -143,6 +158,80 @@ def transcribe_audio(audio_path: str, whisper_model) -> str:
     return result["text"]


+def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
+    """Split text into overlapping chunks."""
+    words = text.split()
+    chunks = []
+    for i in range(0, len(words), chunk_size - overlap):
+        chunk = " ".join(words[i:i + chunk_size])
+        if chunk:
+            chunks.append(chunk)
+    return chunks
+
+
+def add_to_vector_db(title: str, transcript: str, visual_contexts: list[str]):
+    """Add video content to vector database."""
+    embed_model = get_embedding_model()
+
+    documents = []
+    metadatas = []
+    ids = []
+
+    # Add transcript chunks
+    if transcript:
+        chunks = chunk_text(transcript)
+        for i, chunk in enumerate(chunks):
+            documents.append(chunk)
+            metadatas.append({
+                "title": title,
+                "type": "transcript",
+                "chunk_index": i,
+            })
+            ids.append(f"{title}_transcript_{i}_{uuid.uuid4().hex[:8]}")
+
+    # Add visual context
+    for i, context in enumerate(visual_contexts):
+        documents.append(f"Visual scene from {title}: {context}")
+        metadatas.append({
+            "title": title,
+            "type": "visual",
+            "frame_index": i,
+        })
+        ids.append(f"{title}_visual_{i}_{uuid.uuid4().hex[:8]}")
+
+    if documents:
+        embeddings = embed_model.encode(documents).tolist()
+        collection.add(
+            documents=documents,
+            embeddings=embeddings,
+            metadatas=metadatas,
+            ids=ids,
+        )
+
+    return len(documents)
+
+
+def search_knowledge(query: str, n_results: int = 5) -> list[dict]:
+    """Search the vector database for relevant content."""
+    embed_model = get_embedding_model()
+    query_embedding = embed_model.encode([query]).tolist()
+
+    results = collection.query(
+        query_embeddings=query_embedding,
+        n_results=n_results,
+    )
+
+    matches = []
+    if results["documents"] and results["documents"][0]:
+        for doc, metadata in zip(results["documents"][0], results["metadatas"][0]):
+            matches.append({
+                "content": doc,
+                "title": metadata.get("title", "Unknown"),
+                "type": metadata.get("type", "unknown"),
+            })
+    return matches
+
+
 def process_youtube(
     url: str,
     analyze_frames: bool,
@@ -171,10 +260,9 @@ def process_youtube(
             total = len(downloaded)

             for i, item in enumerate(downloaded):
-                base_progress = 0.1 + 0.9 * (i / total)
+                base_progress = 0.1 + 0.8 * (i / total)
                 video_result = [f"## {item['title']}"]

-                # Find the actual video file
                 video_files = list(Path(tmpdir).glob("*.mp4")) + \
                     list(Path(tmpdir).glob("*.webm")) + \
                     list(Path(tmpdir).glob("*.mkv"))
@@ -187,27 +275,35 @@ def process_youtube(
                 video_path = str(video_files[0])

                 # Extract and transcribe audio
-                progress(base_progress + 0.3 * (1/total), desc=f"Extracting audio: {item['title']}")
+                progress(base_progress + 0.2 * (1/total), desc=f"Extracting audio: {item['title']}")
                 audio_path = extract_audio(video_path, tmpdir)

-                progress(base_progress + 0.5 * (1/total), desc=f"Transcribing: {item['title']}")
+                progress(base_progress + 0.4 * (1/total), desc=f"Transcribing: {item['title']}")
                 transcript = transcribe_audio(audio_path, whisper_model)

+                visual_contexts = []
+
                 if transcript:
                     video_result.append("### Transcript")
                     video_result.append(transcript)

                 # Analyze frames if enabled
                 if analyze_frames:
-                    progress(base_progress + 0.7 * (1/total), desc=f"Analyzing frames: {item['title']}")
+                    progress(base_progress + 0.6 * (1/total), desc=f"Analyzing frames: {item['title']}")
                     frames = extract_frames(video_path, num_frames)

                     if frames:
                         video_result.append("\n### Visual Context")
                         for j, frame in enumerate(frames):
                             caption = describe_frame(frame, vision_processor, vision_model)
+                            visual_contexts.append(caption)
                             video_result.append(f"**Frame {j+1}:** {caption}")

+                # Store in vector DB
+                progress(base_progress + 0.8 * (1/total), desc=f"Storing in knowledge base: {item['title']}")
+                num_stored = add_to_vector_db(item["title"], transcript, visual_contexts)
+                video_result.append(f"\n*Added {num_stored} chunks to knowledge base*")
+
                 results.append("\n\n".join(video_result))

             progress(1.0, desc="Done!")
@@ -217,9 +313,70 @@ def process_youtube(
         return f"Error: {e!s}"


+def chat_with_videos(
+    message: str,
+    history: list[dict],
+    profile: gr.OAuthProfile | None,
+) -> str:
+    if profile is None:
+        return "Please log in to use the chat feature."
+
+    if not message or not message.strip():
+        return "Please enter a question."
+
+    # Check if we have any content in the knowledge base
+    if collection.count() == 0:
+        return "No videos have been analyzed yet. Please analyze some videos first to build the knowledge base."
+
+    # Search for relevant context
+    matches = search_knowledge(message.strip(), n_results=5)
+
+    if not matches:
+        return "I couldn't find any relevant information in the analyzed videos."
+
+    # Build context from matches
+    context_parts = []
+    for match in matches:
+        source = f"[{match['title']} - {match['type']}]"
+        context_parts.append(f"{source}: {match['content']}")
+
+    context = "\n\n".join(context_parts)
+
+    # Generate response using the LLM
+    try:
+        llm = get_text_generation_model()
+        prompt = f"""Based on the following video content, answer the question.
+
+Video Content:
+{context}
+
+Question: {message}
+
+Answer:"""
+
+        response = llm(prompt, max_length=512, do_sample=False)[0]["generated_text"]
+
+        # Add sources
+        sources = list(set(m["title"] for m in matches))
+        response += f"\n\n*Sources: {', '.join(sources)}*"
+
+        return response
+
+    except Exception as e:
+        return f"Error generating response: {e!s}"
+
+
+def get_knowledge_stats() -> str:
+    """Get statistics about the knowledge base."""
+    count = collection.count()
+    if count == 0:
+        return "Knowledge base is empty. Analyze some videos to get started!"
+    return f"Knowledge base contains **{count}** chunks from analyzed videos."
+
+
 with gr.Blocks() as demo:
     gr.Markdown("# Video Analyzer")
-    gr.Markdown("Download, transcribe, and analyze YouTube videos using AI")
+    gr.Markdown("Download, transcribe, analyze, and chat with YouTube videos using AI")

     gr.LoginButton()
     m1 = gr.Markdown()
@@ -227,34 +384,66 @@ with gr.Blocks() as demo:

     gr.Markdown("---")

-    with gr.Row():
-        url_input = gr.Textbox(
-            label="YouTube URL",
-            placeholder="Enter a YouTube video or playlist URL",
-            scale=4,
-        )
-
-    with gr.Row():
-        analyze_frames = gr.Checkbox(
-            label="Analyze video frames (visual context)",
-            value=True,
-        )
-        num_frames = gr.Slider(
-            label="Number of frames to analyze",
-            minimum=1,
-            maximum=10,
-            value=5,
-            step=1,
-        )
-
-    submit_btn = gr.Button("Analyze Video", variant="primary")
-    output = gr.Markdown(label="Analysis")
-
-    submit_btn.click(
-        fn=process_youtube,
-        inputs=[url_input, analyze_frames, num_frames],
-        outputs=[output],
-    )
+    with gr.Tabs():
+        with gr.TabItem("Analyze Videos"):
+            with gr.Row():
+                url_input = gr.Textbox(
+                    label="YouTube URL",
+                    placeholder="Enter a YouTube video or playlist URL",
+                    scale=4,
+                )
+
+            with gr.Row():
+                analyze_frames = gr.Checkbox(
+                    label="Analyze video frames (visual context)",
+                    value=True,
+                )
+                num_frames = gr.Slider(
+                    label="Number of frames to analyze",
+                    minimum=1,
+                    maximum=10,
+                    value=5,
+                    step=1,
+                )
+
+            submit_btn = gr.Button("Analyze Video", variant="primary")
+            output = gr.Markdown(label="Analysis")
+
+            submit_btn.click(
+                fn=process_youtube,
+                inputs=[url_input, analyze_frames, num_frames],
+                outputs=[output],
+            )
+
+        with gr.TabItem("Chat with Videos"):
+            kb_stats = gr.Markdown()
+            chatbot = gr.Chatbot(label="Video Chat", type="messages")
+            chat_input = gr.Textbox(
+                label="Ask a question about your videos",
+                placeholder="What did the video say about...?",
+            )
+            chat_btn = gr.Button("Ask", variant="primary")
+
+            def respond(message, history, profile):
+                response = chat_with_videos(message, history, profile)
+                history = history or []
+                history.append({"role": "user", "content": message})
+                history.append({"role": "assistant", "content": response})
+                return history, ""
+
+            chat_btn.click(
+                fn=respond,
+                inputs=[chat_input, chatbot],
+                outputs=[chatbot, chat_input],
+            )
+            chat_input.submit(
+                fn=respond,
+                inputs=[chat_input, chatbot],
+                outputs=[chatbot, chat_input],
+            )
+
+            # Update stats on tab load
+            demo.load(get_knowledge_stats, outputs=kb_stats)

     demo.load(hello, inputs=None, outputs=m1)
     demo.load(list_organizations, inputs=None, outputs=m2)
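
One detail of chunk_text above worth spelling out: the loop advances by chunk_size - overlap words, so with the defaults (500-word chunks, 50-word overlap) consecutive chunks share exactly 50 words at their boundary. A small standalone check, repeating the function as committed and feeding it hypothetical dummy words:

# Standalone check of the overlap behaviour of chunk_text (defaults: 500-word
# chunks advancing 450 words at a time, i.e. 50 shared words between neighbours).
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        if chunk:
            chunks.append(chunk)
    return chunks

words = [f"w{i}" for i in range(1200)]          # 1200 dummy words
chunks = chunk_text(" ".join(words))
print(len(chunks))                              # 3 chunks, starting at words 0, 450, 900
first, second = chunks[0].split(), chunks[1].split()
print(first[-50:] == second[:50])               # True: the 50-word overlap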
pyproject.toml CHANGED
@@ -13,4 +13,6 @@ dependencies = [
     "accelerate>=0.25.0",
     "opencv-python-headless>=4.8.0",
     "Pillow>=10.0.0",
+    "chromadb>=0.4.0",
+    "sentence-transformers>=2.2.0",
 ]
requirements.txt ADDED
@@ -0,0 +1,10 @@
+gradio>=6.0.0
+huggingface_hub>=0.20.0
+yt-dlp>=2024.1.0
+transformers>=4.36.0
+torch>=2.0.0
+accelerate>=0.25.0
+opencv-python-headless>=4.8.0
+Pillow>=10.0.0
+chromadb>=0.4.0
+sentence-transformers>=2.2.0
uv.lock CHANGED
The diff for this file is too large to render.