Update app.py
app.py CHANGED
@@ -35,30 +35,30 @@ model_loaded = False
 
 @spaces.GPU
 def load_videollama3_model():
-    """Load VideoLLaMA3 model"""
+    """Load VideoLLaMA3 model with the correct implementation"""
     global model, processor, model_loaded
 
     try:
         print("🚀 Loading VideoLLaMA3-7B model...")
 
-
+        model_path = "DAMO-NLP-SG/VideoLLaMA3-7B"
 
         print("Loading processor...")
         processor = AutoProcessor.from_pretrained(
-
+            model_path,
             trust_remote_code=True
         )
 
         print("Loading VideoLLaMA3 model (this may take several minutes)...")
         model = AutoModelForCausalLM.from_pretrained(
-
+            model_path,
             trust_remote_code=True,
-            device_map="
+            device_map={"": device},
             torch_dtype=torch.bfloat16,
         )
 
         model_loaded = True
-        success_msg = "✅ VideoLLaMA3-7B model loaded successfully!
+        success_msg = "✅ VideoLLaMA3-7B model loaded successfully! Ready for video analysis."
         print(success_msg)
         return success_msg
 
@@ -70,7 +70,7 @@ def load_videollama3_model():
 
 @spaces.GPU
 def analyze_video_with_videollama3(video_file, question, progress=gr.Progress()):
-    """Analyze video using VideoLLaMA3"""
+    """Analyze video using VideoLLaMA3 - REAL implementation"""
 
     if video_file is None:
         return "❌ Please upload a video file first."
@@ -82,63 +82,72 @@ def analyze_video_with_videollama3(video_file, question, progress=gr.Progress())
         return "❌ VideoLLaMA3 model is not loaded. Please click 'Load VideoLLaMA3 Model' first and wait for completion."
 
     try:
-        progress(0.1, desc="Preparing video for
+        progress(0.1, desc="Preparing video for VideoLLaMA3...")
 
-        # Create the conversation
+        # Create the exact conversation format from VideoLLaMA3 official implementation
         conversation = [
-            {"role": "system", "content": "You are a helpful assistant
+            {"role": "system", "content": "You are a helpful assistant."},
             {
                 "role": "user",
                 "content": [
-                    {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames":
+                    {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames": 128}},
                     {"type": "text", "text": question}
                 ]
             }
         ]
 
-        progress(0.3, desc="Processing
+        progress(0.3, desc="Processing with VideoLLaMA3...")
 
-        #
-        inputs = processor(
+        # Use the EXACT processor call from official VideoLLaMA3 code
+        inputs = processor(
+            conversation=conversation,
+            add_system_prompt=True,
+            add_generation_prompt=True,
+            return_tensors="pt"
+        )
+
+        # Move inputs to device
         inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
 
         if "pixel_values" in inputs:
             inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
 
-        progress(0.7, desc="Generating
+        progress(0.7, desc="Generating VideoLLaMA3 response...")
 
-        # Generate response
+        # Generate response with VideoLLaMA3
         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
                 max_new_tokens=512,
-                temperature=0.7,
                 do_sample=True,
+                temperature=0.1,
+                use_cache=True,
                 pad_token_id=processor.tokenizer.eos_token_id,
                 eos_token_id=processor.tokenizer.eos_token_id
             )
 
-        progress(0.9, desc="Processing response...")
+        progress(0.9, desc="Processing VideoLLaMA3 response...")
 
-        # Decode response
+        # Decode the response
         response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
 
-        # Extract assistant response
+        # Extract assistant response - VideoLLaMA3 specific parsing
         if "assistant" in response.lower():
             ai_response = response.split("assistant")[-1].strip()
-        elif "
-
+        elif "<|im_start|>assistant" in response:
+            ai_response = response.split("<|im_start|>assistant")[-1].strip()
+        else:
+            # Fallback: extract everything after the user's question
+            parts = response.split(question)
             if len(parts) > 1:
                 ai_response = parts[-1].strip()
             else:
                 ai_response = response.strip()
-        else:
-            ai_response = response.strip()
 
-        # Clean up
-        ai_response = ai_response.replace("</s>", "").strip()
+        # Clean up response tokens
+        ai_response = ai_response.replace("<|im_end|>", "").replace("</s>", "").strip()
 
-        # Get video
+        # Get video metadata
         cap = cv2.VideoCapture(video_file)
         fps = cap.get(cv2.CAP_PROP_FPS)
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -147,74 +156,52 @@ def analyze_video_with_videollama3(video_file, question, progress=gr.Progress())
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         cap.release()
 
-        progress(1.0, desc="
+        progress(1.0, desc="VideoLLaMA3 analysis complete!")
 
-        # Format
+        # Format response
         formatted_response = f"""🎥 **VideoLLaMA3 AI Video Analysis**
 
 ❓ **Your Question:**
 {question}
 
-🤖 **
+🤖 **VideoLLaMA3 Response:**
 {ai_response}
 
-📊 **Video
+📊 **Video Details:**
 • Duration: {duration:.1f} seconds
+• Resolution: {width}x{height}
 • Frame Rate: {fps:.1f} FPS
 • Total Frames: {total_frames:,}
-•
+• Analyzed with: Up to 128 frames at 1 FPS
 
-⚡ **Powered by:** VideoLLaMA3-7B (
+⚡ **Powered by:** VideoLLaMA3-7B (Official Implementation)
 """
 
         return formatted_response
 
     except Exception as e:
-        error_msg = f"❌
-        print(
-
-        # Fallback: Basic video analysis if VideoLLaMA3 fails
-        try:
-            cap = cv2.VideoCapture(video_file)
-            fps = cap.get(cv2.CAP_PROP_FPS)
-            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-            duration = total_frames / fps if fps > 0 else 0
-            cap.release()
-
-            fallback_response = f"""⚠ VideoLLaMA3 analysis failed, but here's what I can tell you:
-
-**Video Technical Info:**
-• Duration: {duration:.1f} seconds
-• Frame Rate: {fps:.1f} FPS
-• Total Frames: {total_frames:,}
-
-**Error:** {str(e)}
-
-**Suggestion:** Try reloading the model or using a shorter video file.
-"""
-            return fallback_response
-
-        except:
-            return error_msg
 
 def create_interface():
-    """Create the
+    """Create the VideoLLaMA3 interface"""
 
-    with gr.Blocks(title="VideoLLaMA3
+    with gr.Blocks(title="VideoLLaMA3 Official", theme=gr.themes.Soft()) as demo:
         gr.Markdown("# 🎥 VideoLLaMA3 Video Analysis Tool")
-        gr.Markdown("Upload videos and get detailed AI
+        gr.Markdown("**Official VideoLLaMA3-7B implementation** - Upload videos and get detailed AI analysis!")
 
         # Model loading section
         with gr.Row():
             with gr.Column(scale=3):
                 model_status = gr.Textbox(
-                    label="🤖 Model Status",
-                    value="Model not loaded - Click
+                    label="🤖 VideoLLaMA3 Model Status",
+                    value="Model not loaded - Click button to load VideoLLaMA3-7B →",
                     interactive=False,
                     lines=2
                 )
             with gr.Column(scale=1):
-                load_btn = gr.Button("🚀 Load VideoLLaMA3
+                load_btn = gr.Button("🚀 Load VideoLLaMA3", variant="primary", size="lg")
 
+        error_msg = f"❌ VideoLLaMA3 analysis failed: {str(e)}"
+        print(f"Full error: {e}")
+        return error_msg
 
         load_btn.click(load_videollama3_model, outputs=model_status)
 
@@ -228,31 +215,31 @@ def create_interface():
                     height=350
                 )
                 question_input = gr.Textbox(
-                    label="❓ Ask about the video",
+                    label="❓ Ask VideoLLaMA3 about the video",
                     placeholder="What is happening in this video? Describe it in detail.",
                     lines=3,
                     max_lines=5
                 )
-                analyze_btn = gr.Button("🔍 Analyze
+                analyze_btn = gr.Button("🔍 Analyze with VideoLLaMA3", variant="primary", size="lg")
 
             with gr.Column(scale=1):
                 output = gr.Textbox(
-                    label="🎯
+                    label="🎯 VideoLLaMA3 Analysis Results",
                     lines=25,
                     max_lines=30,
                     show_copy_button=True
                 )
 
         # Example questions
-        gr.Markdown("### 💡 Example Questions
+        gr.Markdown("### 💡 Example Questions for VideoLLaMA3:")
 
         example_questions = [
             "What is happening in this video? Describe the scene in detail.",
             "Who are the people in this video and what are they doing?",
             "Describe the setting, location, and environment shown.",
-            "What objects
-            "What is the mood
-            "
+            "What objects can you identify in this video?",
+            "What is the mood or atmosphere of this video?",
+            "Can you summarize the key events chronologically?"
        ]
 
        with gr.Row():
@@ -265,7 +252,7 @@ def create_interface():
                btn2 = gr.Button(example_questions[i+1], size="sm")
                btn2.click(lambda x=example_questions[i+1]: x, outputs=question_input)
 
-        # Connect
+        # Connect analyze button
        analyze_btn.click(
            analyze_video_with_videollama3,
            inputs=[video_input, question_input],
@@ -275,16 +262,17 @@ def create_interface():
 
        gr.Markdown("---")
        gr.Markdown("""
-        ### 📋
-        1. **
-        2. **
-        3. **Ask:** Type
-        4. **Analyze:** Click "Analyze
+        ### 📋 How to Use VideoLLaMA3:
+        1. **Load Model:** Click "Load VideoLLaMA3" and wait (~10 minutes for first load)
+        2. **Upload Video:** Choose your video file (works best under 2 minutes)
+        3. **Ask Question:** Type what you want to know about the video
+        4. **Analyze:** Click "Analyze with VideoLLaMA3" for AI-powered insights
 
-
-        -
-        -
-        -
+        ### 🔧 Technical Details:
+        - **Model:** VideoLLaMA3-7B (Official DAMO-NLP-SG implementation)
+        - **Analysis:** Processes up to 128 frames at 1 FPS sampling
+        - **Capabilities:** Video understanding, object detection, scene description, temporal reasoning
+        - **Best Performance:** Videos under 2 minutes, clear visuals, specific questions
        """)
 
        return demo
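
A note on the new response parsing: the assistant text is recovered by string-splitting the decoded output on "assistant", with a "<|im_start|>assistant" branch and a question-split fallback. A common alternative with generate() is to decode only the newly produced tokens by slicing at the prompt length, which sidesteps the split heuristics and the "<|im_end|>" cleanup. A minimal sketch, not part of this commit, reusing inputs, output_ids, and processor from the flow above:

# Hypothetical alternative to the string-split parsing in the diff:
# decode only the tokens generated after the prompt.
prompt_len = inputs["input_ids"].shape[1]    # number of prompt tokens fed to generate()
generated_ids = output_ids[:, prompt_len:]   # keep only newly generated tokens
ai_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()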