Spaces:

cweigendev
/

videoanalyzer

Paused

App Files Community

cweigendev committed on Aug 6

Commit

266caad

verified ·

1 Parent(s): dab64dd

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -36

app.py CHANGED Viewed

@@ -1,33 +1,50 @@
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-# HERE IS WHERE THE MODEL NAME GOES ⬇️
 model_name = "DAMO-NLP-SG/VideoRefer-VideoLLaMA3-7B"
-# Load the model function
 def load_model():
     try:
-        # Use the model name here ⬇️
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
-            model_name,  # And here ⬇️
-            torch_dtype=torch.float16,
             device_map="auto",
-            trust_remote_code=True  # May be needed for some models
         )
-        return tokenizer, model
     except Exception as e:
-        return None, None
-# Initialize model (this happens when the Space starts)
-print(f"Loading model: {model_name}")  # And you can use it here ⬇️
-tokenizer, model = load_model()
 def process_video_question(video_file, question):
-    """Process video and answer questions about it"""
-    if model is None:
-        return "Sorry, the model failed to load. Please try again later."
     if video_file is None:
         return "Please upload a video file first."
@@ -36,20 +53,84 @@ def process_video_question(video_file, question):
         return "Please enter a question about the video."
     try:
-        # Your video processing logic would go here
-        # This is a placeholder - you'll need to implement the actual VideoLLaMA3 pipeline
-        # For now, just return a simple response
-        response = f"I received your video and question: '{question}'. Video processing with {model_name} would happen here."
         return response
     except Exception as e:
-        return f"Error processing video: {str(e)}"
 # Create the Gradio interface
-with gr.Blocks(title="VideoLLaMA3 Demo") as demo:
-    gr.Markdown("# 🎥 VideoLLaMA3 Interactive Demo")
-    gr.Markdown(f"**Model:** `{model_name}`")  # Display the model name ⬇️
     gr.Markdown("Upload a video and ask questions about its content!")
     with gr.Row():
@@ -61,25 +142,29 @@ with gr.Blocks(title="VideoLLaMA3 Demo") as demo:
             question_input = gr.Textbox(
                 label="❓ Ask a question about the video",
                 placeholder="What is happening in this video?",
-                lines=2
             )
-            submit_btn = gr.Button("🚀 Analyze Video", variant="primary")
         with gr.Column(scale=1):
             output_text = gr.Textbox(
                 label="🤖 AI Response",
-                lines=10,
-                placeholder="The AI response will appear here..."
             )
     # Examples section
-    gr.Markdown("### 💡 Example Questions:")
-    gr.Markdown("""
-    - "What objects can you see in this video?"
-    - "Describe the main action happening"
-    - "What is the setting or location?"
-    - "How many people are in the video?"
-    """)
     # Connect the button to the function
     submit_btn.click(
@@ -87,6 +172,13 @@ with gr.Blocks(title="VideoLLaMA3 Demo") as demo:
         inputs=[video_input, question_input],
         outputs=output_text
     )
 # Launch the app
 if __name__ == "__main__":

 import gradio as gr
 import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+# Model name
 model_name = "DAMO-NLP-SG/VideoRefer-VideoLLaMA3-7B"
+# Global variables for model and processor
+model = None
+processor = None
 def load_model():
+    global model, processor
     try:
+        print("Loading VideoLLaMA3 model...")
+        print("This may take several minutes on first load...")
+        # Load model with correct parameters based on official documentation
         model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            trust_remote_code=True,
             device_map="auto",
+            torch_dtype=torch.bfloat16,  # Changed from float16 to bfloat16
+            attn_implementation="flash_attention_2",  # Added for better performance
+        )
+        # Load processor (not tokenizer)
+        processor = AutoProcessor.from_pretrained(
+            model_name,
+            trust_remote_code=True
         )
+        print("Model and processor loaded successfully!")
+        return True
     except Exception as e:
+        print(f"Error loading model: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
 def process_video_question(video_file, question):
+    """Process video and answer questions about it using VideoLLaMA3"""
+    global model, processor
+    if model is None or processor is None:
+        return "Model is not loaded. Please wait for the model to initialize or check the logs for errors."
     if video_file is None:
         return "Please upload a video file first."
         return "Please enter a question about the video."
     try:
+        print(f"Processing video: {video_file}")
+        print(f"Question: {question}")
+        # Prepare conversation in the format expected by VideoLLaMA3
+        conversation = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "video": {
+                            "video_path": video_file,
+                            "fps": 1,
+                            "max_frames": 128
+                        }
+                    },
+                    {"type": "text", "text": question}
+                ]
+            }
+        ]
+        # Process the conversation
+        inputs = processor(conversation=conversation, return_tensors="pt")
+        # Move inputs to GPU if available
+        inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+        # Convert pixel values to bfloat16 if present
+        if "pixel_values" in inputs:
+            inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
+        # Generate response
+        print("Generating response...")
+        with torch.no_grad():
+            output_ids = model.generate(
+                **inputs,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
+                use_cache=True
+            )
+        # Decode the response
+        response = processor.decode(output_ids[0], skip_special_tokens=True)
+        # Extract just the assistant's response (remove the conversation history)
+        if "assistant" in response:
+            response = response.split("assistant")[-1].strip()
+        print(f"Generated response: {response}")
         return response
     except Exception as e:
+        error_msg = f"Error processing video: {str(e)}"
+        print(error_msg)
+        import traceback
+        traceback.print_exc()
+        return error_msg
+# Initialize model when the Space starts
+print(f"Initializing {model_name}...")
+model_loaded = load_model()
+if not model_loaded:
+    print("❌ Failed to load model. Check the logs above for details.")
 # Create the Gradio interface
+with gr.Blocks(title="VideoLLaMA3 Demo", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎥 VideoRefer-VideoLLaMA3 Interactive Demo")
+    gr.Markdown(f"**Model:** `{model_name}`")
+    if model_loaded:
+        gr.Markdown("✅ **Model Status:** Loaded and ready!")
+    else:
+        gr.Markdown("❌ **Model Status:** Failed to load. Check logs for details.")
     gr.Markdown("Upload a video and ask questions about its content!")
     with gr.Row():
             question_input = gr.Textbox(
                 label="❓ Ask a question about the video",
                 placeholder="What is happening in this video?",
+                lines=3
             )
+            submit_btn = gr.Button("🚀 Analyze Video", variant="primary", size="lg")
         with gr.Column(scale=1):
             output_text = gr.Textbox(
                 label="🤖 AI Response",
+                lines=12,
+                placeholder="The AI response will appear here...",
+                show_copy_button=True
             )
     # Examples section
+    with gr.Row():
+        gr.Markdown("""
+        ### 💡 Example Questions:
+        - "What objects can you see in this video?"
+        - "Describe the main action happening in detail"
+        - "What is the setting or location of this video?"
+        - "How many people are in the video and what are they doing?"
+        - "What emotions or mood does this video convey?"
+        - "Describe the sequence of events in chronological order"
+        """)
     # Connect the button to the function
     submit_btn.click(
         inputs=[video_input, question_input],
         outputs=output_text
     )
+    # Auto-submit when Enter is pressed in the question box
+    question_input.submit(
+        fn=process_video_question,
+        inputs=[video_input, question_input],
+        outputs=output_text
+    )
 # Launch the app
 if __name__ == "__main__":