Spaces:
Paused
updating app.py
app.py
CHANGED
@@ -6,20 +6,20 @@ from PIL import Image
 import spaces
 import tempfile
 import os
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
 import warnings
 warnings.filterwarnings("ignore")
 
 # Global variables
 model = None
-tokenizer = None
+processor = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_loaded = False
 
 @spaces.GPU
 def load_videollama3_model():
     """Load VideoLLaMA3 model with proper configuration"""
-    global model, tokenizer, model_loaded
+    global model, processor, model_loaded
 
     try:
         print("🚀 Loading VideoLLaMA3-7B model...")
@@ -34,17 +34,13 @@ def load_videollama3_model():
             bnb_4bit_quant_type="nf4"
         )
 
-        # Load tokenizer
-        print("Loading tokenizer...")
-        tokenizer = AutoTokenizer.from_pretrained(
+        # Load processor (handles both text and video)
+        print("Loading processor...")
+        processor = AutoProcessor.from_pretrained(
             model_name,
-            trust_remote_code=True,
-            use_fast=False
+            trust_remote_code=True
         )
 
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-
         # Load model
         print("Loading VideoLLaMA3 model (this may take several minutes)...")
         model = AutoModelForCausalLM.from_pretrained(
@@ -53,7 +49,7 @@ def load_videollama3_model():
             device_map="auto",
             torch_dtype=torch.float16,
             trust_remote_code=True,
-            low_cpu_mem_usage=True
+            low_cpu_mem_usage=True
         )
 
         model_loaded = True
@@ -141,33 +137,26 @@ def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
 
     progress(0.3, desc="Preparing AI input...")
 
-    # Create prompt with video information
-
-    user_prompt = f"""I have a video with the following specifications:
-    - Duration: {video_info['duration']:.1f} seconds
-    - Original FPS: {video_info['original_fps']:.1f}
-    - Total frames: {video_info['total_frames']}
-    - Analyzed frames: {video_info['extracted_frames']}
-    - Resolution: {video_info['resolution']}
-
-    Question: {question}
-
-    Please analyze the video content and provide a comprehensive answer based on what you observe in the video frames."""
+    # Create proper conversation format for VideoLLaMA3
+    conversation = [
+        {"role": "system", "content": "You are a helpful assistant that can analyze videos."},
+        {
+            "role": "user",
+            "content": [
+                {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames": 16}},
+                {"type": "text", "text": question}
+            ]
+        }
+    ]
 
     progress(0.5, desc="Processing with VideoLLaMA3...")
 
-    # Tokenize the input
-    inputs = tokenizer(
-        conversation,
-        return_tensors="pt",
-        max_length=2048,
-        truncation=True,
-        padding=True
-    ).to(device)
+    # Process the conversation with video
+    inputs = processor(conversation=conversation, return_tensors="pt")
+    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+
+    if "pixel_values" in inputs:
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)
 
     progress(0.7, desc="Generating AI response...")
@@ -180,18 +169,18 @@ Please analyze the video content and provide a comprehensive answer based on what you observe in the video frames."""
         do_sample=True,
         top_p=0.9,
         repetition_penalty=1.1,
-        pad_token_id=tokenizer.eos_token_id,
-        eos_token_id=tokenizer.eos_token_id
+        pad_token_id=processor.tokenizer.eos_token_id,
+        eos_token_id=processor.tokenizer.eos_token_id
     )
 
     # Decode response
-    response = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
+    response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
 
     # Extract just the assistant's response
-    if "assistant" in response.lower():
-        ai_response = response.split("assistant")[-1].strip()
+    if "assistant" in response.lower():
+        ai_response = response.split("assistant")[-1].strip()
     else:
-        ai_response = response.strip()
+        ai_response = response.strip()
 
     progress(0.9, desc="Formatting results...")
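For context, a minimal self-contained sketch of the loading path after this commit — not the Space's exact code. The checkpoint id and the quantization_config wiring are assumptions, since the hunks never show where model_name is defined or how the BitsAndBytesConfig is passed to from_pretrained:

```python
# Hypothetical sketch of the post-commit loading path.
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig

model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"  # assumed; the diff never shows model_name's value

# 4-bit NF4 quantization, matching the bnb_4bit_quant_type="nf4" fragment above
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

# AutoProcessor bundles the tokenizer with the video preprocessor, which is why
# the commit can drop the separate AutoTokenizer and pad_token handling.
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # assumed wiring; the hunk cuts off before this argument
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
```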
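And a matching sketch of the new inference path, continuing from the loading sketch above. The conversation=... keyword and the video dict follow the diff and rely on the checkpoint's remote-code processor; max_new_tokens and the sample video path are placeholders the hunks do not show:

```python
# Hypothetical end-to-end inference sketch; assumes `model` and `processor`
# from the loading sketch above.
import torch

conversation = [
    {"role": "system", "content": "You are a helpful assistant that can analyze videos."},
    {
        "role": "user",
        "content": [
            {"type": "video", "video": {"video_path": "clip.mp4", "fps": 1, "max_frames": 16}},
            {"type": "text", "text": "What happens in this video?"},
        ],
    },
]

inputs = processor(conversation=conversation, return_tensors="pt")
# Move tensors to the model's device; non-tensor entries pass through untouched.
inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
if "pixel_values" in inputs:
    # Cast frames to fp16 so they match the half-precision weights.
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)

output_ids = model.generate(
    **inputs,
    max_new_tokens=512,  # placeholder; the visible hunk starts at do_sample
    do_sample=True,
    top_p=0.9,
    repetition_penalty=1.1,
    pad_token_id=processor.tokenizer.eos_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
)

# Decode the full transcript, then keep only the assistant turn
# (the case-sensitive split mirrors the app's own extraction logic).
response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
if "assistant" in response.lower():
    ai_response = response.split("assistant")[-1].strip()
else:
    ai_response = response.strip()
print(ai_response)
```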