cweigendev committed on
Commit
b6537a4
·
verified ·
1 Parent(s): e43572f

updating app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -41
app.py CHANGED
@@ -6,20 +6,20 @@ from PIL import Image
6
  import spaces
7
  import tempfile
8
  import os
9
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
10
  import warnings
11
  warnings.filterwarnings("ignore")
12
 
13
  # Global variables
14
  model = None
15
- tokenizer = None
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
  model_loaded = False
18
 
19
  @spaces.GPU
20
  def load_videollama3_model():
21
  """Load VideoLLaMA3 model with proper configuration"""
22
- global model, tokenizer, model_loaded
23
 
24
  try:
25
  print("πŸ”„ Loading VideoLLaMA3-7B model...")
@@ -34,17 +34,13 @@ def load_videollama3_model():
34
  bnb_4bit_quant_type="nf4"
35
  )
36
 
37
- # Load tokenizer
38
- print("Loading tokenizer...")
39
- tokenizer = AutoTokenizer.from_pretrained(
40
  model_name,
41
- trust_remote_code=True,
42
- use_fast=False
43
  )
44
 
45
- if tokenizer.pad_token is None:
46
- tokenizer.pad_token = tokenizer.eos_token
47
-
48
  # Load model
49
  print("Loading VideoLLaMA3 model (this may take several minutes)...")
50
  model = AutoModelForCausalLM.from_pretrained(
@@ -53,7 +49,7 @@ def load_videollama3_model():
53
  device_map="auto",
54
  torch_dtype=torch.float16,
55
  trust_remote_code=True,
56
- low_cpu_mem_usage=True,
57
  )
58
 
59
  model_loaded = True
@@ -141,33 +137,26 @@ def analyze_video_with_ai(video_file, question, progress=gr.Progress()):
141
 
142
  progress(0.3, desc="Preparing AI input...")
143
 
144
- # Create a detailed prompt for video analysis
145
- system_prompt = "You are VideoLLaMA3, an advanced AI assistant specialized in video understanding. Analyze the video frames and provide detailed, accurate responses about the video content."
 
 
 
 
 
 
 
 
 
146
 
147
- user_prompt = f"""I have a video with the following specifications:
148
- - Duration: {video_info['duration']:.1f} seconds
149
- - Original FPS: {video_info['original_fps']:.1f}
150
- - Total frames: {video_info['total_frames']}
151
- - Analyzed frames: {video_info['extracted_frames']}
152
- - Resolution: {video_info['resolution']}
153
-
154
- Question: {question}
155
-
156
- Please analyze the video content and provide a comprehensive answer based on what you observe in the video frames."""
157
-
158
  progress(0.5, desc="Processing with VideoLLaMA3...")
159
 
160
- # Prepare conversation format
161
- conversation = f"System: {system_prompt}\n\nHuman: {user_prompt}\n\nAssistant:"
 
162
 
163
- # Tokenize input
164
- inputs = tokenizer(
165
- conversation,
166
- return_tensors="pt",
167
- max_length=2048,
168
- truncation=True,
169
- padding=True
170
- ).to(device)
171
 
172
  progress(0.7, desc="Generating AI response...")
173
 
@@ -180,18 +169,18 @@ Please analyze the video content and provide a comprehensive answer based on wha
180
  do_sample=True,
181
  top_p=0.9,
182
  repetition_penalty=1.1,
183
- pad_token_id=tokenizer.eos_token_id,
184
- eos_token_id=tokenizer.eos_token_id
185
  )
186
 
187
  # Decode response
188
- full_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
189
 
190
  # Extract just the assistant's response
191
- if "Assistant:" in full_response:
192
- ai_response = full_response.split("Assistant:")[-1].strip()
193
  else:
194
- ai_response = full_response.split(conversation)[-1].strip()
195
 
196
  progress(0.9, desc="Formatting results...")
197
 
 
6
  import spaces
7
  import tempfile
8
  import os
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
10
  import warnings
11
  warnings.filterwarnings("ignore")
12
 
13
  # Global variables
14
  model = None
15
+ processor = None
16
  device = "cuda" if torch.cuda.is_available() else "cpu"
17
  model_loaded = False
18
 
19
  @spaces.GPU
20
  def load_videollama3_model():
21
  """Load VideoLLaMA3 model with proper configuration"""
22
+ global model, processor, model_loaded
23
 
24
  try:
25
  print("πŸ”„ Loading VideoLLaMA3-7B model...")
 
34
  bnb_4bit_quant_type="nf4"
35
  )
36
 
37
+ # Load processor (handles both text and video)
38
+ print("Loading processor...")
39
+ processor = AutoProcessor.from_pretrained(
40
  model_name,
41
+ trust_remote_code=True
 
42
  )
43
 
 
 
 
44
  # Load model
45
  print("Loading VideoLLaMA3 model (this may take several minutes)...")
46
  model = AutoModelForCausalLM.from_pretrained(
 
49
  device_map="auto",
50
  torch_dtype=torch.float16,
51
  trust_remote_code=True,
52
+ low_cpu_mem_usage=True
53
  )
54
 
55
  model_loaded = True
 
137
 
138
  progress(0.3, desc="Preparing AI input...")
139
 
140
+ # Create proper conversation format for VideoLLaMA3
141
+ conversation = [
142
+ {"role": "system", "content": "You are a helpful assistant that can analyze videos."},
143
+ {
144
+ "role": "user",
145
+ "content": [
146
+ {"type": "video", "video": {"video_path": video_file, "fps": 1, "max_frames": 16}},
147
+ {"type": "text", "text": question}
148
+ ]
149
+ }
150
+ ]
151
 
 
 
 
 
 
 
 
 
 
 
 
152
  progress(0.5, desc="Processing with VideoLLaMA3...")
153
 
154
+ # Process the conversation with video
155
+ inputs = processor(conversation=conversation, return_tensors="pt")
156
+ inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
157
 
158
+ if "pixel_values" in inputs:
159
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)
 
 
 
 
 
 
160
 
161
  progress(0.7, desc="Generating AI response...")
162
 
 
169
  do_sample=True,
170
  top_p=0.9,
171
  repetition_penalty=1.1,
172
+ pad_token_id=processor.tokenizer.eos_token_id,
173
+ eos_token_id=processor.tokenizer.eos_token_id
174
  )
175
 
176
  # Decode response
177
+ response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
178
 
179
  # Extract just the assistant's response
180
+ if "assistant" in response.lower():
181
+ ai_response = response.split("assistant")[-1].strip()
182
  else:
183
+ ai_response = response.strip()
184
 
185
  progress(0.9, desc="Formatting results...")
186