John Ho committed
Commit 1df8e73 · 1 Parent(s): 88958c8

trying a different inference script

Files changed (2)
  1. README.md +1 -1
  2. app_qwen25vl.py +265 -0
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: blue
 colorTo: yellow
 sdk: gradio
 sdk_version: 5.32.0
-app_file: app.py
+app_file: app_qwen25vl.py # app.py
 pinned: false
 short_description: Demo of the camera motion detection as part of CameraBench
 ---
app_qwen25vl.py ADDED
@@ -0,0 +1,265 @@
+ # Standard library imports
+ import os
+ from datetime import datetime
+ import subprocess
+ import time
+ import uuid
+ import io
+ from threading import Thread
+
+ # Third-party imports
+ import numpy as np
+ import torch
+ from PIL import Image
+ import accelerate
+ import gradio as gr
+ import spaces
+ from transformers import (
+     Qwen2_5_VLForConditionalGeneration,
+     AutoTokenizer,
+     AutoProcessor,
+     TextIteratorStreamer,
+ )
+
+ # Local imports
+ from qwen_vl_utils import process_vision_info
+
+ # Device-agnostic setup
+ if torch.cuda.is_available():
+     device = "cuda"
+ elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
+     device = "mps"
+ else:
+     device = "cpu"
+
+ print(f"[INFO] Using device: {device}")
+
+ # Define supported media extensions
+ image_extensions = Image.registered_extensions()  # keys are dotted, e.g. ".png"
+ video_extensions = (
+     ".avi",
+     ".mp4",
+     ".mov",
+     ".mkv",
+     ".flv",
+     ".wmv",
+     ".mjpeg",
+     ".gif",
+     ".webm",
+     ".m4v",
+     ".3gp",
+ )  # Dotted to match os.path.splitext output; .wav removed as it's audio, not video
+
+
+ def identify_and_save_blob(blob_path):
+     """
+     Identifies if the blob is an image or video and saves it with a unique name.
+     Returns the saved file path and its media type ("image" or "video").
+     """
+     try:
+         with open(blob_path, "rb") as file:
+             blob_content = file.read()
+
+         # Try to identify if it's an image
+         try:
+             Image.open(
+                 io.BytesIO(blob_content)
+             ).verify()  # Check if it's a valid image
+             extension = ".png"  # Default to PNG for saving
+             media_type = "image"
+         except (IOError, SyntaxError):
+             # If it's not a valid image, assume it's a video.
+             # We can try to get the actual extension from the blob_path,
+             # but for unknown types, MP4 is a good default.
+             _, ext = os.path.splitext(blob_path)
+             if ext.lower() in video_extensions:
+                 extension = ext.lower()
+             else:
+                 extension = ".mp4"  # Default to MP4 for saving
+             media_type = "video"
+
+         # Create a unique filename
+         filename = f"temp_{uuid.uuid4()}_media{extension}"
+         with open(filename, "wb") as f:
+             f.write(blob_content)
+
+         return filename, media_type
+
+     except FileNotFoundError:
+         raise ValueError(f"The file {blob_path} was not found.")
+     except Exception as e:
+         raise ValueError(f"An error occurred while processing the file: {e}")
+
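+ # Usage sketch (hypothetical path): for an unlabeled Gradio blob upload,
+ # identify_and_save_blob("/tmp/gradio/blob") returns something like
+ # ("temp_<uuid>_media.mp4", "video"); cleanup of the temp file is left to the caller.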
+
+ # Model and Processor Loading
+ # Define models and processors as dictionaries for easy selection
+ models = {
+     "Qwen/Qwen2.5-VL-7B-Instruct": Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         "Qwen/Qwen2.5-VL-7B-Instruct",
+         trust_remote_code=True,
+         torch_dtype="auto",
+         device_map="auto",
+     ).eval(),
+     "Qwen/Qwen2.5-VL-3B-Instruct": Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         "Qwen/Qwen2.5-VL-3B-Instruct",
+         trust_remote_code=True,
+         torch_dtype="auto",
+         device_map="auto",
+     ).eval(),
+ }
+
+ processors = {
+     "Qwen/Qwen2.5-VL-7B-Instruct": AutoProcessor.from_pretrained(
+         "Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True
+     ),
+     "Qwen/Qwen2.5-VL-3B-Instruct": AutoProcessor.from_pretrained(
+         "Qwen/Qwen2.5-VL-3B-Instruct", trust_remote_code=True
+     ),
+ }
+
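+ # Both checkpoints are loaded eagerly at import time, so both stay resident in
+ # GPU memory for the life of the process. A lazy-loading variant (a sketch,
+ # not part of this script) would fetch and cache on first selection instead:
+ #
+ #     _model_cache = {}
+ #     def get_model(model_id):
+ #         if model_id not in _model_cache:
+ #             _model_cache[model_id] = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+ #                 model_id, trust_remote_code=True, torch_dtype="auto", device_map="auto"
+ #             ).eval()
+ #         return _model_cache[model_id]
+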
+ DESCRIPTION = "[Qwen2.5-VL Demo](https://huggingface.co/collections/Qwen/qwen25-vl-6795ffac22b334a837c0f9a5)"
+
+
+ @spaces.GPU
+ def run_example(
+     video_path: str, text_input: str, model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct"
+ ):
+     # if media_input is None:
+     #     raise gr.Error("No media provided. Please upload an image or video before submitting.")
+     # if model_id is None:
+     #     raise gr.Error("No model selected. Please select a model.")
+
+     start_time = time.time()
+
+     # media_path = None
+     # media_type = None
+
+     # # Determine if it's an image (numpy array from gr.Image) or a file (from gr.File)
+     # if isinstance(media_input, np.ndarray):  # This comes from gr.Image
+     #     img = Image.fromarray(np.uint8(media_input))
+     #     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     #     filename = f"image_{timestamp}.png"
+     #     img.save(filename)
+     #     media_path = os.path.abspath(filename)
+     #     media_type = "image"
+     # elif isinstance(media_input, str):  # This comes from gr.File (filepath)
+     #     path = media_input
+     #     _, ext = os.path.splitext(path)
+     #     ext = ext.lower()
+
+     #     if ext in image_extensions:
+     #         media_path = path
+     #         media_type = "image"
+     #     elif ext in video_extensions:
+     #         media_path = path
+     #         media_type = "video"
+     #     else:
+     #         # For blobs or unknown file types, try to identify
+     #         try:
+     #             media_path, media_type = identify_and_save_blob(path)
+     #             print(f"Identified blob as: {media_type}, saved to: {media_path}")
+     #         except Exception as e:
+     #             print(f"Error identifying blob: {e}")
+     #             raise gr.Error("Unsupported media type. Please upload an image (PNG, JPG, etc.) or a video (MP4, AVI, etc.).")
+     # else:
+     #     raise gr.Error("Unsupported input type for media. Please upload an image or video.")
+
+     # print(f"[INFO] Processing {media_type} from {media_path}")
+
+     model = models[model_id]
+     processor = processors[model_id]
+
+     # Construct the message content; this script handles video input only
+     content_list = []
+     # if media_type == "image":
+     #     content_list.append({"type": "image", "image": media_path})
+     # elif media_type == "video":
+     #     content_list.append({"type": "video", "video": media_path, "fps": 8.0})  # Qwen2.5-VL often uses 8 fps
+     content_list.append({"type": "video", "video": video_path, "fps": 8.0})
+     content_list.append({"type": "text", "text": text_input})
+     # if text_input:
+     #     content_list.append({"type": "text", "text": text_input})
+     # else:
+     #     # Default prompt if no text_input is provided
+     #     content_list.append({"type": "text", "text": "What is in this image/video?"})
+
+     messages = [{"role": "user", "content": content_list}]
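+     # The resulting structure looks like:
+     # [{"role": "user", "content": [
+     #     {"type": "video", "video": "<video_path>", "fps": 8.0},
+     #     {"type": "text", "text": "<text_input>"}]}]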
+
+     # Preparation for inference
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(
+         messages
+     )  # This utility handles both image and video info
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     ).to(device)
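+     # For video-only input, `inputs` typically carries input_ids and
+     # attention_mask plus the processor's video tensors (e.g.
+     # pixel_values_videos and video_grid_thw for Qwen2.5-VL).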
+
+     # Inference: generation of the output using streaming
+     streamer = TextIteratorStreamer(
+         processor.tokenizer, skip_prompt=True, skip_special_tokens=True
+     )
+     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+
+     # Start generation in a separate thread to allow streaming
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         yield buffer, None  # Yield partial text and None for time until full generation
+     # Clean up the temporary file after it's processed (optional, depends on use case)
+     # if media_path and os.path.exists(media_path) and "temp_" in os.path.basename(media_path):
+     #     os.remove(media_path)
+
+     end_time = time.time()
+     total_time = round(end_time - start_time, 2)
+
+     # Final yield with total time
+     yield buffer, f"{total_time} seconds"
+
+     # Clean up the temporary file after it's fully processed
+     # if media_path and os.path.exists(media_path) and "temp_" in os.path.basename(media_path):
+     #     os.remove(media_path)
+     #     print(f"[INFO] Cleaned up temporary file: {media_path}")
+
+
+ css = """
+ #output {
+     height: 500px;
+     overflow: auto;
+     border: 1px solid #ccc;
+ }
+ """
+
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Tab(label="Qwen2.5-VL Input"):
+         with gr.Row():
+             with gr.Column():
+                 # This script takes video input only (the earlier gr.File
+                 # image/video handling is kept above, commented out)
+                 input_media = gr.Video(label="Input Video")
+                 text_input = gr.Textbox(
+                     label="Text Prompt",
+                     value="Describe the camera motion in this video.",
+                 )
+                 model_selector = gr.Dropdown(
+                     choices=list(models.keys()),
+                     label="Model",
+                     value="Qwen/Qwen2.5-VL-7B-Instruct",
+                 )
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_text = gr.Textbox(label="Output Text", interactive=False)
+                 time_taken = gr.Textbox(
+                     label="Time taken for processing + inference", interactive=False
+                 )
+
+     submit_btn.click(
+         run_example,
+         [input_media, text_input, model_selector],
+         [output_text, time_taken],
+     )  # Ensure output components match yield order
+
+ demo.launch(debug=True)
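+ # Local smoke test (a sketch; assumes a short clip at the hypothetical path
+ # ./sample.mp4 and enough GPU/MPS memory for the 7B checkpoint):
+ # for partial, elapsed in run_example("sample.mp4", "Describe the camera motion in this video."):
+ #     print(partial if elapsed is None else f"finished in {elapsed}")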