goalit4848 committed on
Commit b7e0e42 · verified · 1 Parent(s): 947c70f

Create app.py

Files changed (1)
  1. app.py +257 -0
app.py ADDED
@@ -0,0 +1,257 @@
Here is the app.py file:

import os
import random
import uuid
import json
import time
import asyncio
from threading import Thread

import gradio as gr
import spaces
import torch
import numpy as np
from PIL import Image
import cv2

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoModel,
    AutoTokenizer,
    AutoProcessor,
    TextIteratorStreamer,
)
from transformers.image_utils import load_image

# Constants for text generation
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load Qwen2.5-VL-7B-Instruct
MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_M,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# Load Qwen2.5-VL-3B-Instruct
MODEL_ID_X = "Qwen/Qwen2.5-VL-3B-Instruct"
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_X,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
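
# downsample_video picks 10 evenly spaced frames from the clip (regardless of its
# length) and pairs each frame with its timestamp in seconds, derived from the
# container's reported FPS.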
def downsample_video(video_path):
    """
    Downsamples the video to evenly spaced frames.
    Each frame is returned as a PIL image along with its timestamp.
    """
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames
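
# Both generate_* functions share the same streaming pattern: build a chat-template
# prompt with the selected processor, run model.generate on a background Thread, and
# read partial text out of a TextIteratorStreamer so the output textbox updates
# incrementally.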
@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Generates responses using the selected model for image input.
    """
    if model_name == "Qwen2.5-VL-7B-Instruct":
        processor = processor_m
        model = model_m
    elif model_name == "Qwen2.5-VL-3B-Instruct":
        processor = processor_x
        model = model_x
    else:
        yield "Invalid model selected."
        return

    if image is None:
        yield "Please upload an image."
        return

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    # Forward the sampling controls from the UI (mirrors generate_video below).
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer
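
# For video, the sampled frames are appended to a single user turn as alternating
# "Frame <timestamp>:" text markers and images, so the model sees the clip as an
# ordered sequence of still frames.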
@spaces.GPU
def generate_video(model_name: str, text: str, video_path: str,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Generates responses using the selected model for video input.
    """
    if model_name == "Qwen2.5-VL-7B-Instruct":
        processor = processor_m
        model = model_m
    elif model_name == "Qwen2.5-VL-3B-Instruct":
        processor = processor_x
        model = model_x
    else:
        yield "Invalid model selected."
        return

    if video_path is None:
        yield "Please upload a video."
        return

    frames = downsample_video(video_path)
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [{"type": "text", "text": text}]}
    ]
    for frame in frames:
        image, timestamp = frame
        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
        messages[1]["content"].append({"type": "image", "image": image})
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer

# Define examples for image and video inference
image_examples = [
    ["Jsonify Data.", "images/1.jpg"],
    ["Explain the pie-chart in detail.", "images/2.jpg"]
]

video_examples = [
    ["Explain the ad in detail", "videos/1.mp4"],
    ["Identify the main actions in the video", "videos/2.mp4"],
    ["Identify the main scenes in the video", "videos/3.mp4"]
]

css = """
.submit-btn {
    background-color: #2980b9 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
"""
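
# UI layout: the left column holds the image/video tabs and the advanced sampling
# sliders; the right column holds the streamed output box, the model selector, and
# the model info notes. Both submit buttons write into the same output textbox.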
# Create the Gradio Interface
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **Qwen2.5-VL**")
    with gr.Row():
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("Image Inference"):
                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                    image_upload = gr.Image(type="pil", label="Image")
                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
                    gr.Examples(
                        examples=image_examples,
                        inputs=[image_query, image_upload]
                    )
                with gr.TabItem("Video Inference"):
                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                    video_upload = gr.Video(label="Video")
                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
                    gr.Examples(
                        examples=video_examples,
                        inputs=[video_query, video_upload]
                    )
            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
        with gr.Column():
            output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
            model_choice = gr.Radio(
                choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct"],
                label="Select Model",
                value="Qwen2.5-VL-7B-Instruct"
            )

            gr.Markdown("**Model Info**")
            gr.Markdown("> [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): Qwen2.5-VL-7B-Instruct is a multimodal model developed by Alibaba Cloud that excels at understanding both text and images. It is a vision-language model (VLM) designed for a range of visual understanding tasks, including image understanding and video analysis, with multilingual support.")
            gr.Markdown("> [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction.")

    image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=output
    )
    video_submit.click(
        fn=generate_video,
        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=output
    )

if __name__ == "__main__":
    demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
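
For reference, app.py imports the following third-party packages, so the Space will also need a requirements file along these lines (a sketch inferred from the imports above; PIL and cv2 are provided by Pillow and opencv-python, and pinned versions are not specified in this commit):

gradio
spaces
torch
numpy
Pillow
opencv-python
transformers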