prithivMLmods committed on
Commit
b560a0e
·
verified ·
1 Parent(s): 6d94394

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -32
app.py CHANGED
@@ -5,6 +5,8 @@ import json
5
  import time
6
  import asyncio
7
  from threading import Thread
 
 
8
 
9
  import gradio as gr
10
  import spaces
@@ -13,6 +15,7 @@ import numpy as np
13
  from PIL import Image
14
  import cv2
15
  import requests
 
16
 
17
  from transformers import (
18
  Qwen3VLMoeForConditionalGeneration,
@@ -26,7 +29,6 @@ MAX_MAX_NEW_TOKENS = 4096
26
  DEFAULT_MAX_NEW_TOKENS = 2048
27
 
28
  # Let the environment (e.g., Hugging Face Spaces) determine the device.
29
- # This avoids conflicts with the CUDA environment setup by the platform.
30
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31
 
32
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
@@ -41,9 +43,6 @@ if torch.cuda.is_available():
41
  print("Using device:", device)
42
  # --- Model Loading ---
43
 
44
- # To address the warnings, we add `use_fast=False` to ensure we use the
45
- # processor version the model was originally saved with.
46
-
47
  # Load Qwen3VL
48
  MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
49
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
@@ -57,13 +56,11 @@ model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
57
  def downsample_video(video_path):
58
  """
59
  Downsamples the video to evenly spaced frames.
60
- Each frame is returned as a PIL image along with its timestamp.
61
  """
62
  vidcap = cv2.VideoCapture(video_path)
63
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
64
  fps = vidcap.get(cv2.CAP_PROP_FPS)
65
  frames = []
66
- # Use a maximum of 10 frames to avoid excessive memory usage
67
  frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
68
  for i in frame_indices:
69
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -71,11 +68,29 @@ def downsample_video(video_path):
71
  if success:
72
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
73
  pil_image = Image.fromarray(image)
74
- timestamp = round(i / fps, 2)
75
- frames.append((pil_image, timestamp))
76
  vidcap.release()
77
  return frames
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  @spaces.GPU
80
  def generate_image(text: str, image: Image.Image,
81
  max_new_tokens: int = 1024,
@@ -84,7 +99,7 @@ def generate_image(text: str, image: Image.Image,
84
  top_k: int = 50,
85
  repetition_penalty: float = 1.2):
86
  """
87
- Generates responses using the Qwen3-VL model for image input.
88
  """
89
  if image is None:
90
  yield "Please upload an image.", "Please upload an image."
@@ -93,10 +108,7 @@ def generate_image(text: str, image: Image.Image,
93
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
94
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
95
 
96
- # FIX: Removed truncation=True and max_length to prevent the ValueError
97
- inputs = processor_q3vl(
98
- text=[prompt_full], images=[image], return_tensors="pt", padding=True
99
- ).to(device)
100
 
101
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
102
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
@@ -116,30 +128,23 @@ def generate_video(text: str, video_path: str,
116
  top_k: int = 50,
117
  repetition_penalty: float = 1.2):
118
  """
119
- Generates responses using the Qwen3-VL model for video input.
120
  """
121
  if video_path is None:
122
  yield "Please upload a video.", "Please upload a video."
123
  return
124
 
125
- frames_with_ts = downsample_video(video_path)
126
- if not frames_with_ts:
127
  yield "Could not process video.", "Could not process video."
128
  return
129
 
130
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
131
- images_for_processor = []
132
- # Add an <|image|> placeholder for each frame in the message
133
- for frame, timestamp in frames_with_ts:
134
- messages[0]["content"].insert(0, {"type": "image"}) # Insert at beginning to match common patterns
135
- images_for_processor.append(frame)
136
 
137
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
138
-
139
- # FIX: Removed truncation=True and max_length to prevent the ValueError
140
- inputs = processor_q3vl(
141
- text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True
142
- ).to(device)
143
 
144
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
145
  generation_kwargs = {
@@ -156,15 +161,57 @@ def generate_video(text: str, video_path: str,
156
  time.sleep(0.01)
157
  yield buffer, buffer
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
- # Define examples for image and video inference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  image_examples = [
162
  ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
163
  ["Convert this page to doc [markdown] precisely.", "images/3.png"],
164
- ["Convert this page to doc [markdown] precisely.", "images/4.png"],
165
  ["Explain the creativity in the image.", "images/6.jpg"],
166
- ["Convert this page to doc [markdown] precisely.", "images/1.png"],
167
- ["Convert chart to OTSL.", "images/2.png"]
168
  ]
169
 
170
  video_examples = [
@@ -172,13 +219,17 @@ video_examples = [
172
  ["Explain the ad in detail.", "videos/1.mp4"]
173
  ]
174
 
 
 
 
 
 
175
  css = """
176
  .submit-btn { background-color: #2980b9 !important; color: white !important; }
177
  .submit-btn:hover { background-color: #3498db !important; }
178
  .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
179
  """
180
 
181
- # Create the Gradio Interface
182
  with gr.Blocks(css=css) as demo:
183
  gr.Markdown("# **Qwen3-VL-Processor**")
184
  with gr.Row():
@@ -189,12 +240,19 @@ with gr.Blocks(css=css) as demo:
189
  image_upload = gr.Image(type="pil", label="Image", height=290)
190
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
191
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
 
192
  with gr.TabItem("Video Inference"):
193
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
194
  video_upload = gr.Video(label="Video", height=290)
195
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
196
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
197
 
 
 
 
 
 
 
198
  with gr.Accordion("Advanced options", open=False):
199
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
200
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -205,9 +263,11 @@ with gr.Blocks(css=css) as demo:
205
  with gr.Column():
206
  with gr.Column(elem_classes="canvas-output"):
207
  gr.Markdown("## Output")
208
- output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=9, show_copy_button=True)
209
  with gr.Accordion("(Result.md)", open=False):
210
  markdown_output = gr.Markdown(label="(Result.Md)")
 
 
211
  image_submit.click(
212
  fn=generate_image,
213
  inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -218,6 +278,11 @@ with gr.Blocks(css=css) as demo:
218
  inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
219
  outputs=[output, markdown_output]
220
  )
 
 
 
 
 
221
 
222
  if __name__ == "__main__":
223
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
 
5
  import time
6
  import asyncio
7
  from threading import Thread
8
+ from pathlib import Path
9
+ from io import BytesIO
10
 
11
  import gradio as gr
12
  import spaces
 
15
  from PIL import Image
16
  import cv2
17
  import requests
18
+ import fitz # PyMuPDF
19
 
20
  from transformers import (
21
  Qwen3VLMoeForConditionalGeneration,
 
29
  DEFAULT_MAX_NEW_TOKENS = 2048
30
 
31
  # Let the environment (e.g., Hugging Face Spaces) determine the device.
 
32
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
33
 
34
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
 
43
  print("Using device:", device)
44
  # --- Model Loading ---
45
 
 
 
 
46
  # Load Qwen3VL
47
  MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
48
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
 
56
  def downsample_video(video_path):
57
  """
58
  Downsamples the video to evenly spaced frames.
 
59
  """
60
  vidcap = cv2.VideoCapture(video_path)
61
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
62
  fps = vidcap.get(cv2.CAP_PROP_FPS)
63
  frames = []
 
64
  frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
65
  for i in frame_indices:
66
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
 
68
  if success:
69
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
70
  pil_image = Image.fromarray(image)
71
+ frames.append(pil_image)
 
72
  vidcap.release()
73
  return frames
74
 
75
def convert_pdf_to_images(file_path: str, dpi: int = 200):
    """
    Converts a PDF file into a list of PIL Images, one per page.

    Args:
        file_path: Path to the PDF file. A falsy value (``None`` or ``""``)
            returns an empty list without opening anything.
        dpi: Target rendering resolution. Each page is scaled by
            ``dpi / 72`` in both directions before rasterizing.

    Returns:
        A list of PIL images in page order (empty for a zero-page document).

    Raises:
        Any exception raised by ``fitz.open`` or by page rendering (for
        example on a missing or corrupt file). The document handle is closed
        before the exception propagates.
    """
    if not file_path:
        return []

    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)

    images = []
    pdf_document = fitz.open(file_path)
    try:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(matrix=mat)
            # Round-trip through an in-memory PNG to obtain a PIL image.
            img_data = pix.tobytes("png")
            images.append(Image.open(BytesIO(img_data)))
    finally:
        # Always release the document, even if a page fails to render.
        pdf_document.close()
    return images
93
+
94
  @spaces.GPU
95
  def generate_image(text: str, image: Image.Image,
96
  max_new_tokens: int = 1024,
 
99
  top_k: int = 50,
100
  repetition_penalty: float = 1.2):
101
  """
102
+ Generates responses for a single image input.
103
  """
104
  if image is None:
105
  yield "Please upload an image.", "Please upload an image."
 
108
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
109
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
110
 
111
+ inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
 
 
 
112
 
113
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
114
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
 
128
  top_k: int = 50,
129
  repetition_penalty: float = 1.2):
130
  """
131
+ Generates responses for a video input by processing downsampled frames.
132
  """
133
  if video_path is None:
134
  yield "Please upload a video.", "Please upload a video."
135
  return
136
 
137
+ frames = downsample_video(video_path)
138
+ if not frames:
139
  yield "Could not process video.", "Could not process video."
140
  return
141
 
142
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
143
+ for frame in frames:
144
+ messages[0]["content"].insert(0, {"type": "image"})
 
 
 
145
 
146
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
147
+ inputs = processor_q3vl(text=[prompt_full], images=frames, return_tensors="pt", padding=True).to(device)
 
 
 
 
148
 
149
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
150
  generation_kwargs = {
 
161
  time.sleep(0.01)
162
  yield buffer, buffer
163
 
164
@spaces.GPU
def generate_pdf(text: str, pdf_path: str,
                 max_new_tokens: int = 2048,
                 temperature: float = 0.6,
                 top_p: float = 0.9,
                 top_k: int = 50,
                 repetition_penalty: float = 1.2):
    """
    Processes a PDF file page by page and streams a combined textual output.

    Each page is rendered to an image, sent to the model together with
    ``text`` as the prompt, and the generated tokens are streamed back. Every
    yielded value is the full text accumulated so far (all finished pages plus
    the page in progress), so the UI can simply replace its content.

    Args:
        text: The user query applied to every page.
        pdf_path: Path to the uploaded PDF; a falsy value yields a prompt to
            upload a file.
        max_new_tokens: Generation budget per page.
        temperature, top_p, top_k, repetition_penalty: Accepted for signature
            parity with the other handlers.
            NOTE(review): these are not forwarded to ``generate`` here, so the
            sliders have no effect on PDF inference -- confirm whether that is
            intended.

    Yields:
        ``(raw_text, markdown_text)`` tuples; both elements are the same
        string.
    """
    if not pdf_path:
        yield "Please upload a PDF file.", "Please upload a PDF file."
        return

    # Keep the try body to the one call that can fail on a bad file.
    try:
        page_images = convert_pdf_to_images(pdf_path)
    except Exception as e:
        yield f"Error processing PDF: {e}", f"Error processing PDF: {e}"
        return

    if not page_images:
        yield "Could not extract pages from the PDF.", "Could not extract pages from the PDF."
        return

    # The prompt is identical for every page, so build it once.
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    total_pages = len(page_images)
    full_response = ""
    for i, image in enumerate(page_images):
        page_header = f"--- Page {i+1}/{total_pages} ---\n"
        # Include everything produced so far; yielding the bare header would
        # momentarily wipe earlier pages from the output widgets.
        page_prefix = full_response + page_header
        yield page_prefix, page_prefix

        inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)

        streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}

        # generate() blocks, so run it in a worker thread and consume the
        # streamer from this generator.
        thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
        thread.start()

        page_buffer = ""
        for new_text in streamer:
            page_buffer += new_text
            current = page_prefix + page_buffer
            yield current, current
            time.sleep(0.01)

        # The stream is exhausted; wait for the worker to finish before
        # starting the next page so generations never overlap.
        thread.join()

        full_response = page_prefix + page_buffer + "\n"
209
+
210
+ # --- Gradio Interface ---
211
  image_examples = [
212
  ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
213
  ["Convert this page to doc [markdown] precisely.", "images/3.png"],
 
214
  ["Explain the creativity in the image.", "images/6.jpg"],
 
 
215
  ]
216
 
217
  video_examples = [
 
219
  ["Explain the ad in detail.", "videos/1.mp4"]
220
  ]
221
 
222
+ #pdf_examples = [
223
+ # ["Summarize the key findings from this document.", "examples/sample-doc.pdf"],
224
+ # ["Extract the main points from each section.", "examples/research-paper.pdf"],
225
+ #]
226
+
227
  css = """
228
  .submit-btn { background-color: #2980b9 !important; color: white !important; }
229
  .submit-btn:hover { background-color: #3498db !important; }
230
  .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
231
  """
232
 
 
233
  with gr.Blocks(css=css) as demo:
234
  gr.Markdown("# **Qwen3-VL-Processor**")
235
  with gr.Row():
 
240
  image_upload = gr.Image(type="pil", label="Image", height=290)
241
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
242
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
243
+
244
  with gr.TabItem("Video Inference"):
245
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
246
  video_upload = gr.Video(label="Video", height=290)
247
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
248
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
249
 
250
+ with gr.TabItem("PDF Inference"):
251
+ pdf_query = gr.Textbox(label="Query Input", placeholder="e.g., 'Summarize this document'")
252
+ pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
253
+ pdf_submit = gr.Button("Submit", elem_classes="submit-btn")
254
+ #gr.Examples(examples=pdf_examples, inputs=[pdf_query, pdf_upload])
255
+
256
  with gr.Accordion("Advanced options", open=False):
257
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
258
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
 
263
  with gr.Column():
264
  with gr.Column(elem_classes="canvas-output"):
265
  gr.Markdown("## Output")
266
+ output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=10, show_copy_button=True)
267
  with gr.Accordion("(Result.md)", open=False):
268
  markdown_output = gr.Markdown(label="(Result.Md)")
269
+
270
+ # Event handlers
271
  image_submit.click(
272
  fn=generate_image,
273
  inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
 
278
  inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
279
  outputs=[output, markdown_output]
280
  )
281
+ pdf_submit.click(
282
+ fn=generate_pdf,
283
+ inputs=[pdf_query, pdf_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
284
+ outputs=[output, markdown_output]
285
+ )
286
 
287
  if __name__ == "__main__":
288
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)