prithivMLmods committed on
Commit
24aaf5e
·
verified ·
1 Parent(s): 4efd691

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -135
app.py CHANGED
@@ -32,17 +32,17 @@ from gradio.themes.utils import colors, fonts, sizes
32
  # Define a new "Thistle" color palette
33
  colors.thistle = colors.Color(
34
  name="thistle",
35
- c50="#F9F5F9",
36
- c100="#F3ECF4",
37
- c200="#E8D9EA",
38
- c300="#DCC5E0",
39
- c400="#D0B2D6",
40
- c500="#D8BFD8", # Thistle
41
- c600="#B8A2B9",
42
- c700="#98869A",
43
- c800="#796A7C",
44
- c900="#5C505D",
45
- c950="#423A44",
46
  )
47
 
48
  colors.red_gray = colors.Color(
@@ -78,12 +78,12 @@ class ThistleTheme(Soft):
78
  super().set(
79
  background_fill_primary="*primary_50",
80
  background_fill_primary_dark="*primary_900",
81
- body_background_fill="linear-gradient(135deg, *secondary_200, *secondary_100)",
82
  body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
83
- button_primary_text_color="*neutral_900",
84
- button_primary_text_color_hover="white",
85
- button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_400)",
86
- button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_600)",
87
  button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
88
  button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
89
  button_secondary_text_color="black",
@@ -97,7 +97,10 @@ class ThistleTheme(Soft):
97
  button_cancel_background_fill_hover=f"linear-gradient(90deg, {colors.red_gray.c500}, {colors.red_gray.c600})",
98
  button_cancel_background_fill_hover_dark=f"linear-gradient(90deg, {colors.red_gray.c800}, {colors.red_gray.c900})",
99
  button_cancel_text_color="white",
100
- slider_color="*secondary_400",
 
 
 
101
  slider_color_dark="*secondary_600",
102
  block_title_text_weight="600",
103
  block_border_width="3px",
@@ -105,7 +108,7 @@ class ThistleTheme(Soft):
105
  button_primary_shadow="*shadow_drop_lg",
106
  button_large_padding="11px",
107
  color_accent_soft="*primary_100",
108
- block_label_background_fill="*secondary_200",
109
  )
110
 
111
  thistle_theme = ThistleTheme()
@@ -114,7 +117,48 @@ thistle_theme = ThistleTheme()
114
  css = """
115
  :root {
116
  --color-grey-50: #f9fafb;
 
 
 
 
 
 
 
 
 
117
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  """
119
 
120
  # --- App Constants & Setup ---
@@ -152,7 +196,8 @@ def downsample_video(video_path):
152
  success, image = vidcap.read()
153
  if success:
154
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
155
- frames.append(Image.fromarray(image))
 
156
  vidcap.release()
157
  return frames
158
 
@@ -182,58 +227,49 @@ def load_and_preview_pdf(file_path: Optional[str]) -> Tuple[Optional[Image.Image
182
  pages = convert_pdf_to_images(file_path)
183
  if not pages:
184
  return None, state, '<div style="text-align:center;">Could not load file</div>'
185
- state["pages"], state["total_pages"] = pages, len(pages)
186
- return pages[0], state, f'<div style="text-align:center;">Page 1 / {state["total_pages"]}</div>'
 
 
187
  except Exception as e:
188
  return None, state, f'<div style="text-align:center;">Failed to load preview: {e}</div>'
189
 
190
  def navigate_pdf_page(direction: str, state: Dict[str, Any]):
191
  if not state or not state["pages"]:
192
  return None, state, '<div style="text-align:center;">No file loaded</div>'
193
- idx, total = state["current_page_index"], state["total_pages"]
194
- new_idx = max(0, idx - 1) if direction == "prev" else min(total - 1, idx + 1)
195
- state["current_page_index"] = new_idx
196
- return state["pages"][new_idx], state, f'<div style="text-align:center;">Page {new_idx + 1} / {total}</div>'
 
 
 
 
 
 
 
 
197
 
198
  @spaces.GPU
199
- def model_stream_response(prompt_text: str, images: list, max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
200
- messages = [{"role": "user", "content": []}]
201
- for img in images:
202
- messages[0]["content"].append({"type": "image"})
203
- messages[0]["content"].append({"type": "text", "text": prompt_text})
204
-
205
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
206
- inputs = processor_q3vl(text=[prompt_full], images=images, return_tensors="pt", padding=True).to(device)
207
-
208
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
209
-
210
- generation_kwargs = {
211
- **inputs,
212
- "streamer": streamer,
213
- "max_new_tokens": max_new_tokens,
214
- "do_sample": True,
215
- "temperature": temperature,
216
- "top_p": top_p,
217
- "top_k": top_k,
218
- "repetition_penalty": repetition_penalty,
219
- }
220
-
221
  thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
222
  thread.start()
223
-
224
  buffer = ""
225
  for new_text in streamer:
226
  buffer += new_text
227
- yield buffer.replace("<|im_end|>", ""), buffer.replace("<|im_end|>", "")
228
  time.sleep(0.01)
 
229
 
230
- def generate_image(text: str, image: Image.Image, *args):
231
- if image is None:
232
- yield "Please upload an image.", "Please upload an image."
233
- return
234
- yield from model_stream_response(text, [image], *args)
235
-
236
- def generate_video(text: str, video_path: str, *args):
237
  if video_path is None:
238
  yield "Please upload a video.", "Please upload a video."
239
  return
@@ -241,70 +277,96 @@ def generate_video(text: str, video_path: str, *args):
241
  if not frames:
242
  yield "Could not process video.", "Could not process video."
243
  return
244
- yield from model_stream_response(text, frames, *args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
- def generate_pdf(text: str, state: Dict[str, Any], *args):
 
247
  if not state or not state["pages"]:
248
  yield "Please upload a PDF file first.", "Please upload a PDF file first."
249
  return
250
-
251
  full_response = ""
252
- for i, image in enumerate(state["pages"]):
253
- page_header = f"--- Page {i+1}/{len(state['pages'])} ---\n"
254
  yield full_response + page_header, full_response + page_header
255
-
256
- # This is a simplified approach. For true streaming of the whole PDF, a more complex logic would be needed.
257
- # Here we just get the full response for the page and then append it.
258
- final_page_text = ""
259
- for page_text, _ in model_stream_response(text, [image], *args):
260
- yield full_response + page_header + page_text, full_response + page_header + page_text
261
- final_page_text = page_text
262
-
263
- full_response += page_header + final_page_text + "\n\n"
 
 
 
 
264
 
265
- def generate_caption(image: Image.Image, *args):
 
 
 
 
266
  if image is None:
267
- yield "Please upload an image for captioning.", "Please upload an image for captioning."
268
  return
 
 
269
 
270
- system_prompt = (
271
- "You are an AI assistant that rigorously follows this response protocol: For every input image, "
272
- "your primary task is to write a precise caption that captures the essence of the image in clear, "
273
- "concise, and contextually accurate language. Along with the caption, provide a structured set of "
274
- "attributes describing the visual elements, including details such as objects, people, actions, "
275
- "colors, environment, mood, and other notable characteristics. Ensure captions are precise, neutral, "
276
- "and descriptive, avoiding unnecessary elaboration or subjective interpretation unless explicitly required. "
277
- "Do not reference the rules or instructions in the output; only return the formatted caption, attributes, and class_name."
278
- )
279
- yield from model_stream_response(system_prompt, [image], *args)
 
 
 
 
 
280
 
281
  # --- Gradio Interface ---
282
  image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
283
  video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
284
- caption_examples = [["images/3.png"], ["images/5.jpg"]]
285
 
286
  with gr.Blocks(theme=thistle_theme, css=css) as demo:
287
  pdf_state = gr.State(value=get_initial_pdf_state())
288
- gr.Markdown("# **Qwen3-VL-Demo**")
289
-
290
  with gr.Row():
291
  with gr.Column(scale=2):
292
  with gr.Tabs():
293
- # Image Tab
294
  with gr.TabItem("Image Inference"):
295
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
296
  image_upload = gr.Image(type="pil", label="Image", height=290)
297
  image_submit = gr.Button("Submit", variant="primary")
298
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
299
 
300
- # Video Tab
301
  with gr.TabItem("Video Inference"):
302
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
303
  video_upload = gr.Video(label="Video", height=290)
304
  video_submit = gr.Button("Submit", variant="primary")
305
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
306
 
307
- # PDF Tab
308
  with gr.TabItem("PDF Inference"):
309
  with gr.Row():
310
  with gr.Column(scale=1):
@@ -312,29 +374,24 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
312
  pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
313
  pdf_submit = gr.Button("Submit", variant="primary")
314
  with gr.Column(scale=1):
315
- pdf_preview_img = gr.Image(label="PDF Preview", height=290, interactive=False)
316
  with gr.Row():
317
  prev_page_btn = gr.Button("◀ Previous")
318
  page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
319
  next_page_btn = gr.Button("Next ▶")
320
-
321
- # Caption Tab
322
  with gr.TabItem("Caption"):
323
- caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
324
  caption_submit = gr.Button("Generate Caption", variant="primary")
325
- gr.Examples(examples=caption_examples, inputs=[caption_image_upload])
326
 
327
- # Advanced Options
328
  with gr.Accordion("Advanced options", open=False):
329
- adv_opts = [
330
- gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
331
- gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
332
- gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
333
- gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
334
- gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
335
- ]
336
 
337
- # Output Column
338
  with gr.Column(scale=3):
339
  gr.Markdown("## Output")
340
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
@@ -342,44 +399,13 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
342
  markdown_output = gr.Markdown(label="(Result.Md)")
343
 
344
  # Event handlers
345
- image_submit.click(
346
- fn=generate_image,
347
- inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
348
- outputs=[output, markdown_output]
349
- )
350
- video_submit.click(
351
- fn=generate_video,
352
- inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
353
- outputs=[output, markdown_output]
354
- )
355
- pdf_submit.click(
356
- fn=generate_pdf,
357
- inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
358
- outputs=[output, markdown_output]
359
- )
360
- pdf_upload.change(
361
- fn=load_and_preview_pdf,
362
- inputs=[pdf_upload],
363
- outputs=[pdf_preview_img, pdf_state, page_info]
364
- )
365
- prev_page_btn.click(
366
- fn=lambda s: navigate_pdf_page("prev", s),
367
- inputs=[pdf_state],
368
- outputs=[pdf_preview_img, pdf_state, page_info]
369
- )
370
- next_page_btn.click(
371
- fn=lambda s: navigate_pdf_page("next", s),
372
- inputs=[pdf_state],
373
- outputs=[pdf_preview_img, pdf_state, page_info]
374
- )
375
- caption_submit.click(
376
- fn=generate_caption,
377
- inputs=[caption_image_upload] + adv_opts,
378
- outputs=[output, markdown_output])
379
-
380
  pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
381
  prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
382
  next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
 
383
 
384
  if __name__ == "__main__":
385
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
 
32
  # Define a new "Thistle" color palette
33
  colors.thistle = colors.Color(
34
  name="thistle",
35
+ c50="#FCF9FD",
36
+ c100="#F5F0F8",
37
+ c200="#EBE1F1",
38
+ c300="#E1D1E9",
39
+ c400="#D8BFD8", # Thistle Base
40
+ c500="#C5A9C2",
41
+ c600="#B194AC",
42
+ c700="#9C7F96",
43
+ c800="#876A80",
44
+ c900="#72556A",
45
+ c950="#5D4054",
46
  )
47
 
48
  colors.red_gray = colors.Color(
 
78
  super().set(
79
  background_fill_primary="*primary_50",
80
  background_fill_primary_dark="*primary_900",
81
+ body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
82
  body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
83
+ button_primary_text_color="white",
84
+ button_primary_text_color_hover="black",
85
+ button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_500)",
86
+ button_primary_background_fill_hover="linear-gradient(90deg, *secondary_300, *secondary_400)",
87
  button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
88
  button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
89
  button_secondary_text_color="black",
 
97
  button_cancel_background_fill_hover=f"linear-gradient(90deg, {colors.red_gray.c500}, {colors.red_gray.c600})",
98
  button_cancel_background_fill_hover_dark=f"linear-gradient(90deg, {colors.red_gray.c800}, {colors.red_gray.c900})",
99
  button_cancel_text_color="white",
100
+ button_cancel_text_color_dark="white",
101
+ button_cancel_text_color_hover="white",
102
+ button_cancel_text_color_hover_dark="white",
103
+ slider_color="*secondary_300",
104
  slider_color_dark="*secondary_600",
105
  block_title_text_weight="600",
106
  block_border_width="3px",
 
108
  button_primary_shadow="*shadow_drop_lg",
109
  button_large_padding="11px",
110
  color_accent_soft="*primary_100",
111
+ block_label_background_fill="*primary_200",
112
  )
113
 
114
  thistle_theme = ThistleTheme()
 
117
  css = """
118
  :root {
119
  --color-grey-50: #f9fafb;
120
+ --banner-background: var(--secondary-400);
121
+ --banner-text-color: var(--primary-100);
122
+ --banner-background-dark: var(--secondary-800);
123
+ --banner-text-color-dark: var(--primary-100);
124
+ --banner-chrome-height: calc(16px + 43px);
125
+ --chat-chrome-height-wide-no-banner: 320px;
126
+ --chat-chrome-height-narrow-no-banner: 450px;
127
+ --chat-chrome-height-wide: calc(var(--chat-chrome-height-wide-no-banner) + var(--banner-chrome-height));
128
+ --chat-chrome-height-narrow: calc(var(--chat-chrome-height-narrow-no-banner) + var(--banner-chrome-height));
129
  }
130
+ .banner-message { background-color: var(--banner-background); padding: 5px; margin: 0; border-radius: 5px; border: none; }
131
+ .banner-message-text { font-size: 13px; font-weight: bolder; color: var(--banner-text-color) !important; }
132
+ body.dark .banner-message { background-color: var(--banner-background-dark) !important; }
133
+ body.dark .gradio-container .contain .banner-message .banner-message-text { color: var(--banner-text-color-dark) !important; }
134
+ .toast-body { background-color: var(--color-grey-50); }
135
+ .html-container:has(.css-styles) { padding: 0; margin: 0; }
136
+ .css-styles { height: 0; }
137
+ .model-message { text-align: end; }
138
+ .model-dropdown-container { display: flex; align-items: center; gap: 10px; padding: 0; }
139
+ .user-input-container .multimodal-textbox{ border: none !important; }
140
+ .control-button { height: 51px; }
141
+ button.cancel { border: var(--button-border-width) solid var(--button-cancel-border-color); background: var(--button-cancel-background-fill); color: var(--button-cancel-text-color); box-shadow: var(--button-cancel-shadow); }
142
+ button.cancel:hover, .cancel[disabled] { background: var(--button-cancel-background-fill-hover); color: var(--button-cancel-text-color-hover); }
143
+ .opt-out-message { top: 8px; }
144
+ .opt-out-message .html-container, .opt-out-checkbox label { font-size: 14px !important; padding: 0 !important; margin: 0 !important; color: var(--neutral-400) !important; }
145
+ div.block.chatbot { height: calc(100svh - var(--chat-chrome-height-wide)) !important; max-height: 900px !important; }
146
+ div.no-padding { padding: 0 !important; }
147
+ @media (max-width: 1280px) { div.block.chatbot { height: calc(100svh - var(--chat-chrome-height-wide)) !important; } }
148
+ @media (max-width: 1024px) {
149
+ .responsive-row { flex-direction: column; }
150
+ .model-message { text-align: start; font-size: 10px !important; }
151
+ .model-dropdown-container { flex-direction: column; align-items: flex-start; }
152
+ div.block.chatbot { height: calc(100svh - var(--chat-chrome-height-narrow)) !important; }
153
+ }
154
+ @media (max-width: 400px) {
155
+ .responsive-row { flex-direction: column; }
156
+ .model-message { text-align: start; font-size: 10px !important; }
157
+ .model-dropdown-container { flex-direction: column; align-items: flex-start; }
158
+ div.block.chatbot { max-height: 360px !important; }
159
+ }
160
+ @media (max-height: 932px) { .chatbot { max-height: 500px !important; } }
161
+ @media (max-height: 1280px) { div.block.chatbot { max-height: 800px !important; } }
162
  """
163
 
164
  # --- App Constants & Setup ---
 
196
  success, image = vidcap.read()
197
  if success:
198
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
199
+ pil_image = Image.fromarray(image)
200
+ frames.append(pil_image)
201
  vidcap.release()
202
  return frames
203
 
 
227
  pages = convert_pdf_to_images(file_path)
228
  if not pages:
229
  return None, state, '<div style="text-align:center;">Could not load file</div>'
230
+ state["pages"] = pages
231
+ state["total_pages"] = len(pages)
232
+ page_info_html = f'<div style="text-align:center;">Page 1 / {state["total_pages"]}</div>'
233
+ return pages[0], state, page_info_html
234
  except Exception as e:
235
  return None, state, f'<div style="text-align:center;">Failed to load preview: {e}</div>'
236
 
237
  def navigate_pdf_page(direction: str, state: Dict[str, Any]):
238
  if not state or not state["pages"]:
239
  return None, state, '<div style="text-align:center;">No file loaded</div>'
240
+ current_index = state["current_page_index"]
241
+ total_pages = state["total_pages"]
242
+ if direction == "prev":
243
+ new_index = max(0, current_index - 1)
244
+ elif direction == "next":
245
+ new_index = min(total_pages - 1, current_index + 1)
246
+ else:
247
+ new_index = current_index
248
+ state["current_page_index"] = new_index
249
+ image_preview = state["pages"][new_index]
250
+ page_info_html = f'<div style="text-align:center;">Page {new_index + 1} / {total_pages}</div>'
251
+ return image_preview, state, page_info_html
252
 
253
  @spaces.GPU
254
+ def generate_image(text: str, image: Image.Image, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
255
+ if image is None:
256
+ yield "Please upload an image.", "Please upload an image."
257
+ return
258
+ messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
 
259
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
260
+ inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
 
261
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
262
+ generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
 
 
 
 
 
 
 
 
 
 
 
263
  thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
264
  thread.start()
 
265
  buffer = ""
266
  for new_text in streamer:
267
  buffer += new_text
 
268
  time.sleep(0.01)
269
+ yield buffer, buffer
270
 
271
+ @spaces.GPU
272
+ def generate_video(text: str, video_path: str, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
 
 
 
 
 
273
  if video_path is None:
274
  yield "Please upload a video.", "Please upload a video."
275
  return
 
277
  if not frames:
278
  yield "Could not process video.", "Could not process video."
279
  return
280
+ messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
281
+ for frame in frames:
282
+ messages[0]["content"].insert(0, {"type": "image"})
283
+ prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
284
+ inputs = processor_q3vl(text=[prompt_full], images=frames, return_tensors="pt", padding=True).to(device)
285
+ streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
286
+ generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": True, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty}
287
+ thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
288
+ thread.start()
289
+ buffer = ""
290
+ for new_text in streamer:
291
+ buffer += new_text
292
+ buffer = buffer.replace("<|im_end|>", "")
293
+ time.sleep(0.01)
294
+ yield buffer, buffer
295
 
296
+ @spaces.GPU
297
+ def generate_pdf(text: str, state: Dict[str, Any], max_new_tokens: int = 2048, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
298
  if not state or not state["pages"]:
299
  yield "Please upload a PDF file first.", "Please upload a PDF file first."
300
  return
301
+ page_images = state["pages"]
302
  full_response = ""
303
+ for i, image in enumerate(page_images):
304
+ page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
305
  yield full_response + page_header, full_response + page_header
306
+ messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
307
+ prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
308
+ inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
309
+ streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
310
+ generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
311
+ thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
312
+ thread.start()
313
+ page_buffer = ""
314
+ for new_text in streamer:
315
+ page_buffer += new_text
316
+ yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
317
+ time.sleep(0.01)
318
+ full_response += page_header + page_buffer + "\n\n"
319
 
320
+ @spaces.GPU
321
+ def generate_caption(image: Image.Image):
322
+ """
323
+ Generates a caption and attributes for a single image based on a standard system prompt.
324
+ """
325
  if image is None:
326
+ yield "Please upload an image to generate a caption."
327
  return
328
+
329
+ system_prompt = "You are an AI assistant that rigorously follows this response protocol: For every input image, your primary task is to write a precise caption that captures the essence of the image in clear, concise, and contextually accurate language. Along with the caption, provide a structured set of attributes describing the visual elements, including details such as objects, people, actions, colors, environment, mood, and other notable characteristics. Ensure captions are precise, neutral, and descriptive, avoiding unnecessary elaboration or subjective interpretation unless explicitly required. Do not reference the rules or instructions in the output; only return the formatted caption, attributes, and class_name."
330
 
331
+ messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": system_prompt}]}]
332
+ prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
333
+
334
+ inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
335
+
336
+ streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
337
+ generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": DEFAULT_MAX_NEW_TOKENS}
338
+ thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
339
+ thread.start()
340
+
341
+ buffer = ""
342
+ for new_text in streamer:
343
+ buffer += new_text
344
+ time.sleep(0.01)
345
+ yield buffer
346
 
347
  # --- Gradio Interface ---
348
  image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
349
  video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
350
+ pdf_examples = [["examples/sample-doc.pdf"]]
351
 
352
  with gr.Blocks(theme=thistle_theme, css=css) as demo:
353
  pdf_state = gr.State(value=get_initial_pdf_state())
354
+ gr.Markdown("# **Qwen3-VL-Processor**")
 
355
  with gr.Row():
356
  with gr.Column(scale=2):
357
  with gr.Tabs():
 
358
  with gr.TabItem("Image Inference"):
359
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
360
  image_upload = gr.Image(type="pil", label="Image", height=290)
361
  image_submit = gr.Button("Submit", variant="primary")
362
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
363
 
 
364
  with gr.TabItem("Video Inference"):
365
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
366
  video_upload = gr.Video(label="Video", height=290)
367
  video_submit = gr.Button("Submit", variant="primary")
368
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
369
 
 
370
  with gr.TabItem("PDF Inference"):
371
  with gr.Row():
372
  with gr.Column(scale=1):
 
374
  pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
375
  pdf_submit = gr.Button("Submit", variant="primary")
376
  with gr.Column(scale=1):
377
+ pdf_preview_img = gr.Image(label="PDF Preview", height=290)
378
  with gr.Row():
379
  prev_page_btn = gr.Button("◀ Previous")
380
  page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
381
  next_page_btn = gr.Button("Next ▶")
382
+
 
383
  with gr.TabItem("Caption"):
384
+ caption_image_upload = gr.Image(type="pil", label="Upload Image for Captioning", height=350)
385
  caption_submit = gr.Button("Generate Caption", variant="primary")
386
+ caption_output = gr.Markdown(label="Generated Caption and Attributes")
387
 
 
388
  with gr.Accordion("Advanced options", open=False):
389
+ max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
390
+ temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
391
+ top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
392
+ top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
393
+ repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
 
394
 
 
395
  with gr.Column(scale=3):
396
  gr.Markdown("## Output")
397
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
 
399
  markdown_output = gr.Markdown(label="(Result.Md)")
400
 
401
  # Event handlers
402
+ image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
403
+ video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
404
+ pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
406
  prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
407
  next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
408
+ caption_submit.click(fn=generate_caption, inputs=[caption_image_upload], outputs=[caption_output])
409
 
410
  if __name__ == "__main__":
411
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)