prithivMLmods committed on
Commit
4efd691
·
verified ·
1 Parent(s): 3784a2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -127
app.py CHANGED
@@ -29,11 +29,20 @@ from transformers.image_utils import load_image
29
  from gradio.themes import Soft
30
  from gradio.themes.utils import colors, fonts, sizes
31
 
32
- colors.teal_gray = colors.Color(
33
- name="teal_gray",
34
- c50="#e8f1f4", c100="#cddde3", c200="#a8c3cf", c300="#7da6b8",
35
- c400="#588aa2", c500="#3d6e87", c600="#335b70", c700="#2b495a",
36
- c800="#2c5364", c900="#233f4b", c950="#1b323c",
 
 
 
 
 
 
 
 
 
37
  )
38
 
39
  colors.red_gray = colors.Color(
@@ -43,12 +52,12 @@ colors.red_gray = colors.Color(
43
  c800="#732d2d", c900="#5f2626", c950="#4d2020",
44
  )
45
 
46
- class Teals(Soft):
47
  def __init__(
48
  self,
49
  *,
50
  primary_hue: colors.Color | str = colors.gray,
51
- secondary_hue: colors.Color | str = colors.teal_gray,
52
  neutral_hue: colors.Color | str = colors.slate,
53
  text_size: sizes.Size | str = sizes.text_md,
54
  font: fonts.Font | str | Iterable[fonts.Font | str] = (
@@ -69,12 +78,12 @@ class Teals(Soft):
69
  super().set(
70
  background_fill_primary="*primary_50",
71
  background_fill_primary_dark="*primary_900",
72
- body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
73
  body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
74
- button_primary_text_color="white",
75
- button_primary_text_color_hover="black",
76
  button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_400)",
77
- button_primary_background_fill_hover="linear-gradient(90deg, *secondary_300, *secondary_300)",
78
  button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
79
  button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
80
  button_secondary_text_color="black",
@@ -88,10 +97,7 @@ class Teals(Soft):
88
  button_cancel_background_fill_hover=f"linear-gradient(90deg, {colors.red_gray.c500}, {colors.red_gray.c600})",
89
  button_cancel_background_fill_hover_dark=f"linear-gradient(90deg, {colors.red_gray.c800}, {colors.red_gray.c900})",
90
  button_cancel_text_color="white",
91
- button_cancel_text_color_dark="white",
92
- button_cancel_text_color_hover="white",
93
- button_cancel_text_color_hover_dark="white",
94
- slider_color="*secondary_300",
95
  slider_color_dark="*secondary_600",
96
  block_title_text_weight="600",
97
  block_border_width="3px",
@@ -99,57 +105,16 @@ class Teals(Soft):
99
  button_primary_shadow="*shadow_drop_lg",
100
  button_large_padding="11px",
101
  color_accent_soft="*primary_100",
102
- block_label_background_fill="*primary_200",
103
  )
104
 
105
- teals = Teals()
106
 
107
  # --- Custom CSS ---
108
  css = """
109
  :root {
110
  --color-grey-50: #f9fafb;
111
- --banner-background: var(--secondary-400);
112
- --banner-text-color: var(--primary-100);
113
- --banner-background-dark: var(--secondary-800);
114
- --banner-text-color-dark: var(--primary-100);
115
- --banner-chrome-height: calc(16px + 43px);
116
- --chat-chrome-height-wide-no-banner: 320px;
117
- --chat-chrome-height-narrow-no-banner: 450px;
118
- --chat-chrome-height-wide: calc(var(--chat-chrome-height-wide-no-banner) + var(--banner-chrome-height));
119
- --chat-chrome-height-narrow: calc(var(--chat-chrome-height-narrow-no-banner) + var(--banner-chrome-height));
120
  }
121
- .banner-message { background-color: var(--banner-background); padding: 5px; margin: 0; border-radius: 5px; border: none; }
122
- .banner-message-text { font-size: 13px; font-weight: bolder; color: var(--banner-text-color) !important; }
123
- body.dark .banner-message { background-color: var(--banner-background-dark) !important; }
124
- body.dark .gradio-container .contain .banner-message .banner-message-text { color: var(--banner-text-color-dark) !important; }
125
- .toast-body { background-color: var(--color-grey-50); }
126
- .html-container:has(.css-styles) { padding: 0; margin: 0; }
127
- .css-styles { height: 0; }
128
- .model-message { text-align: end; }
129
- .model-dropdown-container { display: flex; align-items: center; gap: 10px; padding: 0; }
130
- .user-input-container .multimodal-textbox{ border: none !important; }
131
- .control-button { height: 51px; }
132
- button.cancel { border: var(--button-border-width) solid var(--button-cancel-border-color); background: var(--button-cancel-background-fill); color: var(--button-cancel-text-color); box-shadow: var(--button-cancel-shadow); }
133
- button.cancel:hover, .cancel[disabled] { background: var(--button-cancel-background-fill-hover); color: var(--button-cancel-text-color-hover); }
134
- .opt-out-message { top: 8px; }
135
- .opt-out-message .html-container, .opt-out-checkbox label { font-size: 14px !important; padding: 0 !important; margin: 0 !important; color: var(--neutral-400) !important; }
136
- div.block.chatbot { height: calc(100svh - var(--chat-chrome-height-wide)) !important; max-height: 900px !important; }
137
- div.no-padding { padding: 0 !important; }
138
- @media (max-width: 1280px) { div.block.chatbot { height: calc(100svh - var(--chat-chrome-height-wide)) !important; } }
139
- @media (max-width: 1024px) {
140
- .responsive-row { flex-direction: column; }
141
- .model-message { text-align: start; font-size: 10px !important; }
142
- .model-dropdown-container { flex-direction: column; align-items: flex-start; }
143
- div.block.chatbot { height: calc(100svh - var(--chat-chrome-height-narrow)) !important; }
144
- }
145
- @media (max-width: 400px) {
146
- .responsive-row { flex-direction: column; }
147
- .model-message { text-align: start; font-size: 10px !important; }
148
- .model-dropdown-container { flex-direction: column; align-items: flex-start; }
149
- div.block.chatbot { max-height: 360px !important; }
150
- }
151
- @media (max-height: 932px) { .chatbot { max-height: 500px !important; } }
152
- @media (max-height: 1280px) { div.block.chatbot { max-height: 800px !important; } }
153
  """
154
 
155
  # --- App Constants & Setup ---
@@ -187,8 +152,7 @@ def downsample_video(video_path):
187
  success, image = vidcap.read()
188
  if success:
189
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
190
- pil_image = Image.fromarray(image)
191
- frames.append(pil_image)
192
  vidcap.release()
193
  return frames
194
 
@@ -218,49 +182,58 @@ def load_and_preview_pdf(file_path: Optional[str]) -> Tuple[Optional[Image.Image
218
  pages = convert_pdf_to_images(file_path)
219
  if not pages:
220
  return None, state, '<div style="text-align:center;">Could not load file</div>'
221
- state["pages"] = pages
222
- state["total_pages"] = len(pages)
223
- page_info_html = f'<div style="text-align:center;">Page 1 / {state["total_pages"]}</div>'
224
- return pages[0], state, page_info_html
225
  except Exception as e:
226
  return None, state, f'<div style="text-align:center;">Failed to load preview: {e}</div>'
227
 
228
  def navigate_pdf_page(direction: str, state: Dict[str, Any]):
229
  if not state or not state["pages"]:
230
  return None, state, '<div style="text-align:center;">No file loaded</div>'
231
- current_index = state["current_page_index"]
232
- total_pages = state["total_pages"]
233
- if direction == "prev":
234
- new_index = max(0, current_index - 1)
235
- elif direction == "next":
236
- new_index = min(total_pages - 1, current_index + 1)
237
- else:
238
- new_index = current_index
239
- state["current_page_index"] = new_index
240
- image_preview = state["pages"][new_index]
241
- page_info_html = f'<div style="text-align:center;">Page {new_index + 1} / {total_pages}</div>'
242
- return image_preview, state, page_info_html
243
 
244
  @spaces.GPU
245
- def generate_image(text: str, image: Image.Image, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
246
- if image is None:
247
- yield "Please upload an image.", "Please upload an image."
248
- return
249
- messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
 
250
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
251
- inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
 
252
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
253
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
 
 
 
 
 
 
 
 
 
 
 
254
  thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
255
  thread.start()
 
256
  buffer = ""
257
  for new_text in streamer:
258
  buffer += new_text
 
259
  time.sleep(0.01)
260
- yield buffer, buffer
261
 
262
- @spaces.GPU
263
- def generate_video(text: str, video_path: str, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
 
 
 
 
 
264
  if video_path is None:
265
  yield "Please upload a video.", "Please upload a video."
266
  return
@@ -268,69 +241,70 @@ def generate_video(text: str, video_path: str, max_new_tokens: int = 1024, tempe
268
  if not frames:
269
  yield "Could not process video.", "Could not process video."
270
  return
271
- messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
272
- for frame in frames:
273
- messages[0]["content"].insert(0, {"type": "image"})
274
- prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
275
- inputs = processor_q3vl(text=[prompt_full], images=frames, return_tensors="pt", padding=True).to(device)
276
- streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
277
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": True, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty}
278
- thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
279
- thread.start()
280
- buffer = ""
281
- for new_text in streamer:
282
- buffer += new_text
283
- buffer = buffer.replace("<|im_end|>", "")
284
- time.sleep(0.01)
285
- yield buffer, buffer
286
 
287
- @spaces.GPU
288
- def generate_pdf(text: str, state: Dict[str, Any], max_new_tokens: int = 2048, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
289
  if not state or not state["pages"]:
290
  yield "Please upload a PDF file first.", "Please upload a PDF file first."
291
  return
292
- page_images = state["pages"]
293
  full_response = ""
294
- for i, image in enumerate(page_images):
295
- page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
296
  yield full_response + page_header, full_response + page_header
297
- messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
298
- prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
299
- inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
300
- streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
301
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
302
- thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
303
- thread.start()
304
- page_buffer = ""
305
- for new_text in streamer:
306
- page_buffer += new_text
307
- yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
308
- time.sleep(0.01)
309
- full_response += page_header + page_buffer + "\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
  # --- Gradio Interface ---
312
  image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
313
  video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
314
- pdf_examples = [["examples/sample-doc.pdf"]]
315
 
316
- with gr.Blocks(theme=teals, css=css) as demo:
317
  pdf_state = gr.State(value=get_initial_pdf_state())
318
  gr.Markdown("# **Qwen3-VL-Demo**")
 
319
  with gr.Row():
320
  with gr.Column(scale=2):
321
  with gr.Tabs():
 
322
  with gr.TabItem("Image Inference"):
323
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
324
  image_upload = gr.Image(type="pil", label="Image", height=290)
325
  image_submit = gr.Button("Submit", variant="primary")
326
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
327
 
 
328
  with gr.TabItem("Video Inference"):
329
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
330
  video_upload = gr.Video(label="Video", height=290)
331
  video_submit = gr.Button("Submit", variant="primary")
332
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
333
 
 
334
  with gr.TabItem("PDF Inference"):
335
  with gr.Row():
336
  with gr.Column(scale=1):
@@ -338,19 +312,29 @@ with gr.Blocks(theme=teals, css=css) as demo:
338
  pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
339
  pdf_submit = gr.Button("Submit", variant="primary")
340
  with gr.Column(scale=1):
341
- pdf_preview_img = gr.Image(label="PDF Preview", height=290)
342
  with gr.Row():
343
  prev_page_btn = gr.Button("◀ Previous")
344
  page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
345
  next_page_btn = gr.Button("Next ▶")
 
 
 
 
 
 
346
 
 
347
  with gr.Accordion("Advanced options", open=False):
348
- max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
349
- temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
350
- top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
351
- top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
352
- repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
 
353
 
 
354
  with gr.Column(scale=3):
355
  gr.Markdown("## Output")
356
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
@@ -388,5 +372,14 @@ with gr.Blocks(theme=teals, css=css) as demo:
388
  inputs=[pdf_state],
389
  outputs=[pdf_preview_img, pdf_state, page_info]
390
  )
 
 
 
 
 
 
 
 
 
391
  if __name__ == "__main__":
392
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
 
29
  from gradio.themes import Soft
30
  from gradio.themes.utils import colors, fonts, sizes
31
 
32
+ # Define a new "Thistle" color palette
33
+ colors.thistle = colors.Color(
34
+ name="thistle",
35
+ c50="#F9F5F9",
36
+ c100="#F3ECF4",
37
+ c200="#E8D9EA",
38
+ c300="#DCC5E0",
39
+ c400="#D0B2D6",
40
+ c500="#D8BFD8", # Thistle
41
+ c600="#B8A2B9",
42
+ c700="#98869A",
43
+ c800="#796A7C",
44
+ c900="#5C505D",
45
+ c950="#423A44",
46
  )
47
 
48
  colors.red_gray = colors.Color(
 
52
  c800="#732d2d", c900="#5f2626", c950="#4d2020",
53
  )
54
 
55
+ class ThistleTheme(Soft):
56
  def __init__(
57
  self,
58
  *,
59
  primary_hue: colors.Color | str = colors.gray,
60
+ secondary_hue: colors.Color | str = colors.thistle,
61
  neutral_hue: colors.Color | str = colors.slate,
62
  text_size: sizes.Size | str = sizes.text_md,
63
  font: fonts.Font | str | Iterable[fonts.Font | str] = (
 
78
  super().set(
79
  background_fill_primary="*primary_50",
80
  background_fill_primary_dark="*primary_900",
81
+ body_background_fill="linear-gradient(135deg, *secondary_200, *secondary_100)",
82
  body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
83
+ button_primary_text_color="*neutral_900",
84
+ button_primary_text_color_hover="white",
85
  button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_400)",
86
+ button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_600)",
87
  button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
88
  button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
89
  button_secondary_text_color="black",
 
97
  button_cancel_background_fill_hover=f"linear-gradient(90deg, {colors.red_gray.c500}, {colors.red_gray.c600})",
98
  button_cancel_background_fill_hover_dark=f"linear-gradient(90deg, {colors.red_gray.c800}, {colors.red_gray.c900})",
99
  button_cancel_text_color="white",
100
+ slider_color="*secondary_400",
 
 
 
101
  slider_color_dark="*secondary_600",
102
  block_title_text_weight="600",
103
  block_border_width="3px",
 
105
  button_primary_shadow="*shadow_drop_lg",
106
  button_large_padding="11px",
107
  color_accent_soft="*primary_100",
108
+ block_label_background_fill="*secondary_200",
109
  )
110
 
111
+ thistle_theme = ThistleTheme()
112
 
113
  # --- Custom CSS ---
114
  css = """
115
  :root {
116
  --color-grey-50: #f9fafb;
 
 
 
 
 
 
 
 
 
117
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  """
119
 
120
  # --- App Constants & Setup ---
 
152
  success, image = vidcap.read()
153
  if success:
154
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
155
+ frames.append(Image.fromarray(image))
 
156
  vidcap.release()
157
  return frames
158
 
 
182
  pages = convert_pdf_to_images(file_path)
183
  if not pages:
184
  return None, state, '<div style="text-align:center;">Could not load file</div>'
185
+ state["pages"], state["total_pages"] = pages, len(pages)
186
+ return pages[0], state, f'<div style="text-align:center;">Page 1 / {state["total_pages"]}</div>'
 
 
187
  except Exception as e:
188
  return None, state, f'<div style="text-align:center;">Failed to load preview: {e}</div>'
189
 
190
  def navigate_pdf_page(direction: str, state: Dict[str, Any]):
191
  if not state or not state["pages"]:
192
  return None, state, '<div style="text-align:center;">No file loaded</div>'
193
+ idx, total = state["current_page_index"], state["total_pages"]
194
+ new_idx = max(0, idx - 1) if direction == "prev" else min(total - 1, idx + 1)
195
+ state["current_page_index"] = new_idx
196
+ return state["pages"][new_idx], state, f'<div style="text-align:center;">Page {new_idx + 1} / {total}</div>'
 
 
 
 
 
 
 
 
197
 
198
  @spaces.GPU
199
+ def model_stream_response(prompt_text: str, images: list, max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
200
+ messages = [{"role": "user", "content": []}]
201
+ for img in images:
202
+ messages[0]["content"].append({"type": "image"})
203
+ messages[0]["content"].append({"type": "text", "text": prompt_text})
204
+
205
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
206
+ inputs = processor_q3vl(text=[prompt_full], images=images, return_tensors="pt", padding=True).to(device)
207
+
208
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
209
+
210
+ generation_kwargs = {
211
+ **inputs,
212
+ "streamer": streamer,
213
+ "max_new_tokens": max_new_tokens,
214
+ "do_sample": True,
215
+ "temperature": temperature,
216
+ "top_p": top_p,
217
+ "top_k": top_k,
218
+ "repetition_penalty": repetition_penalty,
219
+ }
220
+
221
  thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
222
  thread.start()
223
+
224
  buffer = ""
225
  for new_text in streamer:
226
  buffer += new_text
227
+ yield buffer.replace("<|im_end|>", ""), buffer.replace("<|im_end|>", "")
228
  time.sleep(0.01)
 
229
 
230
+ def generate_image(text: str, image: Image.Image, *args):
231
+ if image is None:
232
+ yield "Please upload an image.", "Please upload an image."
233
+ return
234
+ yield from model_stream_response(text, [image], *args)
235
+
236
+ def generate_video(text: str, video_path: str, *args):
237
  if video_path is None:
238
  yield "Please upload a video.", "Please upload a video."
239
  return
 
241
  if not frames:
242
  yield "Could not process video.", "Could not process video."
243
  return
244
+ yield from model_stream_response(text, frames, *args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
+ def generate_pdf(text: str, state: Dict[str, Any], *args):
 
247
  if not state or not state["pages"]:
248
  yield "Please upload a PDF file first.", "Please upload a PDF file first."
249
  return
250
+
251
  full_response = ""
252
+ for i, image in enumerate(state["pages"]):
253
+ page_header = f"--- Page {i+1}/{len(state['pages'])} ---\n"
254
  yield full_response + page_header, full_response + page_header
255
+
256
+ # This is a simplified approach. For true streaming of the whole PDF, a more complex logic would be needed.
257
+ # Here we just get the full response for the page and then append it.
258
+ final_page_text = ""
259
+ for page_text, _ in model_stream_response(text, [image], *args):
260
+ yield full_response + page_header + page_text, full_response + page_header + page_text
261
+ final_page_text = page_text
262
+
263
+ full_response += page_header + final_page_text + "\n\n"
264
+
265
+ def generate_caption(image: Image.Image, *args):
266
+ if image is None:
267
+ yield "Please upload an image for captioning.", "Please upload an image for captioning."
268
+ return
269
+
270
+ system_prompt = (
271
+ "You are an AI assistant that rigorously follows this response protocol: For every input image, "
272
+ "your primary task is to write a precise caption that captures the essence of the image in clear, "
273
+ "concise, and contextually accurate language. Along with the caption, provide a structured set of "
274
+ "attributes describing the visual elements, including details such as objects, people, actions, "
275
+ "colors, environment, mood, and other notable characteristics. Ensure captions are precise, neutral, "
276
+ "and descriptive, avoiding unnecessary elaboration or subjective interpretation unless explicitly required. "
277
+ "Do not reference the rules or instructions in the output; only return the formatted caption, attributes, and class_name."
278
+ )
279
+ yield from model_stream_response(system_prompt, [image], *args)
280
 
281
  # --- Gradio Interface ---
282
  image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
283
  video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
284
+ caption_examples = [["images/3.png"], ["images/5.jpg"]]
285
 
286
+ with gr.Blocks(theme=thistle_theme, css=css) as demo:
287
  pdf_state = gr.State(value=get_initial_pdf_state())
288
  gr.Markdown("# **Qwen3-VL-Demo**")
289
+
290
  with gr.Row():
291
  with gr.Column(scale=2):
292
  with gr.Tabs():
293
+ # Image Tab
294
  with gr.TabItem("Image Inference"):
295
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
296
  image_upload = gr.Image(type="pil", label="Image", height=290)
297
  image_submit = gr.Button("Submit", variant="primary")
298
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
299
 
300
+ # Video Tab
301
  with gr.TabItem("Video Inference"):
302
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
303
  video_upload = gr.Video(label="Video", height=290)
304
  video_submit = gr.Button("Submit", variant="primary")
305
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
306
 
307
+ # PDF Tab
308
  with gr.TabItem("PDF Inference"):
309
  with gr.Row():
310
  with gr.Column(scale=1):
 
312
  pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
313
  pdf_submit = gr.Button("Submit", variant="primary")
314
  with gr.Column(scale=1):
315
+ pdf_preview_img = gr.Image(label="PDF Preview", height=290, interactive=False)
316
  with gr.Row():
317
  prev_page_btn = gr.Button("◀ Previous")
318
  page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
319
  next_page_btn = gr.Button("Next ▶")
320
+
321
+ # Caption Tab
322
+ with gr.TabItem("Caption"):
323
+ caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
324
+ caption_submit = gr.Button("Generate Caption", variant="primary")
325
+ gr.Examples(examples=caption_examples, inputs=[caption_image_upload])
326
 
327
+ # Advanced Options
328
  with gr.Accordion("Advanced options", open=False):
329
+ adv_opts = [
330
+ gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
331
+ gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
332
+ gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
333
+ gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
334
+ gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
335
+ ]
336
 
337
+ # Output Column
338
  with gr.Column(scale=3):
339
  gr.Markdown("## Output")
340
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
 
372
  inputs=[pdf_state],
373
  outputs=[pdf_preview_img, pdf_state, page_info]
374
  )
375
+ caption_submit.click(
376
+ fn=generate_caption,
377
+ inputs=[caption_image_upload] + adv_opts,
378
+ outputs=[output, markdown_output])
379
+
380
+ pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
381
+ prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
382
+ next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
383
+
384
  if __name__ == "__main__":
385
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)