prithivMLmods committed on
Commit
30d47a8
·
verified ·
1 Parent(s): 432c2cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -20
app.py CHANGED
@@ -16,7 +16,7 @@ import numpy as np
16
  from PIL import Image
17
  import cv2
18
  import requests
19
- import fitz # PyMuPDF
20
 
21
  from transformers import (
22
  Qwen3VLMoeForConditionalGeneration,
@@ -25,7 +25,6 @@ from transformers import (
25
  )
26
  from transformers.image_utils import load_image
27
 
28
- # --- Theme Definition ---
29
  from gradio.themes import Soft
30
  from gradio.themes.utils import colors, fonts, sizes
31
 
@@ -50,7 +49,6 @@ class ThistleTheme(Soft):
50
  primary_hue: colors.Color | str = colors.gray,
51
  secondary_hue: colors.Color | str = colors.thistle,
52
  neutral_hue: colors.Color | str = colors.slate,
53
- # Update: Increased base text size from md to lg
54
  text_size: sizes.Size | str = sizes.text_lg,
55
  font: fonts.Font | str | Iterable[fonts.Font | str] = (
56
  fonts.GoogleFont("Inconsolata"), "Arial", "sans-serif",
@@ -105,14 +103,12 @@ class ThistleTheme(Soft):
105
 
106
  thistle_theme = ThistleTheme()
107
 
108
- # --- Custom CSS ---
109
  css = """
110
- /* Update: Added styles to increase the size of the main titles */
111
  #main-title h1 {
112
- font-size: 2.8em !important;
113
  }
114
  #output-title h2 {
115
- font-size: 2.2em !important;
116
  }
117
  :root {
118
  --color-grey-50: #f9fafb;
@@ -160,7 +156,6 @@ div.no-padding { padding: 0 !important; }
160
  @media (max-height: 1280px) { div.block.chatbot { max-height: 800px !important; } }
161
  """
162
 
163
- # --- App Constants & Setup ---
164
  MAX_MAX_NEW_TOKENS = 4096
165
  DEFAULT_MAX_NEW_TOKENS = 2048
166
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -175,7 +170,6 @@ if torch.cuda.is_available():
175
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
176
  print("Using device:", device)
177
 
178
- # --- Model Loading ---
179
  MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
180
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
181
  model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
@@ -184,8 +178,6 @@ model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
184
  dtype=torch.float16
185
  ).to(device).eval()
186
 
187
- # --- Backend Functions ---
188
-
189
  def extract_gif_frames(gif_path: str):
190
  if not gif_path:
191
  return []
@@ -380,16 +372,22 @@ def generate_gif(text: str, gif_path: str, max_new_tokens: int = 1024, temperatu
380
  buffer = buffer.replace("<|im_end|>", "")
381
  time.sleep(0.01)
382
  yield buffer, buffer
383
-
384
- # --- Gradio Interface ---
385
- image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
386
- video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
387
- pdf_examples = [["examples/sample-doc.pdf"]]
 
 
 
 
 
 
 
388
 
389
  with gr.Blocks(theme=thistle_theme, css=css) as demo:
390
  pdf_state = gr.State(value=get_initial_pdf_state())
391
- # Update: Added elem_id for CSS targeting
392
- gr.Markdown("## **Qwen3-VL-Demo**", elem_id="main-title")
393
  with gr.Row():
394
  with gr.Column(scale=2):
395
  with gr.Tabs():
@@ -422,10 +420,12 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
422
  gif_query = gr.Textbox(label="Query Input", placeholder="e.g., 'What is happening in this gif?'")
423
  gif_upload = gr.Image(type="filepath", label="Upload GIF", height=290)
424
  gif_submit = gr.Button("Submit", variant="primary")
 
425
 
426
  with gr.TabItem("Caption"):
427
  caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
428
  caption_submit = gr.Button("Generate Caption", variant="primary")
 
429
 
430
  with gr.Accordion("Advanced options", open=False):
431
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -435,13 +435,11 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
435
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
436
 
437
  with gr.Column(scale=3):
438
- # Update: Added elem_id for CSS targeting
439
  gr.Markdown("## Output", elem_id="output-title")
440
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
441
  with gr.Accordion("(Result.md)", open=False):
442
  markdown_output = gr.Markdown(label="(Result.Md)")
443
 
444
- # Event handlers
445
  image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
446
  video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
447
  pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
 
16
  from PIL import Image
17
  import cv2
18
  import requests
19
+ import fitz
20
 
21
  from transformers import (
22
  Qwen3VLMoeForConditionalGeneration,
 
25
  )
26
  from transformers.image_utils import load_image
27
 
 
28
  from gradio.themes import Soft
29
  from gradio.themes.utils import colors, fonts, sizes
30
 
 
49
  primary_hue: colors.Color | str = colors.gray,
50
  secondary_hue: colors.Color | str = colors.thistle,
51
  neutral_hue: colors.Color | str = colors.slate,
 
52
  text_size: sizes.Size | str = sizes.text_lg,
53
  font: fonts.Font | str | Iterable[fonts.Font | str] = (
54
  fonts.GoogleFont("Inconsolata"), "Arial", "sans-serif",
 
103
 
104
  thistle_theme = ThistleTheme()
105
 
 
106
  css = """
 
107
  #main-title h1 {
108
+ font-size: 2.3em !important;
109
  }
110
  #output-title h2 {
111
+ font-size: 2.1em !important;
112
  }
113
  :root {
114
  --color-grey-50: #f9fafb;
 
156
  @media (max-height: 1280px) { div.block.chatbot { max-height: 800px !important; } }
157
  """
158
 
 
159
  MAX_MAX_NEW_TOKENS = 4096
160
  DEFAULT_MAX_NEW_TOKENS = 2048
161
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
170
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
171
  print("Using device:", device)
172
 
 
173
  MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
174
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
175
  model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
 
178
  dtype=torch.float16
179
  ).to(device).eval()
180
 
 
 
181
  def extract_gif_frames(gif_path: str):
182
  if not gif_path:
183
  return []
 
372
  buffer = buffer.replace("<|im_end|>", "")
373
  time.sleep(0.01)
374
  yield buffer, buffer
375
+
376
+ image_examples = [["Perform OCR on the image precisely and reconstruct it correctly...", "examples/images/1.jpg"],
377
+ ["Caption the image. Describe the safety measures shown in the image. Conclude whether the situation is (safe or unsafe)...", "examples/images/2.jpg"],
378
+ ["Solve the problem...", "examples/images/3.png"]]
379
+ video_examples = [["Explain the Ad video in detail.", "examples/videos/1.mp4"],
380
+ ["Explain the video in detail.", "examples/videos/2.mp4"]]
381
+ pdf_examples = [["Extract the content precisely.", "examples/pdfs/doc1.pdf"],
382
+ ["Analyze and provide a short report.", "examples/pdfs/doc2.pdf"]]
383
+ gif_examples = [["Describe this GIF.", "examples/gifs/1.gif"],
384
+ ["Describe this GIF.", "examples/gifs/2.gif"]]
385
+ caption_examples = [["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/thailand.jpg"],
386
+ ["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/candy.JPG"]]
387
 
388
  with gr.Blocks(theme=thistle_theme, css=css) as demo:
389
  pdf_state = gr.State(value=get_initial_pdf_state())
390
+ gr.Markdown("# **Qwen3VL HF Demo**", elem_id="main-title")
 
391
  with gr.Row():
392
  with gr.Column(scale=2):
393
  with gr.Tabs():
 
420
  gif_query = gr.Textbox(label="Query Input", placeholder="e.g., 'What is happening in this gif?'")
421
  gif_upload = gr.Image(type="filepath", label="Upload GIF", height=290)
422
  gif_submit = gr.Button("Submit", variant="primary")
423
+ gr.Examples(examples=gif_examples, inputs=[gif_query, gif_upload])
424
 
425
  with gr.TabItem("Caption"):
426
  caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
427
  caption_submit = gr.Button("Generate Caption", variant="primary")
428
+ gr.Examples(examples=caption_examples, inputs=[caption_image_upload])
429
 
430
  with gr.Accordion("Advanced options", open=False):
431
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
 
435
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
436
 
437
  with gr.Column(scale=3):
 
438
  gr.Markdown("## Output", elem_id="output-title")
439
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
440
  with gr.Accordion("(Result.md)", open=False):
441
  markdown_output = gr.Markdown(label="(Result.Md)")
442
 
 
443
  image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
444
  video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
445
  pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])