prithivMLmods committed on
Commit
c827e70
·
verified ·
1 Parent(s): 63d52ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -21
app.py CHANGED
@@ -29,20 +29,11 @@ from transformers.image_utils import load_image
29
  from gradio.themes import Soft
30
  from gradio.themes.utils import colors, fonts, sizes
31
 
32
- # 1. Define the new "Thistle" color palette
33
  colors.thistle = colors.Color(
34
  name="thistle",
35
- c50="#F9F5F9",
36
- c100="#F0E8F1",
37
- c200="#E7DBE8",
38
- c300="#DECEE0",
39
- c400="#D2BFD8",
40
- c500="#D8BFD8", # Base color: Thistle
41
- c600="#B59CB7",
42
- c700="#927996",
43
- c800="#6F5675",
44
- c900="#4C3454",
45
- c950="#291233",
46
  )
47
 
48
  colors.red_gray = colors.Color(
@@ -52,7 +43,6 @@ colors.red_gray = colors.Color(
52
  c800="#732d2d", c900="#5f2626", c950="#4d2020",
53
  )
54
 
55
- # 2. Create the new theme class using the Thistle palette
56
  class ThistleTheme(Soft):
57
  def __init__(
58
  self,
@@ -187,6 +177,26 @@ model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
187
  ).to(device).eval()
188
 
189
  # --- Backend Functions ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  def downsample_video(video_path):
191
  vidcap = cv2.VideoCapture(video_path)
192
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -318,13 +328,11 @@ def generate_pdf(text: str, state: Dict[str, Any], max_new_tokens: int = 2048, t
318
  time.sleep(0.01)
319
  full_response += page_header + page_buffer + "\n\n"
320
 
321
- # 3. New backend function for the "Caption" tab
322
  @spaces.GPU
323
  def generate_caption(image: Image.Image, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
324
  if image is None:
325
  yield "Please upload an image to caption.", "Please upload an image to caption."
326
  return
327
-
328
  system_prompt = (
329
  "You are an AI assistant that rigorously follows this response protocol: For every input image, your primary "
330
  "task is to write a precise caption that captures the essence of the image in clear, concise, and contextually "
@@ -334,7 +342,6 @@ def generate_caption(image: Image.Image, max_new_tokens: int = 1024, temperature
334
  "subjective interpretation unless explicitly required. Do not reference the rules or instructions in the output; "
335
  "only return the formatted caption, attributes, and class_name."
336
  )
337
-
338
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": system_prompt}]}]
339
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
340
  inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
@@ -348,6 +355,31 @@ def generate_caption(image: Image.Image, max_new_tokens: int = 1024, temperature
348
  time.sleep(0.01)
349
  yield buffer, buffer
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  # --- Gradio Interface ---
352
  image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
353
  video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
@@ -384,12 +416,15 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
384
  page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
385
  next_page_btn = gr.Button("Next ▶")
386
 
387
- # 4. Add the new "Caption" tab to the UI
 
 
 
 
388
  with gr.TabItem("Caption"):
389
  caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
390
  caption_submit = gr.Button("Generate Caption", variant="primary")
391
 
392
-
393
  with gr.Accordion("Advanced options", open=False):
394
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
395
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -407,12 +442,12 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
407
  image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
408
  video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
409
  pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
 
 
 
410
  pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
411
  prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
412
  next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
413
-
414
- # 5. Add the event handler for the new caption button
415
- caption_submit.click(fn=generate_caption, inputs=[caption_image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
416
 
417
  if __name__ == "__main__":
418
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
 
29
  from gradio.themes import Soft
30
  from gradio.themes.utils import colors, fonts, sizes
31
 
 
32
  colors.thistle = colors.Color(
33
  name="thistle",
34
+ c50="#F9F5F9", c100="#F0E8F1", c200="#E7DBE8", c300="#DECEE0",
35
+ c400="#D2BFD8", c500="#D8BFD8", c600="#B59CB7", c700="#927996",
36
+ c800="#6F5675", c900="#4C3454", c950="#291233",
 
 
 
 
 
 
 
 
37
  )
38
 
39
  colors.red_gray = colors.Color(
 
43
  c800="#732d2d", c900="#5f2626", c950="#4d2020",
44
  )
45
 
 
46
  class ThistleTheme(Soft):
47
  def __init__(
48
  self,
 
177
  ).to(device).eval()
178
 
179
  # --- Backend Functions ---
180
+
181
+ def extract_gif_frames(gif_path: str):
182
+ """
183
+ Extracts and downsamples frames from a GIF file.
184
+ """
185
+ if not gif_path:
186
+ return []
187
+
188
+ with Image.open(gif_path) as gif:
189
+ total_frames = gif.n_frames
190
+ frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
191
+
192
+ frames = []
193
+ for i in frame_indices:
194
+ gif.seek(i)
195
+ # Convert frame to RGB and append a copy
196
+ frames.append(gif.convert("RGB").copy())
197
+
198
+ return frames
199
+
200
  def downsample_video(video_path):
201
  vidcap = cv2.VideoCapture(video_path)
202
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
 
328
  time.sleep(0.01)
329
  full_response += page_header + page_buffer + "\n\n"
330
 
 
331
  @spaces.GPU
332
  def generate_caption(image: Image.Image, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
333
  if image is None:
334
  yield "Please upload an image to caption.", "Please upload an image to caption."
335
  return
 
336
  system_prompt = (
337
  "You are an AI assistant that rigorously follows this response protocol: For every input image, your primary "
338
  "task is to write a precise caption that captures the essence of the image in clear, concise, and contextually "
 
342
  "subjective interpretation unless explicitly required. Do not reference the rules or instructions in the output; "
343
  "only return the formatted caption, attributes, and class_name."
344
  )
 
345
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": system_prompt}]}]
346
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
347
  inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
 
355
  time.sleep(0.01)
356
  yield buffer, buffer
357
 
358
+ @spaces.GPU
359
+ def generate_gif(text: str, gif_path: str, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
360
+ if gif_path is None:
361
+ yield "Please upload a GIF.", "Please upload a GIF."
362
+ return
363
+ frames = extract_gif_frames(gif_path)
364
+ if not frames:
365
+ yield "Could not process GIF.", "Could not process GIF."
366
+ return
367
+ messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
368
+ for frame in frames:
369
+ messages[0]["content"].insert(0, {"type": "image"})
370
+ prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
371
+ inputs = processor_q3vl(text=[prompt_full], images=frames, return_tensors="pt", padding=True).to(device)
372
+ streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
373
+ generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": True, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty}
374
+ thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
375
+ thread.start()
376
+ buffer = ""
377
+ for new_text in streamer:
378
+ buffer += new_text
379
+ buffer = buffer.replace("<|im_end|>", "")
380
+ time.sleep(0.01)
381
+ yield buffer, buffer
382
+
383
  # --- Gradio Interface ---
384
  image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
385
  video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
 
416
  page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
417
  next_page_btn = gr.Button("Next ▶")
418
 
419
+ with gr.TabItem("Gif Inference"):
420
+ gif_query = gr.Textbox(label="Query Input", placeholder="e.g., 'What is happening in this gif?'")
421
+ gif_upload = gr.Image(type="filepath", label="Upload GIF", height=290)
422
+ gif_submit = gr.Button("Submit", variant="primary")
423
+
424
  with gr.TabItem("Caption"):
425
  caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
426
  caption_submit = gr.Button("Generate Caption", variant="primary")
427
 
 
428
  with gr.Accordion("Advanced options", open=False):
429
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
430
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
 
442
  image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
443
  video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
444
  pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
445
+ gif_submit.click(fn=generate_gif, inputs=[gif_query, gif_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
446
+ caption_submit.click(fn=generate_caption, inputs=[caption_image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
447
+
448
  pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
449
  prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
450
  next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
 
 
 
451
 
452
  if __name__ == "__main__":
453
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)