Spaces:

prithivMLmods
/

Multimodal-VLM-Thinking

Running on Zero

App Files Community

prithivMLmods committed on Oct 11

Commit

c827e70

verified ·

1 Parent(s): 63d52ce

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -21

app.py CHANGED Viewed

@@ -29,20 +29,11 @@ from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
-# 1. Define the new "Thistle" color palette
 colors.thistle = colors.Color(
     name="thistle",
-    c50="#F9F5F9",
-    c100="#F0E8F1",
-    c200="#E7DBE8",
-    c300="#DECEE0",
-    c400="#D2BFD8",
-    c500="#D8BFD8",  # Base color: Thistle
-    c600="#B59CB7",
-    c700="#927996",
-    c800="#6F5675",
-    c900="#4C3454",
-    c950="#291233",
 )
 colors.red_gray = colors.Color(
@@ -52,7 +43,6 @@ colors.red_gray = colors.Color(
     c800="#732d2d", c900="#5f2626", c950="#4d2020",
 )
-# 2. Create the new theme class using the Thistle palette
 class ThistleTheme(Soft):
     def __init__(
         self,
@@ -187,6 +177,26 @@ model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 # --- Backend Functions ---
 def downsample_video(video_path):
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -318,13 +328,11 @@ def generate_pdf(text: str, state: Dict[str, Any], max_new_tokens: int = 2048, t
             time.sleep(0.01)
         full_response += page_header + page_buffer + "\n\n"
-# 3. New backend function for the "Caption" tab
 @spaces.GPU
 def generate_caption(image: Image.Image, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
     if image is None:
         yield "Please upload an image to caption.", "Please upload an image to caption."
         return
     system_prompt = (
         "You are an AI assistant that rigorously follows this response protocol: For every input image, your primary "
         "task is to write a precise caption that captures the essence of the image in clear, concise, and contextually "
@@ -334,7 +342,6 @@ def generate_caption(image: Image.Image, max_new_tokens: int = 1024, temperature
         "subjective interpretation unless explicitly required. Do not reference the rules or instructions in the output; "
         "only return the formatted caption, attributes, and class_name."
     )
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": system_prompt}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
@@ -348,6 +355,31 @@ def generate_caption(image: Image.Image, max_new_tokens: int = 1024, temperature
         time.sleep(0.01)
         yield buffer, buffer
 # --- Gradio Interface ---
 image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
 video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
@@ -384,12 +416,15 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
                                 page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
                                 next_page_btn = gr.Button("Next ▶")
-                # 4. Add the new "Caption" tab to the UI
                 with gr.TabItem("Caption"):
                     caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
                     caption_submit = gr.Button("Generate Caption", variant="primary")
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -407,12 +442,12 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
     image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
     video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
     pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
     pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
     prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
     next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
-    # 5. Add the event handler for the new caption button
-    caption_submit.click(fn=generate_caption, inputs=[caption_image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)

 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 colors.thistle = colors.Color(
     name="thistle",
+    c50="#F9F5F9", c100="#F0E8F1", c200="#E7DBE8", c300="#DECEE0",
+    c400="#D2BFD8", c500="#D8BFD8", c600="#B59CB7", c700="#927996",
+    c800="#6F5675", c900="#4C3454", c950="#291233",
 )
 colors.red_gray = colors.Color(
     c800="#732d2d", c900="#5f2626", c950="#4d2020",
 )
 class ThistleTheme(Soft):
     def __init__(
         self,
 ).to(device).eval()
 # --- Backend Functions ---
+def extract_gif_frames(gif_path: str):
+    """
+    Extracts and downsamples frames from a GIF file.
+    """
+    if not gif_path:
+        return []
+    with Image.open(gif_path) as gif:
+        total_frames = gif.n_frames
+        frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
+        frames = []
+        for i in frame_indices:
+            gif.seek(i)
+            # Convert frame to RGB and append a copy
+            frames.append(gif.convert("RGB").copy())
+    return frames
 def downsample_video(video_path):
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
             time.sleep(0.01)
         full_response += page_header + page_buffer + "\n\n"
 @spaces.GPU
 def generate_caption(image: Image.Image, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
     if image is None:
         yield "Please upload an image to caption.", "Please upload an image to caption."
         return
     system_prompt = (
         "You are an AI assistant that rigorously follows this response protocol: For every input image, your primary "
         "task is to write a precise caption that captures the essence of the image in clear, concise, and contextually "
         "subjective interpretation unless explicitly required. Do not reference the rules or instructions in the output; "
         "only return the formatted caption, attributes, and class_name."
     )
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": system_prompt}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
         time.sleep(0.01)
         yield buffer, buffer
+@spaces.GPU
+def generate_gif(text: str, gif_path: str, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
+    if gif_path is None:
+        yield "Please upload a GIF.", "Please upload a GIF."
+        return
+    frames = extract_gif_frames(gif_path)
+    if not frames:
+        yield "Could not process GIF.", "Could not process GIF."
+        return
+    messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+    for frame in frames:
+        messages[0]["content"].insert(0, {"type": "image"})
+    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor_q3vl(text=[prompt_full], images=frames, return_tensors="pt", padding=True).to(device)
+    streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": True, "temperature": temperature, "top_p": top_p, "top_k": top_k, "repetition_penalty": repetition_penalty}
+    thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer, buffer
 # --- Gradio Interface ---
 image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
 video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
                                 page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
                                 next_page_btn = gr.Button("Next ▶")
+                with gr.TabItem("Gif Inference"):
+                    gif_query = gr.Textbox(label="Query Input", placeholder="e.g., 'What is happening in this gif?'")
+                    gif_upload = gr.Image(type="filepath", label="Upload GIF", height=290)
+                    gif_submit = gr.Button("Submit", variant="primary")
                 with gr.TabItem("Caption"):
                     caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
                     caption_submit = gr.Button("Generate Caption", variant="primary")
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
     image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
     video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
     pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
+    gif_submit.click(fn=generate_gif, inputs=[gif_query, gif_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
+    caption_submit.click(fn=generate_caption, inputs=[caption_image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
     pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
     prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
     next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)