Spaces:

prithivMLmods
/

Multimodal-VLM-Thinking

Running on Zero

App Files Community

prithivMLmods committed on Oct 11

Commit

63d52ce

verified ·

1 Parent(s): 24aaf5e

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -30

app.py CHANGED Viewed

@@ -29,20 +29,20 @@ from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
-# Define a new "Thistle" color palette
 colors.thistle = colors.Color(
     name="thistle",
-    c50="#FCF9FD",
-    c100="#F5F0F8",
-    c200="#EBE1F1",
-    c300="#E1D1E9",
-    c400="#D8BFD8", # Thistle Base
-    c500="#C5A9C2",
-    c600="#B194AC",
-    c700="#9C7F96",
-    c800="#876A80",
-    c900="#72556A",
-    c950="#5D4054",
 )
 colors.red_gray = colors.Color(
@@ -52,6 +52,7 @@ colors.red_gray = colors.Color(
     c800="#732d2d", c900="#5f2626", c950="#4d2020",
 )
 class ThistleTheme(Soft):
     def __init__(
         self,
@@ -80,10 +81,10 @@ class ThistleTheme(Soft):
             background_fill_primary_dark="*primary_900",
             body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
             body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
-            button_primary_text_color="white",
-            button_primary_text_color_hover="black",
-            button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_500)",
-            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_300, *secondary_400)",
             button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
             button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
             button_secondary_text_color="black",
@@ -317,32 +318,35 @@ def generate_pdf(text: str, state: Dict[str, Any], max_new_tokens: int = 2048, t
             time.sleep(0.01)
         full_response += page_header + page_buffer + "\n\n"
 @spaces.GPU
-def generate_caption(image: Image.Image):
-    """
-    Generates a caption and attributes for a single image based on a standard system prompt.
-    """
     if image is None:
-        yield "Please upload an image to generate a caption."
         return
-    system_prompt = "You are an AI assistant that rigorously follows this response protocol: For every input image, your primary task is to write a precise caption that captures the essence of the image in clear, concise, and contextually accurate language. Along with the caption, provide a structured set of attributes describing the visual elements, including details such as objects, people, actions, colors, environment, mood, and other notable characteristics. Ensure captions are precise, neutral, and descriptive, avoiding unnecessary elaboration or subjective interpretation unless explicitly required. Do not reference the rules or instructions in the output; only return the formatted caption, attributes, and class_name."
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": system_prompt}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
     streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": DEFAULT_MAX_NEW_TOKENS}
     thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
-        yield buffer
 # --- Gradio Interface ---
 image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
@@ -380,10 +384,11 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
                                 page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
                                 next_page_btn = gr.Button("Next ▶")
                 with gr.TabItem("Caption"):
-                    caption_image_upload = gr.Image(type="pil", label="Upload Image for Captioning", height=350)
                     caption_submit = gr.Button("Generate Caption", variant="primary")
-                    caption_output = gr.Markdown(label="Generated Caption and Attributes")
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -405,7 +410,9 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
     pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
     prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
     next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
-    caption_submit.click(fn=generate_caption, inputs=[caption_image_upload], outputs=[caption_output])
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)

 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
+# 1. Define the new "Thistle" color palette
 colors.thistle = colors.Color(
     name="thistle",
+    c50="#F9F5F9",
+    c100="#F0E8F1",
+    c200="#E7DBE8",
+    c300="#DECEE0",
+    c400="#D2BFD8",
+    c500="#D8BFD8",  # Base color: Thistle
+    c600="#B59CB7",
+    c700="#927996",
+    c800="#6F5675",
+    c900="#4C3454",
+    c950="#291233",
 )
 colors.red_gray = colors.Color(
     c800="#732d2d", c900="#5f2626", c950="#4d2020",
 )
+# 2. Create the new theme class using the Thistle palette
 class ThistleTheme(Soft):
     def __init__(
         self,
             background_fill_primary_dark="*primary_900",
             body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
             body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
+            button_primary_text_color="black",
+            button_primary_text_color_hover="white",
+            button_primary_background_fill="linear-gradient(90deg, *secondary_400, *secondary_400)",
+            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_600)",
             button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
             button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
             button_secondary_text_color="black",
             time.sleep(0.01)
         full_response += page_header + page_buffer + "\n\n"
+# 3. New backend function for the "Caption" tab
 @spaces.GPU
+def generate_caption(image: Image.Image, max_new_tokens: int = 1024, temperature: float = 0.6, top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
     if image is None:
+        yield "Please upload an image to caption.", "Please upload an image to caption."
         return
+    system_prompt = (
+        "You are an AI assistant that rigorously follows this response protocol: For every input image, your primary "
+        "task is to write a precise caption that captures the essence of the image in clear, concise, and contextually "
+        "accurate language. Along with the caption, provide a structured set of attributes describing the visual "
+        "elements, including details such as objects, people, actions, colors, environment, mood, and other notable "
+        "characteristics. Ensure captions are precise, neutral, and descriptive, avoiding unnecessary elaboration or "
+        "subjective interpretation unless explicitly required. Do not reference the rules or instructions in the output; "
+        "only return the formatted caption, attributes, and class_name."
+    )
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": system_prompt}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
     streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
+        yield buffer, buffer
 # --- Gradio Interface ---
 image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
                                 page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
                                 next_page_btn = gr.Button("Next ▶")
+                # 4. Add the new "Caption" tab to the UI
                 with gr.TabItem("Caption"):
+                    caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
                     caption_submit = gr.Button("Generate Caption", variant="primary")
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
     pdf_upload.change(fn=load_and_preview_pdf, inputs=[pdf_upload], outputs=[pdf_preview_img, pdf_state, page_info])
     prev_page_btn.click(fn=lambda s: navigate_pdf_page("prev", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
     next_page_btn.click(fn=lambda s: navigate_pdf_page("next", s), inputs=[pdf_state], outputs=[pdf_preview_img, pdf_state, page_info])
+    # 5. Add the event handler for the new caption button
+    caption_submit.click(fn=generate_caption, inputs=[caption_image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)