prithivMLmods committed
Commit 16e37bd · verified · 1 Parent(s): 81d2b64

Update app.py

Files changed (1): app.py (+13 −11)
app.py CHANGED
@@ -87,7 +87,8 @@ model_md3 = AutoModelForCausalLM.from_pretrained(
     torch_dtype=torch.bfloat16,
     device_map={"": "cuda"},
 )
-tokenizer_md3 = AutoTokenizer.from_pretrained(MODEL_ID_MD3)
+# FIXED: Added trust_remote_code=True to the tokenizer loading
+tokenizer_md3 = AutoTokenizer.from_pretrained(MODEL_ID_MD3, trust_remote_code=True)
 
 
 # --- PDF Generation and Preview Utility Function ---
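The fix is a single keyword: Moondream3 ships its modeling and tokenizer code inside the model repository rather than in the transformers package, so the tokenizer, like the model, has to be loaded with trust_remote_code=True for AutoTokenizer to resolve the custom class. A minimal sketch of the loading pattern; the concrete repo id is an assumption for illustration, since MODEL_ID_MD3 is defined outside this hunk:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Assumed repo id for illustration; app.py defines MODEL_ID_MD3 elsewhere.
    MODEL_ID_MD3 = "moondream/moondream3-preview"

    # trust_remote_code=True lets transformers import the custom model/tokenizer
    # classes bundled in the repository instead of looking for built-in ones.
    model_md3 = AutoModelForCausalLM.from_pretrained(
        MODEL_ID_MD3,
        torch_dtype=torch.bfloat16,
        device_map={"": "cuda"},
        trust_remote_code=True,
    )
    tokenizer_md3 = AutoTokenizer.from_pretrained(MODEL_ID_MD3, trust_remote_code=True)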
@@ -182,10 +183,11 @@ def process_document_stream(
 
     # --- Special Handling for Moondream3 ---
     if model_name == "Moondream3":
-        prompt_full = f"<image>\n{prompt_input}"
+        # Moondream3 has a different inference method
+        enc_image = model_md3.encode_image(image)
         answer = model_md3.answer_question(
-            model_md3.encode_image(image),
-            prompt_full,
+            enc_image,
+            prompt_input,
             tokenizer=tokenizer_md3
         )
         yield answer, answer
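The rewritten branch also drops the hand-built "<image>" placeholder and passes the user prompt through unchanged, since the image now reaches answer_question() as the embedding returned by encode_image() rather than via a prompt token. A short sketch of the branch in isolation, reusing the names from the diff (encode_image and answer_question come from the model's remote code, not from the standard transformers API; the helper name is hypothetical):

    def answer_with_moondream(image, prompt_input):
        # Encode the PIL image once, then query the embedding with the raw prompt.
        enc_image = model_md3.encode_image(image)
        return model_md3.answer_question(
            enc_image,
            prompt_input,
            tokenizer=tokenizer_md3,
        )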
@@ -255,14 +257,14 @@ def create_gradio_interface():
         # Left Column (Inputs)
         with gr.Column(scale=1):
             model_choice = gr.Dropdown(
-                choices=["Moondream3", "Camel-Doc-OCR-062825", "MinerU2.5-2509-1.2B", "Video-MTR"],
-                label="Select Model", value= "Moondream3"
+                choices=["Camel-Doc-OCR-062825", "MinerU2.5-2509-1.2B", "Video-MTR", "Moondream3"],
+                label="Select Model", value= "Camel-Doc-OCR-062825"
             )
 
             prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter the prompt")
             image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
 
-            with gr.Accordion("Advanced Settings (PDF)", open=False):
+            with gr.Accordion("Advanced Settings", open=False):
                 max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
@@ -285,11 +287,11 @@ def create_gradio_interface():
                 raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
                 with gr.Row():
                     examples = gr.Examples(
-                        examples=["examples/1.png", "examples/2.png", "examples/3.png",
-                                  "examples/4.png", "examples/5.png"],
+                        examples=[["examples/1.png"], ["examples/2.png"], ["examples/3.png"],
+                                  ["examples/4.png"], ["examples/5.png"], ["examples/6.png"]],
                         inputs=image_input, label="Examples"
                     )
-                gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/Tiny-VLMs-Lab/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")
+                gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")
 
             with gr.Tab("📰 README.md"):
                 with gr.Accordion("(Result.md)", open=True):
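Beyond adding examples/6.png, the examples list switches from flat paths to one nested list per example. gr.Examples treats each inner list as a single example row whose length must match the number of components in inputs; a flat list of values also works when there is exactly one input, but the nested form is explicit and keeps working if more inputs are wired in later. A minimal, self-contained sketch (the file paths are placeholders):

    import gradio as gr

    with gr.Blocks() as demo:
        image_input = gr.Image(label="Upload Image", type="pil", sources=["upload"])
        # Each inner list is one example row; its length matches len(inputs) (here 1).
        gr.Examples(
            examples=[["examples/1.png"], ["examples/2.png"], ["examples/3.png"]],
            inputs=[image_input],
            label="Examples",
        )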
@@ -324,4 +326,4 @@ def create_gradio_interface():
 
 if __name__ == "__main__":
     demo = create_gradio_interface()
-    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)
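The only change to the entry point is dropping mcp_server=True, the flag that exposes the app as an MCP server in Gradio releases that support it; queueing and the other launch flags are untouched. A lightly annotated version of the resulting block (the comments describe standard Gradio behavior and are not part of the commit):

    if __name__ == "__main__":
        demo = create_gradio_interface()
        # queue(max_size=50): limit how many requests may wait in the queue at once.
        # share=True requests a public share link, ssr_mode=False disables server-side
        # rendering, and show_error=True surfaces exceptions in the UI.
        demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)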
 