Spaces:

prithivMLmods
/

Multimodal-Edge-Node

Running on Zero

App Files Files Community

prithivMLmods committed 25 days ago

Commit

aaa8883

verified ·

1 Parent(s): de9c364

update app

Browse files

Files changed (1) hide show

app.py +158 -0

app.py ADDED Viewed

	@@ -0,0 +1,158 @@

+import gradio as gr
+import torch
+import spaces
+import json
+import ast
+import re
+from threading import Thread
+from PIL import Image
+from transformers import (
+    Qwen3_5ForConditionalGeneration,
+    AutoProcessor,
+    TextIteratorStreamer,
+)
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = (
+    torch.bfloat16
+    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
+    else torch.float16
+)
+MODEL_NAME = "Qwen/Qwen3.5-2B"
+CATEGORIES = ["Query", "Caption", "Point", "Detect"]
+print(f"Loading model: {MODEL_NAME} ...")
+qwen_model = Qwen3_5ForConditionalGeneration.from_pretrained(
+    MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
+).eval()
+qwen_processor = AutoProcessor.from_pretrained(MODEL_NAME)
+print("Model loaded.")
+def safe_parse_json(text: str):
+    text = text.strip()
+    text = re.sub(r"^```(json)?", "", text)
+    text = re.sub(r"```$", "", text)
+    text = text.strip()
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+    try:
+        return ast.literal_eval(text)
+    except Exception:
+        return {}
+def on_category_change(category: str):
+    placeholders = {
+        "Query": "e.g., Count the total number of boats and describe the environment.",
+        "Caption": "e.g., short, normal, detailed",
+        "Point": "e.g., The gun held by the person.",
+        "Detect": "e.g., The headlight of the car.",
+    }
+    return gr.Textbox(placeholder=placeholders.get(category, "Enter your prompt here."))
+@spaces.GPU
+def process_inputs(image, category, prompt):
+    if image is None:
+        raise gr.Error("Please upload an image.")
+    if not prompt or not prompt.strip():
+        raise gr.Error("Please provide a prompt.")
+    image = image.convert("RGB")
+    image.thumbnail((512, 512))
+    if category == "Query":
+        full_prompt = prompt
+    elif category == "Caption":
+        full_prompt = f"Provide a {prompt} length caption for the image."
+    elif category == "Point":
+        full_prompt = f"Provide 2d point coordinates for {prompt}. Report in JSON format."
+    elif category == "Detect":
+        full_prompt = f"Provide bounding box coordinates for {prompt}. Report in JSON format."
+    else:
+        full_prompt = prompt
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": full_prompt},
+            ],
+        }
+    ]
+    text = qwen_processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = qwen_processor(
+        text=[text], images=[image], return_tensors="pt", padding=True
+    ).to(qwen_model.device)
+    streamer = TextIteratorStreamer(
+        qwen_processor.tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True,
+        timeout=120,
+    )
+    thread = Thread(
+        target=qwen_model.generate,
+        kwargs=dict(
+            **inputs,
+            streamer=streamer,
+            max_new_tokens=1024,
+            use_cache=True,
+            temperature=1.5,
+            min_p=0.1,
+        ),
+    )
+    thread.start()
+    full_text = ""
+    for tok in streamer:
+        full_text += tok
+        yield full_text
+    thread.join()
+with gr.Blocks() as demo:
+    gr.Markdown("## Qwen 3.5 - Image Understanding")
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(type="pil", label="Upload Image", height=350)
+            category_select = gr.Dropdown(
+                choices=CATEGORIES,
+                value="Query",
+                label="Task Category",
+                interactive=True,
+            )
+            prompt_input = gr.Textbox(
+                placeholder="e.g., Count the total number of boats and describe the environment.",
+                label="Prompt",
+                lines=3,
+            )
+            run_btn = gr.Button("Run", variant="primary")
+        with gr.Column():
+            output_text = gr.Textbox(label="Output", lines=20, interactive=False)
+    category_select.change(
+        fn=on_category_change,
+        inputs=[category_select],
+        outputs=[prompt_input],
+    )
+    run_btn.click(
+        fn=process_inputs,
+        inputs=[image_input, category_select, prompt_input],
+        outputs=[output_text],
+    )
+if __name__ == "__main__":
+    demo.launch(show_error=True, ssr_mode=False)