import os
import io
import json
import ast
import re
import uuid
import base64
import threading
from pathlib import Path
from typing import Optional
import spaces
import torch
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from fastapi import FastAPI, Request, UploadFile, File, Form
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
from transformers import (
Qwen2_5_VLForConditionalGeneration,
Qwen3_5ForConditionalGeneration,
Qwen3VLForConditionalGeneration,
Gemma4ForConditionalGeneration,
AutoProcessor,
AutoModelForImageTextToText,
TextIteratorStreamer,
)
from qwen_vl_utils import process_vision_info
# --- App Configuration & Initialization ---
app = FastAPI()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = (
torch.bfloat16
if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
else torch.float16
)
QWEN_VL_2B_MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"
QWEN_VL_4B_MODEL_NAME = "Qwen/Qwen3-VL-4B-Instruct"
QWEN_4B_UNREDACTED_NAME = "prithivMLmods/Qwen3.5-4B-Unredacted-MAX"
QWEN_4B_MODEL_NAME = "Qwen/Qwen3.5-4B"
QWEN_2B_MODEL_NAME = "Qwen/Qwen3.5-2B"
LFM_450_MODEL_NAME = "LiquidAI/LFM2.5-VL-450M"
GEMMA4_E2B_NAME = "google/gemma-4-E2B-it"
LFM_16_MODEL_NAME = "LiquidAI/LFM2.5-VL-1.6B"
QWEN_UNREDACTED_NAME = "prithivMLmods/Qwen3.5-2B-Unredacted-MAX"
QWEN25_VL_3B_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
# ── Qwen3-VL-2B-Instruct ────────────────────────────────
print(f"Loading Qwen3-VL-2B model: {QWEN_VL_2B_MODEL_NAME} on {DEVICE}...")
try:
qwen_vl_2b_model = Qwen3VLForConditionalGeneration.from_pretrained(
QWEN_VL_2B_MODEL_NAME, trust_remote_code=True, torch_dtype=torch.bfloat16,
).to(DEVICE).eval()
qwen_vl_2b_processor = AutoProcessor.from_pretrained(QWEN_VL_2B_MODEL_NAME, trust_remote_code=True)
print("Qwen3-VL-2B model loaded successfully.")
except Exception as e:
print(f"Warning: Qwen3-VL-2B model loading failed. Error: {e}")
qwen_vl_2b_model = None
qwen_vl_2b_processor = None
# ── Qwen3-VL-4B-Instruct ────────────────────────────────
print(f"Loading Qwen3-VL-4B model: {QWEN_VL_4B_MODEL_NAME} on {DEVICE}...")
try:
qwen_vl_4b_model = Qwen3VLForConditionalGeneration.from_pretrained(
QWEN_VL_4B_MODEL_NAME, trust_remote_code=True, torch_dtype=torch.bfloat16,
).to(DEVICE).eval()
qwen_vl_4b_processor = AutoProcessor.from_pretrained(QWEN_VL_4B_MODEL_NAME, trust_remote_code=True)
print("Qwen3-VL-4B model loaded successfully.")
except Exception as e:
print(f"Warning: Qwen3-VL-4B model loading failed. Error: {e}")
qwen_vl_4b_model = None
qwen_vl_4b_processor = None
# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
print(f"Loading Qwen3.5-4B-Unredacted-MAX: {QWEN_4B_UNREDACTED_NAME} on {DEVICE}...")
try:
qwen_4b_unredacted_model = Qwen3_5ForConditionalGeneration.from_pretrained(
QWEN_4B_UNREDACTED_NAME, torch_dtype=DTYPE, device_map=DEVICE,
).eval()
qwen_4b_unredacted_processor = AutoProcessor.from_pretrained(QWEN_4B_UNREDACTED_NAME)
print("Qwen3.5-4B-Unredacted-MAX model loaded successfully.")
except Exception as e:
print(f"Warning: Qwen3.5-4B-Unredacted-MAX model loading failed. Error: {e}")
qwen_4b_unredacted_model = None
qwen_4b_unredacted_processor = None
# ── Qwen3.5-4B ──────────────────────────────────────────
print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
try:
qwen_4b_model = Qwen3_5ForConditionalGeneration.from_pretrained(
QWEN_4B_MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
).eval()
qwen_4b_processor = AutoProcessor.from_pretrained(QWEN_4B_MODEL_NAME)
print("Qwen3.5-4B model loaded successfully.")
except Exception as e:
print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
qwen_4b_model = None
qwen_4b_processor = None
# ── Qwen3.5-2B ──────────────────────────────────────────
print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
try:
qwen_2b_model = Qwen3_5ForConditionalGeneration.from_pretrained(
QWEN_2B_MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
).eval()
qwen_2b_processor = AutoProcessor.from_pretrained(QWEN_2B_MODEL_NAME)
print("Qwen3.5-2B model loaded successfully.")
except Exception as e:
print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
qwen_2b_model = None
qwen_2b_processor = None
# ── LFM2.5-VL-450M ──────────────────────────────────────
print(f"Loading LFM-450M model: {LFM_450_MODEL_NAME} on {DEVICE}...")
try:
lfm_450_model = AutoModelForImageTextToText.from_pretrained(
LFM_450_MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16,
).eval()
lfm_450_processor = AutoProcessor.from_pretrained(LFM_450_MODEL_NAME)
print("LFM-450M model loaded successfully.")
except Exception as e:
print(f"Warning: LFM-450M model loading failed. Error: {e}")
lfm_450_model = None
lfm_450_processor = None
# ── Gemma4-E2B-it ───────────────────────────────────────
print(f"Loading Gemma4-E2B-it: {GEMMA4_E2B_NAME} on {DEVICE}...")
try:
gemma4_e2b_model = Gemma4ForConditionalGeneration.from_pretrained(
GEMMA4_E2B_NAME, torch_dtype=torch.bfloat16,
device_map="auto" if torch.cuda.is_available() else None,
).eval()
if not torch.cuda.is_available():
gemma4_e2b_model = gemma4_e2b_model.to(DEVICE)
gemma4_e2b_processor = AutoProcessor.from_pretrained(GEMMA4_E2B_NAME)
print("Gemma4-E2B-it model loaded successfully.")
except Exception as e:
print(f"Warning: Gemma4-E2B-it model loading failed. Error: {e}")
gemma4_e2b_model = None
gemma4_e2b_processor = None
# ── LFM2.5-VL-1.6B ──────────────────────────────────────
print(f"Loading LFM-1.6B model: {LFM_16_MODEL_NAME} on {DEVICE}...")
try:
lfm_16_model = AutoModelForImageTextToText.from_pretrained(
LFM_16_MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16,
).eval()
lfm_16_processor = AutoProcessor.from_pretrained(LFM_16_MODEL_NAME)
print("LFM-1.6B model loaded successfully.")
except Exception as e:
print(f"Warning: LFM-1.6B model loading failed. Error: {e}")
lfm_16_model = None
lfm_16_processor = None
# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
print(f"Loading Qwen3.5-2B-Unredacted-MAX: {QWEN_UNREDACTED_NAME} on {DEVICE}...")
try:
qwen_unredacted_model = Qwen3_5ForConditionalGeneration.from_pretrained(
QWEN_UNREDACTED_NAME, torch_dtype=DTYPE, device_map=DEVICE,
).eval()
qwen_unredacted_processor = AutoProcessor.from_pretrained(QWEN_UNREDACTED_NAME)
print("Qwen3.5-2B-Unredacted-MAX model loaded successfully.")
except Exception as e:
print(f"Warning: Qwen3.5-2B-Unredacted-MAX model loading failed. Error: {e}")
qwen_unredacted_model = None
qwen_unredacted_processor = None
# ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
print(f"Loading Qwen2.5-VL-3B-Instruct: {QWEN25_VL_3B_NAME} on {DEVICE}...")
try:
qwen25_vl_3b_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
QWEN25_VL_3B_NAME, torch_dtype="auto", device_map="auto",
).eval()
qwen25_vl_3b_processor = AutoProcessor.from_pretrained(QWEN25_VL_3B_NAME)
print("Qwen2.5-VL-3B-Instruct model loaded successfully.")
except Exception as e:
print(f"Warning: Qwen2.5-VL-3B-Instruct model loading failed. Error: {e}")
qwen25_vl_3b_model = None
qwen25_vl_3b_processor = None
# ---------------------------------------------------------------------------
# Utility: safe JSON parser (strips markdown fences, handles ast fallback)
# ---------------------------------------------------------------------------
def safe_parse_json(text: str):
text = text.strip()
    # strip <think>...</think> reasoning blocks, if present
    text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE).strip()
text = re.sub(r"^```(json)?", "", text)
text = re.sub(r"```$", "", text)
text = text.strip()
try:
return json.loads(text)
except json.JSONDecodeError:
pass
try:
return ast.literal_eval(text)
except Exception:
pass
# Try to find the first JSON array or object in the text
for pattern in [r'\[[\s\S]*\]', r'\{[\s\S]*\}']:
m = re.search(pattern, text)
if m:
try:
return json.loads(m.group())
except Exception:
pass
return None
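# A quick sketch of the parser's behavior on typical (hypothetical) model output:
#   safe_parse_json('```json\n[{"label": "cat", "bbox_2d": [12, 34, 560, 780]}]\n```')
#   -> [{'label': 'cat', 'bbox_2d': [12, 34, 560, 780]}]
# Unparseable text returns None, so callers can surface the raw output instead.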
# ---------------------------------------------------------------------------
# Server-side annotation (mirrors reference annotate_image exactly)
# ---------------------------------------------------------------------------
PALETTE_COLORS = [
(78, 205, 196), # teal
(124, 106, 247), # purple
(255, 107, 107), # red
(255, 217, 61), # yellow
(107, 203, 119), # green
(255, 146, 43), # orange
(204, 93, 232), # magenta
(51, 154, 240), # blue
]
def _get_font(size: int = 14):
"""Try to load a truetype font, fall back to default."""
for font_name in ["arial.ttf", "Arial.ttf", "DejaVuSans.ttf",
"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
"/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf"]:
try:
return ImageFont.truetype(font_name, size)
except (IOError, OSError):
continue
return ImageFont.load_default()
def annotate_detections(image: Image.Image, objects: list) -> Image.Image:
"""
Draw bounding boxes + labels on image.
objects: list of {label, x_min, y_min, x_max, y_max} (all coords 0-1 fractions)
"""
image = image.convert("RGB").copy()
W, H = image.size
draw = ImageDraw.Draw(image, "RGBA")
font_lbl = _get_font(max(12, W // 40))
for i, obj in enumerate(objects):
col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
col_rgba_fill = col + (46,) # ~18% opacity fill
col_rgba_solid = col + (255,)
x1 = int(obj["x_min"] * W)
y1 = int(obj["y_min"] * H)
x2 = int(obj["x_max"] * W)
y2 = int(obj["y_max"] * H)
# clamp
x1, x2 = max(0, x1), min(W, x2)
y1, y2 = max(0, y1), min(H, y2)
if x2 <= x1 or y2 <= y1:
continue
# Filled rectangle
draw.rectangle([x1, y1, x2, y2], fill=col_rgba_fill)
# Border (draw 2px by drawing twice)
lw = max(2, W // 200)
for t in range(lw):
draw.rectangle([x1+t, y1+t, x2-t, y2-t], outline=col_rgba_solid)
# Corner accents
ca = min(18, (x2-x1)//4, (y2-y1)//4)
cw = max(2, lw + 1)
for (cx, cy, dx, dy) in [(x1,y1,1,1),(x2,y1,-1,1),(x2,y2,-1,-1),(x1,y2,1,-1)]:
draw.line([cx, cy, cx+dx*ca, cy], fill=col_rgba_solid, width=cw)
draw.line([cx, cy, cx, cy+dy*ca], fill=col_rgba_solid, width=cw)
# Label pill
label = obj.get("label", "object")
try:
bb = font_lbl.getbbox(label)
tw, th = bb[2]-bb[0], bb[3]-bb[1]
except Exception:
tw, th = len(label)*7, 12
pad = 5
pw, ph = tw + pad*2, th + pad*2
lx = max(0, min(x1, W - pw))
        ly = y1 - ph if y1 - ph >= 0 else y1 + 2
draw.rounded_rectangle([lx, ly, lx+pw, ly+ph], radius=4, fill=col_rgba_solid)
draw.text((lx+pad, ly+pad), label, fill=(255,255,255,255), font=font_lbl)
return image
def annotate_points(image: Image.Image, points: list) -> Image.Image:
"""
Draw point markers + labels on image.
points: list of {label, x, y} (coords 0-1 fractions)
"""
image = image.convert("RGB").copy()
W, H = image.size
draw = ImageDraw.Draw(image, "RGBA")
font_lbl = _get_font(max(12, W // 40))
r = max(7, W // 55)
for i, pt in enumerate(points):
col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
col_rgba = col + (255,)
glow_rgba = col + (40,)
mid_rgba = col + (64,)
cx = int(pt["x"] * W)
cy = int(pt["y"] * H)
cx = max(r, min(W-r, cx))
cy = max(r, min(H-r, cy))
# Outer glow
draw.ellipse([cx-r*2, cy-r*2, cx+r*2, cy+r*2], fill=glow_rgba)
# Mid ring
draw.ellipse([cx-int(r*1.4), cy-int(r*1.4), cx+int(r*1.4), cy+int(r*1.4)], fill=mid_rgba)
# Core dot
draw.ellipse([cx-r, cy-r, cx+r, cy+r], fill=col_rgba, outline=(255,255,255,255), width=max(2,r//3))
# Centre white dot
cr = max(2, r//3)
draw.ellipse([cx-cr, cy-cr, cx+cr, cy+cr], fill=(255,255,255,255))
# Label
label = pt.get("label", "")
if label:
try:
bb = font_lbl.getbbox(label)
tw, th = bb[2]-bb[0], bb[3]-bb[1]
except Exception:
tw, th = len(label)*7, 12
pad = 5
pw, ph = tw + pad*2, th + pad*2
lx = min(cx + r + 6, W - pw)
ly = max(0, cy - ph//2)
draw.rounded_rectangle([lx, ly, lx+pw, ly+ph], radius=4, fill=col_rgba)
draw.text((lx+pad, ly+pad), label, fill=(255,255,255,255), font=font_lbl)
return image
def parse_and_annotate(image: Image.Image, full_text: str, category: str):
"""
Parse model output and return annotated PIL image + structured result dict.
Mirrors the reference code logic exactly.
"""
parsed = safe_parse_json(full_text)
if parsed is None:
return image, {"error": "No JSON found in model output", "raw": full_text[:500]}
if category == "Point":
result = {"points": []}
items = parsed if isinstance(parsed, list) else [parsed]
for item in items:
if isinstance(item, dict) and "point_2d" in item:
coords = item["point_2d"]
if isinstance(coords, (list, tuple)) and len(coords) == 2:
x, y = float(coords[0]), float(coords[1])
# Reference divides by 1000.0 — Qwen uses 0-1000 scale
result["points"].append({
"label": item.get("label", ""),
"x": x / 1000.0,
"y": y / 1000.0,
})
annotated = annotate_points(image.copy(), result["points"])
return annotated, result
elif category == "Detect":
result = {"objects": []}
items = parsed if isinstance(parsed, list) else [parsed]
for item in items:
if isinstance(item, dict) and "bbox_2d" in item:
coords = item["bbox_2d"]
if isinstance(coords, (list, tuple)) and len(coords) == 4:
xmin, ymin, xmax, ymax = [float(v) for v in coords]
result["objects"].append({
"label": item.get("label", "object"),
"x_min": xmin / 1000.0,
"y_min": ymin / 1000.0,
"x_max": xmax / 1000.0,
"y_max": ymax / 1000.0,
})
annotated = annotate_detections(image.copy(), result["objects"])
return annotated, result
return image, {}
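# Example of the coordinate convention assumed above: a Detect item such as
#   {"label": "dog", "bbox_2d": [100, 200, 800, 950]}
# on Qwen's 0-1000 grid maps to fractional coords (0.1, 0.2, 0.8, 0.95), which
# annotate_detections then scales to the image's pixel dimensions.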
def pil_to_png_bytes(image: Image.Image) -> bytes:
buf = io.BytesIO()
image.save(buf, format="PNG")
return buf.getvalue()
# ---------------------------------------------------------------------------
# Inference Generator (Streaming)
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120)
def generate_inference_stream(
image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
):
if category == "Query":
full_prompt = prompt
elif category == "Caption":
full_prompt = f"Provide a {prompt} length caption for the image."
elif category == "Point":
full_prompt = f"Provide 2d point coordinates for {prompt}. Report in JSON format."
elif category == "Detect":
full_prompt = f"Provide bounding box coordinates for {prompt}. Report in JSON format."
else:
full_prompt = prompt
# ── Qwen3-VL-2B ─────────────────────────────────────
if model_id == "qwen_vl_2b":
if qwen_vl_2b_model is None or qwen_vl_2b_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-2B model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
messages = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
text_input = qwen_vl_2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = qwen_vl_2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_vl_2b_model.device)
streamer = TextIteratorStreamer(qwen_vl_2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=qwen_vl_2b_model.generate,
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
# ── Qwen3-VL-4B ─────────────────────────────────────
elif model_id == "qwen_vl_4b":
if qwen_vl_4b_model is None or qwen_vl_4b_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL-4B model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
messages = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
text_input = qwen_vl_4b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = qwen_vl_4b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_vl_4b_model.device)
streamer = TextIteratorStreamer(qwen_vl_4b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=qwen_vl_4b_model.generate,
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────
elif model_id == "qwen_4b_unredacted":
if qwen_4b_unredacted_model is None or qwen_4b_unredacted_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B-Unredacted-MAX model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
messages = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
text_input = qwen_4b_unredacted_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = qwen_4b_unredacted_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_4b_unredacted_model.device)
streamer = TextIteratorStreamer(qwen_4b_unredacted_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=qwen_4b_unredacted_model.generate,
            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, do_sample=True, temperature=1.5, min_p=0.1))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
# ── Qwen3.5-4B ──────────────────────────────────────
elif model_id == "qwen_4b":
if qwen_4b_model is None or qwen_4b_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
messages = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
text_input = qwen_4b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = qwen_4b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_4b_model.device)
streamer = TextIteratorStreamer(qwen_4b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=qwen_4b_model.generate,
            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, do_sample=True, temperature=1.5, min_p=0.1))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
# ── Qwen3.5-2B ──────────────────────────────────────
elif model_id == "qwen_2b":
if qwen_2b_model is None or qwen_2b_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
messages = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
text_input = qwen_2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = qwen_2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_2b_model.device)
streamer = TextIteratorStreamer(qwen_2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=qwen_2b_model.generate,
            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, do_sample=True, temperature=1.5, min_p=0.1))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
# ── LFM-450M ────────────────────────────────────────
elif model_id == "lfm_450":
if lfm_450_model is None or lfm_450_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] LFM-450M model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
conversation = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
inputs = lfm_450_processor.apply_chat_template(
conversation, add_generation_prompt=True,
return_tensors="pt", return_dict=True, tokenize=True,
).to(lfm_450_model.device)
streamer = TextIteratorStreamer(lfm_450_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=lfm_450_model.generate,
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
# ── Gemma4-E2B-it ───────────────────────────────────
elif model_id == "gemma4_e2b":
if gemma4_e2b_model is None or gemma4_e2b_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] Gemma4-E2B-it model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
messages = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
text_input = gemma4_e2b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = gemma4_e2b_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(gemma4_e2b_model.device)
streamer = TextIteratorStreamer(gemma4_e2b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=gemma4_e2b_model.generate,
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
# ── LFM-1.6B ────────────────────────────────────────
elif model_id == "lfm_16":
if lfm_16_model is None or lfm_16_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] LFM-1.6B model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
conversation = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
inputs = lfm_16_processor.apply_chat_template(
conversation, add_generation_prompt=True,
return_tensors="pt", return_dict=True, tokenize=True,
).to(lfm_16_model.device)
streamer = TextIteratorStreamer(lfm_16_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=lfm_16_model.generate,
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────
elif model_id == "qwen_unredacted":
if qwen_unredacted_model is None or qwen_unredacted_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B-Unredacted-MAX model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
messages = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
text_input = qwen_unredacted_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = qwen_unredacted_processor(text=[text_input], images=[image], return_tensors="pt", padding=True).to(qwen_unredacted_model.device)
streamer = TextIteratorStreamer(qwen_unredacted_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=qwen_unredacted_model.generate,
            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, do_sample=True, temperature=1.5, min_p=0.1))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
# ── Qwen2.5-VL-3B-Instruct ──────────────────────────
elif model_id == "qwen25_vl_3b":
if qwen25_vl_3b_model is None or qwen25_vl_3b_processor is None:
yield f"data: {json.dumps({'chunk': '[Error] Qwen2.5-VL-3B-Instruct model not loaded.'})}\n\n"
yield "data: [DONE]\n\n"; return
messages = [{"role": "user", "content": [
{"type": "image", "image": image}, {"type": "text", "text": full_prompt},
]}]
text_input = qwen25_vl_3b_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = qwen25_vl_3b_processor(
text=[text_input], images=image_inputs, videos=video_inputs,
return_tensors="pt", padding=True,
).to(qwen25_vl_3b_model.device)
streamer = TextIteratorStreamer(qwen25_vl_3b_processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=120)
thread = threading.Thread(target=qwen25_vl_3b_model.generate,
kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024, use_cache=True, temperature=1.0, do_sample=True))
thread.start()
for tok in streamer:
if tok: yield f"data: {json.dumps({'chunk': tok})}\n\n"
thread.join()
yield "data: [DONE]\n\n"
# ---------------------------------------------------------------------------
# New endpoint: /api/annotate — receives image + model output text + category
# Returns annotated PNG + structured JSON
# ---------------------------------------------------------------------------
@app.post("/api/annotate")
async def annotate_endpoint(
image: UploadFile = File(...),
text: str = Form(...),
category: str = Form(...),
):
try:
img_bytes = await image.read()
img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
annotated_img, result_dict = parse_and_annotate(img, text, category)
png_bytes = pil_to_png_bytes(annotated_img)
return JSONResponse({
"image_b64": __import__("base64").b64encode(png_bytes).decode(),
"result": result_dict,
})
except Exception as e:
return JSONResponse({"error": str(e)}, status_code=500)
# ---------------------------------------------------------------------------
# Main inference endpoint
# ---------------------------------------------------------------------------
@app.post("/api/run")
async def run_inference(
image: UploadFile = File(...),
category: str = Form(...),
prompt: str = Form(...),
model_id: str = Form("qwen_vl_2b"),
):
try:
img_bytes = await image.read()
img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
img.thumbnail((512, 512))
return StreamingResponse(
generate_inference_stream(img, category, prompt, model_id),
media_type="text/event-stream",
)
except Exception as e:
return JSONResponse({"error": str(e)}, status_code=500)
# ---------------------------------------------------------------------------
# Frontend
# ---------------------------------------------------------------------------
@app.get("/", response_class=HTMLResponse)
async def homepage(request: Request):
return r"""
Multimodal-Edge-Comparator
MULTIMODAL EDGE
|
Node-Based Inference Canvas
10x Vision Models
QWEN3-VL · 2B
Qwen3-VL-2B-Instruct — dedicated vision-language model by Alibaba Cloud.
Strong spatial grounding, OCR & instruction-following.
Results will stream here...
Active for Point / Detect tasks.
Run inference to visualise.
"""
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)  # 7860 is the conventional HF Spaces port