"""Multimodal-Edge-Comparator: FastAPI server comparing small vision-language models.

Loads up to ten VLM/LLM checkpoints at startup (each load is best-effort — a
failed load leaves that model disabled rather than crashing the app), then
exposes:

* ``POST /api/run``      — stream model output for an uploaded image as SSE.
* ``POST /api/annotate`` — parse Point/Detect JSON output and return an
  annotated PNG (base64) plus the structured result.
* ``GET  /``             — the HTML frontend.
"""

import ast
import base64
import io
import json
import os
import re
import threading
import uuid
from pathlib import Path
from typing import Optional

import numpy as np
import spaces
import torch
from fastapi import FastAPI, File, Form, Request, UploadFile
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
from PIL import Image, ImageDraw, ImageFont
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    Gemma4ForConditionalGeneration,
    Qwen2_5_VLForConditionalGeneration,
    Qwen3_5ForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
    TextIteratorStreamer,
)
from qwen_vl_utils import process_vision_info

# --- App Configuration & Initialization --- #
# NOTE(review): the original did ``from gradio import Server`` / ``app = Server()``,
# but gradio exposes no ``Server`` class, and every route below uses FastAPI
# decorators and parameter types (UploadFile/File/Form) — this must be FastAPI.
app = FastAPI()

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Prefer bfloat16 when the GPU supports it; float16 otherwise (also the CPU
# fallback, matching the original behavior).
DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

QWEN_VL_2B_MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"
QWEN_VL_4B_MODEL_NAME = "Qwen/Qwen3-VL-4B-Instruct"
QWEN_4B_UNREDACTED_NAME = "prithivMLmods/Qwen3.5-4B-Unredacted-MAX"
QWEN_4B_MODEL_NAME = "Qwen/Qwen3.5-4B"
QWEN_2B_MODEL_NAME = "Qwen/Qwen3.5-2B"
LFM_450_MODEL_NAME = "LiquidAI/LFM2.5-VL-450M"
GEMMA4_E2B_NAME = "google/gemma-4-E2B-it"
LFM_16_MODEL_NAME = "LiquidAI/LFM2.5-VL-1.6B"
QWEN_UNREDACTED_NAME = "prithivMLmods/Qwen3.5-2B-Unredacted-MAX"
QWEN25_VL_3B_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"


def _load_model(label, model_cls, model_name, *, to_device=False,
                processor_kwargs=None, **model_kwargs):
    """Load a (model, processor) pair best-effort.

    Returns (None, None) on any failure so the app can run with whatever
    subset of models actually loaded (mirrors the original per-model
    try/except blocks).

    label             -- human-readable name used in log messages.
    model_cls         -- transformers model class with ``from_pretrained``.
    model_name        -- HF Hub repo id.
    to_device         -- move the model to DEVICE after loading (used when
                         ``device_map`` is not handling placement).
    processor_kwargs  -- extra kwargs for ``AutoProcessor.from_pretrained``.
    model_kwargs      -- extra kwargs for ``model_cls.from_pretrained``.
    """
    print(f"Loading {label}: {model_name} on {DEVICE}...")
    try:
        model = model_cls.from_pretrained(model_name, **model_kwargs)
        if to_device:
            model = model.to(DEVICE)
        model = model.eval()
        processor = AutoProcessor.from_pretrained(model_name, **(processor_kwargs or {}))
        print(f"{label} model loaded successfully.")
        return model, processor
    except Exception as e:  # broad by design: any load failure disables the model
        print(f"Warning: {label} model loading failed. Error: {e}")
        return None, None


# ── Qwen3-VL-2B-Instruct ────────────────────────────────
qwen_vl_2b_model, qwen_vl_2b_processor = _load_model(
    "Qwen3-VL-2B", Qwen3VLForConditionalGeneration, QWEN_VL_2B_MODEL_NAME,
    to_device=True,
    processor_kwargs={"trust_remote_code": True},
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# ── Qwen3-VL-4B-Instruct ────────────────────────────────
qwen_vl_4b_model, qwen_vl_4b_processor = _load_model(
    "Qwen3-VL-4B", Qwen3VLForConditionalGeneration, QWEN_VL_4B_MODEL_NAME,
    to_device=True,
    processor_kwargs={"trust_remote_code": True},
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
qwen_4b_unredacted_model, qwen_4b_unredacted_processor = _load_model(
    "Qwen3.5-4B-Unredacted-MAX", Qwen3_5ForConditionalGeneration,
    QWEN_4B_UNREDACTED_NAME,
    torch_dtype=DTYPE, device_map=DEVICE,
)

# ── Qwen3.5-4B ──────────────────────────────────────────
qwen_4b_model, qwen_4b_processor = _load_model(
    "Qwen3.5-4B", Qwen3_5ForConditionalGeneration, QWEN_4B_MODEL_NAME,
    torch_dtype=DTYPE, device_map=DEVICE,
)

# ── Qwen3.5-2B ──────────────────────────────────────────
qwen_2b_model, qwen_2b_processor = _load_model(
    "Qwen3.5-2B", Qwen3_5ForConditionalGeneration, QWEN_2B_MODEL_NAME,
    torch_dtype=DTYPE, device_map=DEVICE,
)

# ── LFM2.5-VL-450M ──────────────────────────────────────
lfm_450_model, lfm_450_processor = _load_model(
    "LFM-450M", AutoModelForImageTextToText, LFM_450_MODEL_NAME,
    device_map="auto", torch_dtype=torch.bfloat16,
)

# ── Gemma4-E2B-it ───────────────────────────────────────
# On CPU the original skipped device_map and moved the model explicitly.
gemma4_e2b_model, gemma4_e2b_processor = _load_model(
    "Gemma4-E2B-it", Gemma4ForConditionalGeneration, GEMMA4_E2B_NAME,
    to_device=not torch.cuda.is_available(),
    torch_dtype=torch.bfloat16,
    device_map="auto" if torch.cuda.is_available() else None,
)

# ── LFM2.5-VL-1.6B ──────────────────────────────────────
lfm_16_model, lfm_16_processor = _load_model(
    "LFM-1.6B", AutoModelForImageTextToText, LFM_16_MODEL_NAME,
    device_map="auto", torch_dtype=torch.bfloat16,
)

# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
qwen_unredacted_model, qwen_unredacted_processor = _load_model(
    "Qwen3.5-2B-Unredacted-MAX", Qwen3_5ForConditionalGeneration,
    QWEN_UNREDACTED_NAME,
    torch_dtype=DTYPE, device_map=DEVICE,
)

# ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
qwen25_vl_3b_model, qwen25_vl_3b_processor = _load_model(
    "Qwen2.5-VL-3B-Instruct", Qwen2_5_VLForConditionalGeneration,
    QWEN25_VL_3B_NAME,
    torch_dtype="auto", device_map="auto",
)


# ---------------------------------------------------------------------------
# Utility: safe JSON parser (strips markdown fences, handles ast fallback)
# ---------------------------------------------------------------------------
def safe_parse_json(text: str):
    """Best-effort parse of model output into a Python object.

    Tries, in order: strip reasoning tags and markdown fences, then
    ``json.loads``, then ``ast.literal_eval``, then the first JSON array or
    object found anywhere in the text. Returns None when nothing parses.
    """
    text = text.strip()
    # Strip chain-of-thought blocks emitted by reasoning models.
    # NOTE(review): the original pattern was the no-op r"[\s\S]*?" with a bare
    # "# strip" comment — the <think>...</think> tags were almost certainly
    # lost to markup stripping. Confirm against version control.
    text = re.sub(r"<think>[\s\S]*?</think>", "", text, flags=re.IGNORECASE).strip()
    # Strip a leading/trailing markdown code fence (``` or ```json).
    text = re.sub(r"^```(json)?", "", text)
    text = re.sub(r"```$", "", text)
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    try:
        # Handles Python-literal style output (single quotes, tuples, ...).
        return ast.literal_eval(text)
    except Exception:
        pass
    # Try to find the first JSON array or object in the text.
    for pattern in [r'\[[\s\S]*\]', r'\{[\s\S]*\}']:
        m = re.search(pattern, text)
        if m:
            try:
                return json.loads(m.group())
            except Exception:
                pass
    return None


# ---------------------------------------------------------------------------
# Server-side annotation (mirrors reference annotate_image exactly)
# ---------------------------------------------------------------------------
PALETTE_COLORS = [
    (78, 205, 196),   # teal
    (124, 106, 247),  # purple
    (255, 107, 107),  # red
    (255, 217, 61),   # yellow
    (107, 203, 119),  # green
    (255, 146, 43),   # orange
    (204, 93, 232),   # magenta
    (51, 154, 240),   # blue
]


def _get_font(size: int = 14):
    """Try to load a truetype font, fall back to default."""
    for font_name in ["arial.ttf", "Arial.ttf", "DejaVuSans.ttf",
                      "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
                      "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf"]:
        try:
            return ImageFont.truetype(font_name, size)
        except (IOError, OSError):
            continue
    return ImageFont.load_default()


def annotate_detections(image: Image.Image, objects: list) -> Image.Image:
    """Draw bounding boxes + labels on image.

    objects: list of {label, x_min, y_min, x_max, y_max}
    (all coords 0-1 fractions of image size).
    """
    image = image.convert("RGB").copy()
    W, H = image.size
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, W // 40))
    for i, obj in enumerate(objects):
        col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
        col_rgba_fill = col + (46,)  # ~18% opacity fill
        col_rgba_solid = col + (255,)
        x1 = int(obj["x_min"] * W)
        y1 = int(obj["y_min"] * H)
        x2 = int(obj["x_max"] * W)
        y2 = int(obj["y_max"] * H)
        # Clamp to the image bounds; skip degenerate boxes.
        x1, x2 = max(0, x1), min(W, x2)
        y1, y2 = max(0, y1), min(H, y2)
        if x2 <= x1 or y2 <= y1:
            continue
        # Filled rectangle
        draw.rectangle([x1, y1, x2, y2], fill=col_rgba_fill)
        # Border (draw lw px by drawing concentric 1px rectangles)
        lw = max(2, W // 200)
        for t in range(lw):
            draw.rectangle([x1 + t, y1 + t, x2 - t, y2 - t], outline=col_rgba_solid)
        # Corner accents
        ca = min(18, (x2 - x1) // 4, (y2 - y1) // 4)
        cw = max(2, lw + 1)
        for (cx, cy, dx, dy) in [(x1, y1, 1, 1), (x2, y1, -1, 1),
                                 (x2, y2, -1, -1), (x1, y2, 1, -1)]:
            draw.line([cx, cy, cx + dx * ca, cy], fill=col_rgba_solid, width=cw)
            draw.line([cx, cy, cx, cy + dy * ca], fill=col_rgba_solid, width=cw)
        # Label pill
        label = obj.get("label", "object")
        try:
            bb = font_lbl.getbbox(label)
            tw, th = bb[2] - bb[0], bb[3] - bb[1]
        except Exception:
            tw, th = len(label) * 7, 12  # crude fallback for default fonts
        pad = 5
        pw, ph = tw + pad * 2, th + pad * 2
        lx = max(0, min(x1, W - pw))
        # Place above the box when there is room, else just inside it.
        ly = max(0, y1 - ph) if y1 - ph >= 0 else y1 + 2
        draw.rounded_rectangle([lx, ly, lx + pw, ly + ph], radius=4, fill=col_rgba_solid)
        draw.text((lx + pad, ly + pad), label, fill=(255, 255, 255, 255), font=font_lbl)
    return image


def annotate_points(image: Image.Image, points: list) -> Image.Image:
    """Draw point markers + labels on image.

    points: list of {label, x, y} (coords 0-1 fractions of image size).
    """
    image = image.convert("RGB").copy()
    W, H = image.size
    draw = ImageDraw.Draw(image, "RGBA")
    font_lbl = _get_font(max(12, W // 40))
    r = max(7, W // 55)
    for i, pt in enumerate(points):
        col = PALETTE_COLORS[i % len(PALETTE_COLORS)]
        col_rgba = col + (255,)
        glow_rgba = col + (40,)
        mid_rgba = col + (64,)
        cx = int(pt["x"] * W)
        cy = int(pt["y"] * H)
        # Keep the whole marker inside the image.
        cx = max(r, min(W - r, cx))
        cy = max(r, min(H - r, cy))
        # Outer glow
        draw.ellipse([cx - r * 2, cy - r * 2, cx + r * 2, cy + r * 2], fill=glow_rgba)
        # Mid ring
        draw.ellipse([cx - int(r * 1.4), cy - int(r * 1.4),
                      cx + int(r * 1.4), cy + int(r * 1.4)], fill=mid_rgba)
        # Core dot
        draw.ellipse([cx - r, cy - r, cx + r, cy + r], fill=col_rgba,
                     outline=(255, 255, 255, 255), width=max(2, r // 3))
        # Centre white dot
        cr = max(2, r // 3)
        draw.ellipse([cx - cr, cy - cr, cx + cr, cy + cr], fill=(255, 255, 255, 255))
        # Label
        label = pt.get("label", "")
        if label:
            try:
                bb = font_lbl.getbbox(label)
                tw, th = bb[2] - bb[0], bb[3] - bb[1]
            except Exception:
                tw, th = len(label) * 7, 12
            pad = 5
            pw, ph = tw + pad * 2, th + pad * 2
            lx = min(cx + r + 6, W - pw)
            ly = max(0, cy - ph // 2)
            draw.rounded_rectangle([lx, ly, lx + pw, ly + ph], radius=4, fill=col_rgba)
            draw.text((lx + pad, ly + pad), label, fill=(255, 255, 255, 255), font=font_lbl)
    return image


def parse_and_annotate(image: Image.Image, full_text: str, category: str):
    """Parse model output and return (annotated PIL image, structured result dict).

    Mirrors the reference code logic exactly. Unknown categories return the
    untouched image and an empty dict.
    """
    parsed = safe_parse_json(full_text)
    if parsed is None:
        return image, {"error": "No JSON found in model output", "raw": full_text[:500]}
    if category == "Point":
        result = {"points": []}
        items = parsed if isinstance(parsed, list) else [parsed]
        for item in items:
            if isinstance(item, dict) and "point_2d" in item:
                coords = item["point_2d"]
                if isinstance(coords, (list, tuple)) and len(coords) == 2:
                    x, y = float(coords[0]), float(coords[1])
                    # Reference divides by 1000.0 — Qwen uses 0-1000 scale
                    result["points"].append({
                        "label": item.get("label", ""),
                        "x": x / 1000.0,
                        "y": y / 1000.0,
                    })
        annotated = annotate_points(image.copy(), result["points"])
        return annotated, result
    elif category == "Detect":
        result = {"objects": []}
        items = parsed if isinstance(parsed, list) else [parsed]
        for item in items:
            if isinstance(item, dict) and "bbox_2d" in item:
                coords = item["bbox_2d"]
                if isinstance(coords, (list, tuple)) and len(coords) == 4:
                    xmin, ymin, xmax, ymax = [float(v) for v in coords]
                    result["objects"].append({
                        "label": item.get("label", "object"),
                        "x_min": xmin / 1000.0,
                        "y_min": ymin / 1000.0,
                        "x_max": xmax / 1000.0,
                        "y_max": ymax / 1000.0,
                    })
        annotated = annotate_detections(image.copy(), result["objects"])
        return annotated, result
    return image, {}


def pil_to_png_bytes(image: Image.Image) -> bytes:
    """Serialize a PIL image to PNG bytes."""
    buf = io.BytesIO()
    image.save(buf, format="PNG")
    return buf.getvalue()


# ---------------------------------------------------------------------------
# Inference Generator (Streaming)
# ---------------------------------------------------------------------------
def _build_prompt(category: str, prompt: str) -> str:
    """Expand the task category into the full instruction sent to the model."""
    if category == "Caption":
        return f"Provide a {prompt} length caption for the image."
    if category == "Point":
        return f"Provide 2d point coordinates for {prompt}. Report in JSON format."
    if category == "Detect":
        return f"Provide bounding box coordinates for {prompt}. Report in JSON format."
    # "Query" and anything unknown: use the prompt verbatim.
    return prompt


def _build_inputs(style: str, processor, model, image: Image.Image, full_prompt: str):
    """Build model-ready tensors for one user turn (image + text).

    style:
      "chat"   — chat template to text, then processor(text, images=[image]).
      "lfm"    — LFM processors tokenize directly via apply_chat_template.
      "qwen25" — Qwen2.5-VL path via qwen_vl_utils.process_vision_info.
    """
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": full_prompt},
    ]}]
    if style == "lfm":
        return processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            tokenize=True,
        ).to(model.device)
    text_input = processor.apply_chat_template(messages, tokenize=False,
                                               add_generation_prompt=True)
    if style == "qwen25":
        image_inputs, video_inputs = process_vision_info(messages)
        return processor(
            text=[text_input],
            images=image_inputs,
            videos=video_inputs,
            return_tensors="pt",
            padding=True,
        ).to(model.device)
    return processor(text=[text_input], images=[image],
                     return_tensors="pt", padding=True).to(model.device)


def _stream_sse(model, processor, inputs, gen_kwargs: dict):
    """Run model.generate on a worker thread and yield SSE-framed token chunks."""
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True,
                                    skip_special_tokens=True, timeout=120)
    thread = threading.Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
                    use_cache=True, **gen_kwargs),
    )
    thread.start()
    for tok in streamer:
        if tok:
            yield f"data: {json.dumps({'chunk': tok})}\n\n"
    thread.join()


# model_id -> (error label, model, processor, input style, extra generate kwargs).
# Built after loading so entries capture whatever actually loaded (or None).
_SAMPLING = {"temperature": 1.0, "do_sample": True}
_UNREDACTED_SAMPLING = {"temperature": 1.5, "min_p": 0.1}
_MODEL_REGISTRY = {
    "qwen_vl_2b": ("Qwen3-VL-2B", qwen_vl_2b_model, qwen_vl_2b_processor, "chat", _SAMPLING),
    "qwen_vl_4b": ("Qwen3-VL-4B", qwen_vl_4b_model, qwen_vl_4b_processor, "chat", _SAMPLING),
    "qwen_4b_unredacted": ("Qwen3.5-4B-Unredacted-MAX", qwen_4b_unredacted_model,
                           qwen_4b_unredacted_processor, "chat", _UNREDACTED_SAMPLING),
    "qwen_4b": ("Qwen3.5-4B", qwen_4b_model, qwen_4b_processor, "chat", _UNREDACTED_SAMPLING),
    "qwen_2b": ("Qwen3.5-2B", qwen_2b_model, qwen_2b_processor, "chat", _UNREDACTED_SAMPLING),
    "lfm_450": ("LFM-450M", lfm_450_model, lfm_450_processor, "lfm", {}),
    "gemma4_e2b": ("Gemma4-E2B-it", gemma4_e2b_model, gemma4_e2b_processor, "chat", _SAMPLING),
    "lfm_16": ("LFM-1.6B", lfm_16_model, lfm_16_processor, "lfm", {}),
    "qwen_unredacted": ("Qwen3.5-2B-Unredacted-MAX", qwen_unredacted_model,
                        qwen_unredacted_processor, "chat", _UNREDACTED_SAMPLING),
    "qwen25_vl_3b": ("Qwen2.5-VL-3B-Instruct", qwen25_vl_3b_model,
                     qwen25_vl_3b_processor, "qwen25", _SAMPLING),
}


@spaces.GPU(duration=120)
def generate_inference_stream(
    image: Image.Image,
    category: str,
    prompt: str,
    model_id: str = "qwen_vl_2b"
):
    """Yield SSE events streaming one model's answer, ending with '[DONE]'.

    An unknown model_id yields only the terminal '[DONE]' event (matching the
    original fall-through); a known-but-unloaded model yields an '[Error] ...'
    chunk first.
    """
    full_prompt = _build_prompt(category, prompt)
    entry = _MODEL_REGISTRY.get(model_id)
    if entry is not None:
        label, model, processor, style, gen_kwargs = entry
        if model is None or processor is None:
            yield f"data: {json.dumps({'chunk': f'[Error] {label} model not loaded.'})}\n\n"
            yield "data: [DONE]\n\n"
            return
        inputs = _build_inputs(style, processor, model, image, full_prompt)
        yield from _stream_sse(model, processor, inputs, gen_kwargs)
    yield "data: [DONE]\n\n"


# ---------------------------------------------------------------------------
# Endpoint: /api/annotate — receives image + model output text + category.
# Returns annotated PNG (base64) + structured JSON.
# ---------------------------------------------------------------------------
@app.post("/api/annotate")
async def annotate_endpoint(
    image: UploadFile = File(...),
    text: str = Form(...),
    category: str = Form(...),
):
    try:
        img_bytes = await image.read()
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        annotated_img, result_dict = parse_and_annotate(img, text, category)
        png_bytes = pil_to_png_bytes(annotated_img)
        return JSONResponse({
            "image_b64": base64.b64encode(png_bytes).decode(),
            "result": result_dict,
        })
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


# ---------------------------------------------------------------------------
# Main inference endpoint
# ---------------------------------------------------------------------------
@app.post("/api/run")
async def run_inference(
    image: UploadFile = File(...),
    category: str = Form(...),
    prompt: str = Form(...),
    model_id: str = Form("qwen_vl_2b"),
):
    try:
        img_bytes = await image.read()
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        # Bound the vision-token budget; thumbnail preserves aspect ratio.
        img.thumbnail((512, 512))
        return StreamingResponse(
            generate_inference_stream(img, category, prompt, model_id),
            media_type="text/event-stream",
        )
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


# ---------------------------------------------------------------------------
# Frontend
# ---------------------------------------------------------------------------
@app.get("/", response_class=HTMLResponse)
async def homepage(request: Request):
    # NOTE(review): the original page's HTML markup appears to have been
    # stripped in transit — only the visible text content survives below.
    # Restore the real markup from version control before shipping.
    return r"""
Multimodal-Edge-Comparator
| Node-Based Inference Canvas 10x Vision Models
Input Image ID: 01
Click or drop image here
Model Selector ID: 02
QWEN3-VL · 2B

Qwen3-VL-2B-Instruct — dedicated vision-language model by Alibaba Cloud. Strong spatial grounding, OCR & instruction-following.
Task Config ID: 03
Output Stream ID: 04
Results will stream here...
View Grounding ID: 05
Active for Point / Detect tasks.
Run inference to visualise.
"""


if __name__ == "__main__":
    # NOTE(review): the original called ``app.launch(show_error=True,
    # ssr_mode=False)`` — a gradio API that does not exist on a FastAPI app.
    # Serve with uvicorn instead (7860 is the Hugging Face Spaces default port).
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)