"""Multimodal edge-model comparator: model loading, SSE inference and web UI.

All models are loaded eagerly at import time; a model that fails to load is
left as ``None`` and reported to the client when it is requested.
"""

import os
import io
import json
import ast
import re
import uuid
import threading
from pathlib import Path
from typing import Optional

# NOTE: `spaces` is deliberately imported before `torch`; keep this order.
import spaces
import torch
from PIL import Image

from gradio import Server
from fastapi import Request, UploadFile, File, Form
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen3_5ForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
    Gemma4ForConditionalGeneration,
    AutoProcessor,
    AutoModelForImageTextToText,
    TextIteratorStreamer,
)
from qwen_vl_utils import process_vision_info

# --- App Configuration & Initialization ---
app = Server()

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

QWEN_VL_2B_MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"
QWEN_VL_4B_MODEL_NAME = "Qwen/Qwen3-VL-4B-Instruct"
QWEN_4B_UNREDACTED_NAME = "prithivMLmods/Qwen3.5-4B-Unredacted-MAX"
QWEN_4B_MODEL_NAME = "Qwen/Qwen3.5-4B"
QWEN_2B_MODEL_NAME = "Qwen/Qwen3.5-2B"
LFM_450_MODEL_NAME = "LiquidAI/LFM2.5-VL-450M"
GEMMA4_E2B_NAME = "google/gemma-4-E2B-it"
LFM_16_MODEL_NAME = "LiquidAI/LFM2.5-VL-1.6B"
QWEN_UNREDACTED_NAME = "prithivMLmods/Qwen3.5-2B-Unredacted-MAX"
QWEN25_VL_3B_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"


# --- Model Loading Helpers ---
def _build_qwen3_vl(repo_id: str):
    """Instantiate a Qwen3-VL checkpoint in bfloat16 and move it to DEVICE."""
    return Qwen3VLForConditionalGeneration.from_pretrained(
        repo_id,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    ).to(DEVICE).eval()


def _build_qwen3_5(repo_id: str):
    """Instantiate a Qwen3.5 checkpoint with DTYPE, placed directly on DEVICE."""
    return Qwen3_5ForConditionalGeneration.from_pretrained(
        repo_id,
        torch_dtype=DTYPE,
        device_map=DEVICE,
    ).eval()


def _build_lfm(repo_id: str):
    """Instantiate an LFM2.5-VL checkpoint in bfloat16 with automatic placement."""
    return AutoModelForImageTextToText.from_pretrained(
        repo_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    ).eval()


def _build_gemma4(repo_id: str):
    """Instantiate Gemma4 in bfloat16; placed explicitly when CUDA is absent."""
    model = Gemma4ForConditionalGeneration.from_pretrained(
        repo_id,
        torch_dtype=torch.bfloat16,
        device_map="auto" if torch.cuda.is_available() else None,
    ).eval()
    if not torch.cuda.is_available():
        model = model.to(DEVICE)
    return model


def _build_qwen2_5_vl(repo_id: str):
    """Instantiate Qwen2.5-VL with automatic dtype and device placement."""
    return Qwen2_5_VLForConditionalGeneration.from_pretrained(
        repo_id,
        torch_dtype="auto",
        device_map="auto",
    ).eval()


def _load_model_pair(
    label: str,
    repo_id: str,
    build_model,
    *,
    banner: Optional[str] = None,
    **processor_kwargs,
):
    """Load a model and its processor, tolerating failure.

    Args:
        label: Display name used in the success / failure log lines.
        repo_id: Hub repository id passed to ``build_model`` and to
            ``AutoProcessor.from_pretrained``.
        build_model: Callable taking ``repo_id`` and returning the model.
        banner: Text shown after "Loading " in the first log line; defaults
            to ``"<label> model"``.
        **processor_kwargs: Extra keyword arguments for the processor loader.

    Returns:
        ``(model, processor)`` on success, ``(None, None)`` if anything in the
        load raised. A failure is only printed so the app can still start
        with the remaining models.
    """
    if banner is None:
        banner = f"{label} model"
    print(f"Loading {banner}: {repo_id} on {DEVICE}...")
    try:
        model = build_model(repo_id)
        processor = AutoProcessor.from_pretrained(repo_id, **processor_kwargs)
        print(f"{label} model loaded successfully.")
    except Exception as e:
        print(f"Warning: {label} model loading failed. Error: {e}")
        return None, None
    return model, processor


# ── Qwen3-VL-2B-Instruct ────────────────────────────────
qwen_vl_2b_model, qwen_vl_2b_processor = _load_model_pair(
    "Qwen3-VL-2B",
    QWEN_VL_2B_MODEL_NAME,
    _build_qwen3_vl,
    trust_remote_code=True,
)

# ── Qwen3-VL-4B-Instruct ────────────────────────────────
qwen_vl_4b_model, qwen_vl_4b_processor = _load_model_pair(
    "Qwen3-VL-4B",
    QWEN_VL_4B_MODEL_NAME,
    _build_qwen3_vl,
    trust_remote_code=True,
)

# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
qwen_4b_unredacted_model, qwen_4b_unredacted_processor = _load_model_pair(
    "Qwen3.5-4B-Unredacted-MAX",
    QWEN_4B_UNREDACTED_NAME,
    _build_qwen3_5,
    banner="Qwen3.5-4B-Unredacted-MAX",
)

# ── Qwen3.5-4B ──────────────────────────────────────────
qwen_4b_model, qwen_4b_processor = _load_model_pair(
    "Qwen3.5-4B",
    QWEN_4B_MODEL_NAME,
    _build_qwen3_5,
)

# ── Qwen3.5-2B ──────────────────────────────────────────
qwen_2b_model, qwen_2b_processor = _load_model_pair(
    "Qwen3.5-2B",
    QWEN_2B_MODEL_NAME,
    _build_qwen3_5,
)

# ── LFM2.5-VL-450M ──────────────────────────────────────
lfm_450_model, lfm_450_processor = _load_model_pair(
    "LFM-450M",
    LFM_450_MODEL_NAME,
    _build_lfm,
)

# ── Gemma4-E2B-it ───────────────────────────────────────
gemma4_e2b_model, gemma4_e2b_processor = _load_model_pair(
    "Gemma4-E2B-it",
    GEMMA4_E2B_NAME,
    _build_gemma4,
    banner="Gemma4-E2B-it",
)

# ── LFM2.5-VL-1.6B ──────────────────────────────────────
lfm_16_model, lfm_16_processor = _load_model_pair(
    "LFM-1.6B",
    LFM_16_MODEL_NAME,
    _build_lfm,
)

# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
qwen_unredacted_model, qwen_unredacted_processor = _load_model_pair(
    "Qwen3.5-2B-Unredacted-MAX",
    QWEN_UNREDACTED_NAME,
    _build_qwen3_5,
    banner="Qwen3.5-2B-Unredacted-MAX",
)

# ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
qwen25_vl_3b_model, qwen25_vl_3b_processor = _load_model_pair(
    "Qwen2.5-VL-3B-Instruct",
    QWEN25_VL_3B_NAME,
    _build_qwen2_5_vl,
    banner="Qwen2.5-VL-3B-Instruct",
)


# --- Utility Functions ---
def safe_parse_json(text: str):
    """Parse model output as JSON, falling back to a Python literal.

    A leading code fence (optionally tagged ``json``) and a trailing code
    fence are stripped first.

    Returns:
        The parsed value, or ``{}`` when the text is neither valid JSON nor a
        valid Python literal.
    """
    text = text.strip()
    text = re.sub(r"^```(json)?", "", text)
    text = re.sub(r"```$", "", text)
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    try:
        # literal_eval only accepts literals, so it does not execute code.
        return ast.literal_eval(text)
    except Exception:
        return {}


# --- Inference Helpers ---
_SSE_DONE = "data: [DONE]\n\n"


def _sse_chunk(text: str) -> str:
    """Wrap ``text`` as one server-sent event carrying a ``chunk`` payload."""
    return f"data: {json.dumps({'chunk': text})}\n\n"


def _build_prompt(category: str, prompt: str) -> str:
    """Expand the user's prompt according to the selected task category."""
    if category == "Caption":
        return f"Provide a {prompt} length caption for the image."
    if category == "Point":
        return f"Provide 2d point coordinates for {prompt}. Report in JSON format."
    if category == "Detect":
        return f"Provide bounding box coordinates for {prompt}. Report in JSON format."
    # "Query" and any unrecognised category pass the prompt through verbatim.
    return prompt


def _inputs_via_processor(processor, image, messages):
    """Render the chat template to text, then tokenise text and image together."""
    rendered = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return processor(
        text=[rendered], images=[image], return_tensors="pt", padding=True
    )


def _inputs_via_chat_template(processor, image, messages):
    """Let the chat template tokenise the conversation directly (LFM models)."""
    return processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
        tokenize=True,
    )


def _inputs_via_vision_info(processor, image, messages):
    """Qwen2.5-VL recipe: vision inputs are extracted by ``process_vision_info``."""
    rendered = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    return processor(
        text=[rendered],
        images=image_inputs,
        videos=video_inputs,
        return_tensors="pt",
        padding=True,
    )


# Sampling presets passed to `generate` on top of the common arguments.
_SAMPLING_T1 = {"temperature": 1.0, "do_sample": True}
# NOTE(review): this preset does not set do_sample; whether sampling is
# active depends on each checkpoint's generation config — confirm.
_SAMPLING_MIN_P = {"temperature": 1.5, "min_p": 0.1}
_SAMPLING_DEFAULT = {}

# model_id -> (label, model, processor, input builder, sampling kwargs)
_MODEL_REGISTRY = {
    "qwen_vl_2b": (
        "Qwen3-VL-2B", qwen_vl_2b_model, qwen_vl_2b_processor,
        _inputs_via_processor, _SAMPLING_T1,
    ),
    "qwen_vl_4b": (
        "Qwen3-VL-4B", qwen_vl_4b_model, qwen_vl_4b_processor,
        _inputs_via_processor, _SAMPLING_T1,
    ),
    "qwen_4b_unredacted": (
        "Qwen3.5-4B-Unredacted-MAX", qwen_4b_unredacted_model,
        qwen_4b_unredacted_processor, _inputs_via_processor, _SAMPLING_MIN_P,
    ),
    "qwen_4b": (
        "Qwen3.5-4B", qwen_4b_model, qwen_4b_processor,
        _inputs_via_processor, _SAMPLING_MIN_P,
    ),
    "qwen_2b": (
        "Qwen3.5-2B", qwen_2b_model, qwen_2b_processor,
        _inputs_via_processor, _SAMPLING_MIN_P,
    ),
    "lfm_450": (
        "LFM-450M", lfm_450_model, lfm_450_processor,
        _inputs_via_chat_template, _SAMPLING_DEFAULT,
    ),
    "gemma4_e2b": (
        "Gemma4-E2B-it", gemma4_e2b_model, gemma4_e2b_processor,
        _inputs_via_processor, _SAMPLING_T1,
    ),
    "lfm_16": (
        "LFM-1.6B", lfm_16_model, lfm_16_processor,
        _inputs_via_chat_template, _SAMPLING_DEFAULT,
    ),
    "qwen_unredacted": (
        "Qwen3.5-2B-Unredacted-MAX", qwen_unredacted_model,
        qwen_unredacted_processor, _inputs_via_processor, _SAMPLING_MIN_P,
    ),
    "qwen25_vl_3b": (
        "Qwen2.5-VL-3B-Instruct", qwen25_vl_3b_model, qwen25_vl_3b_processor,
        _inputs_via_vision_info, _SAMPLING_T1,
    ),
}


# --- Inference Generator (Streaming) ---
@spaces.GPU(duration=120)
def generate_inference_stream(
    image: Image.Image,
    category: str,
    prompt: str,
    model_id: str = "qwen_vl_2b",
):
    """Stream a model's answer for ``image`` as server-sent events.

    Args:
        image: The input picture.
        category: Task type; "Caption", "Point" and "Detect" wrap ``prompt``
            in a task-specific instruction, anything else uses it verbatim.
        prompt: User text (or the subject / length for the templated tasks).
        model_id: Key selecting which loaded model to run.

    Yields:
        ``data: {"chunk": ...}`` events for each decoded text piece, always
        followed by a final ``data: [DONE]`` event. Problems (unknown model
        id, model not loaded, failure during generation) are delivered as an
        ``[Error] ...`` chunk rather than by aborting the stream.
    """
    full_prompt = _build_prompt(category, prompt)

    entry = _MODEL_REGISTRY.get(model_id)
    if entry is None:
        # Previously an unknown id produced an empty stream with no reason.
        yield _sse_chunk(f"[Error] Unknown model_id: {model_id!r}.")
        yield _SSE_DONE
        return

    label, model, processor, build_inputs, sampling = entry
    if model is None or processor is None:
        yield _sse_chunk(f"[Error] {label} model not loaded.")
        yield _SSE_DONE
        return

    worker_failures = []
    error = None
    try:
        messages = [{"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": full_prompt},
        ]}]
        inputs = build_inputs(processor, image, messages).to(model.device)
        streamer = TextIteratorStreamer(
            processor.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
            timeout=120,
        )
        generate_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=1024,
            use_cache=True,
            **sampling,
        )

        def _run_generate():
            try:
                model.generate(**generate_kwargs)
            except Exception as exc:
                worker_failures.append(exc)
                # Unblock the consumer immediately instead of leaving it to
                # wait out the streamer timeout.
                streamer.end()

        worker = threading.Thread(target=_run_generate)
        worker.start()
        for piece in streamer:
            if piece:
                yield _sse_chunk(piece)
        worker.join()
    except Exception as exc:
        error = exc

    # A failure recorded by the worker is the root cause; prefer it over any
    # secondary error (such as a streamer timeout) seen on this side.
    if worker_failures:
        error = worker_failures[0]
    if error is not None:
        yield _sse_chunk(
            f"[Error] Generation failed: {type(error).__name__}: {error}"
        )
    yield _SSE_DONE


# --- FastAPI Endpoints ---
@app.post("/api/run")
async def run_inference(
    image: UploadFile = File(...),
    category: str = Form(...),
    prompt: str = Form(...),
    model_id: str = Form("qwen_vl_2b"),
):
    """Decode the uploaded image and stream inference output as SSE.

    The image is converted to RGB and shrunk in place to fit within 512x512
    before inference. Any exception raised while preparing the response is
    returned as a JSON ``{"error": ...}`` body with status 500.
    """
    try:
        img_bytes = await image.read()
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        img.thumbnail((512, 512))
        return StreamingResponse(
            generate_inference_stream(img, category, prompt, model_id),
            media_type="text/event-stream",
        )
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


# --- Frontend UI ---
@app.get("/", response_class=HTMLResponse)
async def homepage(request: Request):
    """Serve the single-page UI."""
    # NOTE(review): in this copy of the file the template below contains only
    # text and no markup — confirm against the deployed page before editing.
    return """ Multimodal-Edge-Comparator
| Node-Based Inference Canvas 10x Vision Models
Input Image ID: 01
Click or drop image here
Model Selector ID: 02
QWEN3-VL · 2B

Qwen3-VL-2B-Instruct — dedicated vision-language model by Alibaba Cloud. Strong spatial grounding, OCR & instruction-following.
Task Config ID: 03
Output Stream ID: 04
Results will stream here...
View Grounding ID: 05
Active for Point / Detect tasks.
Run inference to visualise.
"""


if __name__ == "__main__":
    app.launch(show_error=True)