prithivMLmods's picture
update app [final] ✅
bca4e3d verified
raw
history blame
68.1 kB
import os
import io
import json
import ast
import re
import uuid
import threading
from pathlib import Path
from typing import Optional
import spaces
import torch
from PIL import Image
from gradio import Server
from fastapi import Request, UploadFile, File, Form
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
from transformers import (
Qwen2_5_VLForConditionalGeneration,
Qwen3_5ForConditionalGeneration,
Qwen3VLForConditionalGeneration,
Gemma4ForConditionalGeneration,
AutoProcessor,
AutoModelForImageTextToText,
TextIteratorStreamer,
)
from qwen_vl_utils import process_vision_info
# --- App Configuration & Initialization ---
# Application object built from gradio's `Server`; the route decorators used
# later in this file (`@app.post`, `@app.get`) register their handlers on it.
app = Server()
# Pick the execution device and the default dtype in one place.
# With CUDA: run on the GPU, using bfloat16 when the GPU reports support for it
# and float16 otherwise. Without CUDA: run on the CPU with float16.
if torch.cuda.is_available():
    DEVICE = "cuda"
    DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    DEVICE = "cpu"
    DTYPE = torch.float16

# Hugging Face Hub ids of the checkpoints loaded below, grouped by family.
# Qwen3-VL instruct models
QWEN_VL_2B_MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"
QWEN_VL_4B_MODEL_NAME = "Qwen/Qwen3-VL-4B-Instruct"
# Qwen3.5 base models and their "Unredacted-MAX" variants
QWEN_4B_MODEL_NAME = "Qwen/Qwen3.5-4B"
QWEN_2B_MODEL_NAME = "Qwen/Qwen3.5-2B"
QWEN_4B_UNREDACTED_NAME = "prithivMLmods/Qwen3.5-4B-Unredacted-MAX"
QWEN_UNREDACTED_NAME = "prithivMLmods/Qwen3.5-2B-Unredacted-MAX"
# Qwen2.5-VL
QWEN25_VL_3B_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
# LiquidAI LFM2.5-VL
LFM_450_MODEL_NAME = "LiquidAI/LFM2.5-VL-450M"
LFM_16_MODEL_NAME = "LiquidAI/LFM2.5-VL-1.6B"
# Gemma 4
GEMMA4_E2B_NAME = "google/gemma-4-E2B-it"
# --- Model Loading ---
def _load_model_pair(
    label: str,
    model_cls,
    repo_id: str,
    *,
    model_kwargs: dict,
    processor_kwargs: Optional[dict] = None,
    move_to_device: bool = False,
    announce_suffix: str = " model",
):
    """Load one checkpoint and its processor, tolerating failure.

    Args:
        label: Human-readable model name used in the log lines.
        model_cls: Class whose ``from_pretrained`` builds the model.
        repo_id: Hub id passed to both ``from_pretrained`` calls.
        model_kwargs: Keyword arguments for ``model_cls.from_pretrained``.
        processor_kwargs: Keyword arguments for
            ``AutoProcessor.from_pretrained`` (none when omitted).
        move_to_device: When true, call ``.to(DEVICE)`` on the loaded model
            (used for loads that do not pass a ``device_map``).
        announce_suffix: Text placed after ``label`` in the "Loading ..." line;
            some models are announced with " model" and some without.

    Returns:
        ``(model, processor)`` on success, ``(None, None)`` if anything in the
        load raised. The failure is printed, not re-raised, so one broken
        checkpoint does not stop the app from starting.
    """
    print(f"Loading {label}{announce_suffix}: {repo_id} on {DEVICE}...")
    try:
        model = model_cls.from_pretrained(repo_id, **model_kwargs)
        if move_to_device:
            model = model.to(DEVICE)
        model = model.eval()
        processor = AutoProcessor.from_pretrained(repo_id, **(processor_kwargs or {}))
    except Exception as e:
        # Deliberate best-effort: both halves are reported as missing together.
        print(f"Warning: {label} model loading failed. Error: {e}")
        return None, None
    print(f"{label} model loaded successfully.")
    return model, processor


# ── Qwen3-VL-2B-Instruct ────────────────────────────────
qwen_vl_2b_model, qwen_vl_2b_processor = _load_model_pair(
    "Qwen3-VL-2B",
    Qwen3VLForConditionalGeneration,
    QWEN_VL_2B_MODEL_NAME,
    model_kwargs={"trust_remote_code": True, "torch_dtype": torch.bfloat16},
    processor_kwargs={"trust_remote_code": True},
    move_to_device=True,
)

# ── Qwen3-VL-4B-Instruct ────────────────────────────────
qwen_vl_4b_model, qwen_vl_4b_processor = _load_model_pair(
    "Qwen3-VL-4B",
    Qwen3VLForConditionalGeneration,
    QWEN_VL_4B_MODEL_NAME,
    model_kwargs={"trust_remote_code": True, "torch_dtype": torch.bfloat16},
    processor_kwargs={"trust_remote_code": True},
    move_to_device=True,
)

# ── Qwen3.5-4B-Unredacted-MAX ───────────────────────────
qwen_4b_unredacted_model, qwen_4b_unredacted_processor = _load_model_pair(
    "Qwen3.5-4B-Unredacted-MAX",
    Qwen3_5ForConditionalGeneration,
    QWEN_4B_UNREDACTED_NAME,
    model_kwargs={"torch_dtype": DTYPE, "device_map": DEVICE},
    announce_suffix="",
)

# ── Qwen3.5-4B ──────────────────────────────────────────
qwen_4b_model, qwen_4b_processor = _load_model_pair(
    "Qwen3.5-4B",
    Qwen3_5ForConditionalGeneration,
    QWEN_4B_MODEL_NAME,
    model_kwargs={"torch_dtype": DTYPE, "device_map": DEVICE},
)

# ── Qwen3.5-2B ──────────────────────────────────────────
qwen_2b_model, qwen_2b_processor = _load_model_pair(
    "Qwen3.5-2B",
    Qwen3_5ForConditionalGeneration,
    QWEN_2B_MODEL_NAME,
    model_kwargs={"torch_dtype": DTYPE, "device_map": DEVICE},
)

# ── LFM2.5-VL-450M ──────────────────────────────────────
lfm_450_model, lfm_450_processor = _load_model_pair(
    "LFM-450M",
    AutoModelForImageTextToText,
    LFM_450_MODEL_NAME,
    model_kwargs={"device_map": "auto", "torch_dtype": torch.bfloat16},
)

# ── Gemma4-E2B-it ───────────────────────────────────────
# With CUDA the placement is left to device_map="auto"; without it the model
# is loaded with no device map and moved to DEVICE explicitly.
gemma4_e2b_model, gemma4_e2b_processor = _load_model_pair(
    "Gemma4-E2B-it",
    Gemma4ForConditionalGeneration,
    GEMMA4_E2B_NAME,
    model_kwargs={
        "torch_dtype": torch.bfloat16,
        "device_map": "auto" if torch.cuda.is_available() else None,
    },
    move_to_device=not torch.cuda.is_available(),
    announce_suffix="",
)

# ── LFM2.5-VL-1.6B ──────────────────────────────────────
lfm_16_model, lfm_16_processor = _load_model_pair(
    "LFM-1.6B",
    AutoModelForImageTextToText,
    LFM_16_MODEL_NAME,
    model_kwargs={"device_map": "auto", "torch_dtype": torch.bfloat16},
)

# ── Qwen3.5-2B-Unredacted-MAX ───────────────────────────
qwen_unredacted_model, qwen_unredacted_processor = _load_model_pair(
    "Qwen3.5-2B-Unredacted-MAX",
    Qwen3_5ForConditionalGeneration,
    QWEN_UNREDACTED_NAME,
    model_kwargs={"torch_dtype": DTYPE, "device_map": DEVICE},
    announce_suffix="",
)

# ── Qwen2.5-VL-3B-Instruct ──────────────────────────────
qwen25_vl_3b_model, qwen25_vl_3b_processor = _load_model_pair(
    "Qwen2.5-VL-3B-Instruct",
    Qwen2_5_VLForConditionalGeneration,
    QWEN25_VL_3B_NAME,
    model_kwargs={"torch_dtype": "auto", "device_map": "auto"},
    announce_suffix="",
)
# --- Utility Functions ---
def safe_parse_json(text: str):
    """Best-effort conversion of model output into a Python object.

    Two candidate strings are tried in order:

    1. The input with a leading ```` ``` ````/```` ```json ```` fence and a
       trailing ```` ``` ```` fence removed.
    2. The contents of the first complete fenced block found anywhere in the
       input, whatever its language tag (covers ```` ```JSON ```` and answers
       that wrap the block in explanatory prose).

    Each candidate is parsed as JSON first and, failing that, as a Python
    literal (handles single-quoted dicts such as ``{'a': 1}``).

    Args:
        text: Raw model output. Non-string values are treated as unparseable.

    Returns:
        The parsed object, or ``{}`` when nothing could be parsed.
    """
    if not isinstance(text, str):
        return {}

    stripped = text.strip()

    # Candidate 1: fences only at the very start / very end of the text.
    unfenced = re.sub(r"^```(json)?", "", stripped)
    unfenced = re.sub(r"```$", "", unfenced).strip()
    candidates = [unfenced]

    # Candidate 2: first fenced block anywhere, with any (or no) language tag.
    fenced = re.search(r"```[A-Za-z0-9_+-]*\s*(.*?)```", stripped, re.DOTALL)
    if fenced is not None:
        candidates.append(fenced.group(1).strip())

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            pass
        try:
            # literal_eval only builds literals; it does not execute code.
            return ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
    return {}
# --- Inference Generator (Streaming) ---
# Terminal server-sent-event frame; every stream ends with it.
_SSE_DONE = "data: [DONE]\n\n"

# Extra `generate` keyword arguments, per model family.
_SAMPLE_T10 = {"temperature": 1.0, "do_sample": True}
# NOTE(review): no `do_sample=True` here, exactly as before. Whether
# temperature/min_p take effect then depends on the checkpoint's own
# generation config — confirm this is intended.
_SAMPLE_T15_MIN_P = {"temperature": 1.5, "min_p": 0.1}
_SAMPLE_NONE = {}


def _sse_chunk(text: str) -> str:
    """Wrap *text* in one SSE frame whose payload is ``{"chunk": text}``."""
    return f"data: {json.dumps({'chunk': text})}\n\n"


def _build_full_prompt(category: str, prompt: str) -> str:
    """Turn the task category plus the user's text into the model instruction."""
    if category == "Caption":
        return f"Provide a {prompt} length caption for the image."
    if category == "Point":
        return f"Provide 2d point coordinates for {prompt}. Report in JSON format."
    if category == "Detect":
        return f"Provide bounding box coordinates for {prompt}. Report in JSON format."
    # "Query" and any unrecognized category use the user's text verbatim.
    return prompt


def _lookup_model(model_id: str):
    """Return ``(label, model, processor, input_style, sampling)`` or ``None``.

    The table is built on each call so it always reflects the current
    module-level model/processor objects (which are ``None`` after a failed
    load).
    """
    registry = {
        "qwen_vl_2b": ("Qwen3-VL-2B", qwen_vl_2b_model, qwen_vl_2b_processor,
                       "processor", _SAMPLE_T10),
        "qwen_vl_4b": ("Qwen3-VL-4B", qwen_vl_4b_model, qwen_vl_4b_processor,
                       "processor", _SAMPLE_T10),
        "qwen_4b_unredacted": ("Qwen3.5-4B-Unredacted-MAX", qwen_4b_unredacted_model,
                               qwen_4b_unredacted_processor, "processor", _SAMPLE_T15_MIN_P),
        "qwen_4b": ("Qwen3.5-4B", qwen_4b_model, qwen_4b_processor,
                    "processor", _SAMPLE_T15_MIN_P),
        "qwen_2b": ("Qwen3.5-2B", qwen_2b_model, qwen_2b_processor,
                    "processor", _SAMPLE_T15_MIN_P),
        "lfm_450": ("LFM-450M", lfm_450_model, lfm_450_processor,
                    "chat_template", _SAMPLE_NONE),
        "gemma4_e2b": ("Gemma4-E2B-it", gemma4_e2b_model, gemma4_e2b_processor,
                       "processor", _SAMPLE_T10),
        "lfm_16": ("LFM-1.6B", lfm_16_model, lfm_16_processor,
                   "chat_template", _SAMPLE_NONE),
        "qwen_unredacted": ("Qwen3.5-2B-Unredacted-MAX", qwen_unredacted_model,
                            qwen_unredacted_processor, "processor", _SAMPLE_T15_MIN_P),
        "qwen25_vl_3b": ("Qwen2.5-VL-3B-Instruct", qwen25_vl_3b_model,
                         qwen25_vl_3b_processor, "vision_info", _SAMPLE_T10),
    }
    return registry.get(model_id)


def _prepare_inputs(input_style: str, model, processor, image, full_prompt: str):
    """Build the model inputs for one image + prompt and move them to the model's device.

    ``input_style`` selects one of the three preprocessing recipes used here:
    ``"chat_template"`` tokenizes directly through ``apply_chat_template``;
    ``"vision_info"`` renders the chat template to text and feeds the processor
    with the output of ``process_vision_info``; anything else renders the chat
    template to text and feeds the processor with the image itself.
    """
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": full_prompt},
    ]}]

    if input_style == "chat_template":
        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True,
            return_tensors="pt", return_dict=True, tokenize=True,
        )
        return inputs.to(model.device)

    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    if input_style == "vision_info":
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text_input],
            images=image_inputs,
            videos=video_inputs,
            return_tensors="pt",
            padding=True,
        )
    else:
        inputs = processor(
            text=[text_input], images=[image], return_tensors="pt", padding=True
        )
    return inputs.to(model.device)


@spaces.GPU(duration=120)
def generate_inference_stream(
    image: Image.Image, category: str, prompt: str, model_id: str = "qwen_vl_2b"
):
    """Stream one model's answer for an image as server-sent-event frames.

    Args:
        image: Input picture.
        category: Task selector; "Caption", "Point" and "Detect" wrap the
            prompt in a fixed instruction, anything else sends it verbatim.
        prompt: User text (or caption length / object name for the wrapped
            categories).
        model_id: Key of the model to run.

    Yields:
        ``data: {"chunk": <text>}`` frames while text is produced, then the
        terminal ``data: [DONE]`` frame. An unknown ``model_id``, a model that
        failed to load, or a failure inside generation is reported as a single
        ``[Error] ...`` chunk before the terminal frame.
    """
    full_prompt = _build_full_prompt(category, prompt)

    entry = _lookup_model(model_id)
    if entry is None:
        # Previously this case produced a bare [DONE] with no explanation.
        yield _sse_chunk(f"[Error] Unknown model id: {model_id}")
        yield _SSE_DONE
        return

    label, model, processor, input_style, sampling = entry
    if model is None or processor is None:
        yield _sse_chunk(f"[Error] {label} model not loaded.")
        yield _SSE_DONE
        return

    inputs = _prepare_inputs(input_style, model, processor, image, full_prompt)
    streamer = TextIteratorStreamer(
        processor.tokenizer,
        skip_prompt=True, skip_special_tokens=True, timeout=120,
    )
    failures = []

    def _run_generation():
        # Runs on the worker thread. If generate() raises, the streamer would
        # otherwise never receive its end-of-stream signal and the consumer
        # loop below would block until the queue timeout.
        try:
            model.generate(
                **inputs, streamer=streamer, max_new_tokens=1024,
                use_cache=True, **sampling,
            )
        except Exception as exc:
            print(f"Warning: {label} generation failed. Error: {exc}")
            failures.append(exc)
            streamer.end()

    worker = threading.Thread(target=_run_generation)
    worker.start()
    for tok in streamer:
        if tok:
            yield _sse_chunk(tok)
    worker.join()

    if failures:
        yield _sse_chunk(f"[Error] {label} generation failed: {failures[0]}")
    yield _SSE_DONE
# --- FastAPI Endpoints ---
@app.post("/api/run")
async def run_inference(
    image: UploadFile = File(...),
    category: str = Form(...),
    prompt: str = Form(...),
    model_id: str = Form("qwen_vl_2b"),
):
    """Decode the uploaded image and stream the selected model's answer.

    Form fields:
        image: Uploaded picture; decoded, converted to RGB and shrunk in place
            to fit within 512x512.
        category: Task selector forwarded to ``generate_inference_stream``.
        prompt: User text forwarded to ``generate_inference_stream``.
        model_id: Model key forwarded to ``generate_inference_stream``.

    Returns:
        A ``text/event-stream`` ``StreamingResponse`` on success; a JSON
        ``{"error": ...}`` body with status 400 when the upload cannot be
        decoded as an image, or status 500 for any other failure while the
        response is being set up.
    """
    try:
        img_bytes = await image.read()
        try:
            img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        except (OSError, ValueError, Image.DecompressionBombError) as e:
            # Empty, corrupt, non-image or oversized uploads are the client's
            # fault, so they are reported as 400 and not as a server error.
            return JSONResponse(
                {"error": f"Invalid or unsupported image: {e}"}, status_code=400
            )
        img.thumbnail((512, 512))
        return StreamingResponse(
            generate_inference_stream(img, category, prompt, model_id),
            media_type="text/event-stream",
        )
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
# --- Frontend UI ---
@app.get("/", response_class=HTMLResponse)
async def homepage(request: Request):
return """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Multimodal-Edge-Comparator</title>
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;600;700&display=swap" rel="stylesheet">
<style>
:root {
--bg: #0d0d0f;
--grid: #1a1a1f;
--node-bg: #13131a;
--node-header: #1c1c26;
--node-border: #2a2a3a;
--accent: #7c6af7;
--accent2: #4ecdc4;
--accent3: #ff6b6b;
--text: #e8e8f0;
--muted: #6b6b8a;
--port: #4ecdc4;
--wire: #2a2a4a;
--wire-active: #7c6af7;
}
* { box-sizing: border-box; margin: 0; padding: 0; }
html, body {
min-height: 100%; background: var(--bg);
color: var(--text); font-family: 'JetBrains Mono', monospace;
}
body {
background-image:
radial-gradient(circle at 20% 50%, rgba(124,106,247,0.04) 0%, transparent 50%),
radial-gradient(circle at 80% 20%, rgba(78,205,196,0.04) 0%, transparent 50%),
linear-gradient(var(--grid) 1px, transparent 1px),
linear-gradient(90deg, var(--grid) 1px, transparent 1px);
background-size: 100% 100%, 100% 100%, 24px 24px, 24px 24px;
overflow-x: auto; overflow-y: auto;
}
/* ── Top Bar ── */
.top-bar {
position: sticky; top: 0; left: 0; right: 0; height: 42px;
background: rgba(13,13,15,0.95);
border-bottom: 1px solid var(--node-border);
display: flex; align-items: center; padding: 0 20px;
gap: 12px; z-index: 1000; backdrop-filter: blur(12px);
}
.top-bar .logo { font-size: 13px; font-weight: 700; color: var(--accent); letter-spacing: 0.05em; }
.top-bar .sep { color: var(--node-border); }
.top-bar .sub { font-size: 11px; color: var(--muted); }
.top-bar .badge {
margin-left: auto;
background: rgba(124,106,247,0.15);
border: 1px solid rgba(124,106,247,0.3);
padding: 3px 10px; border-radius: 20px;
font-size: 10px; color: var(--accent);
}
/* ── Canvas ── */
#canvas {
position: relative; width: 1360px;
min-height: calc(100vh - 42px); height: 900px; margin: 0 auto;
}
svg.wires {
position: absolute; top: 0; left: 0;
width: 100%; height: 100%;
pointer-events: none; z-index: 2; overflow: visible;
}
path.wire { fill: none; stroke: var(--wire); stroke-width: 2.5; stroke-linecap: round; }
path.wire.active {
stroke: var(--wire-active); stroke-width: 3;
stroke-dasharray: 8 4; animation: flow 0.6s linear infinite;
}
@keyframes flow { to { stroke-dashoffset: -24; } }
/* ── Nodes ── */
.node {
position: absolute; width: 295px;
background: var(--node-bg); border: 1px solid var(--node-border);
border-radius: 9px; box-shadow: 0 8px 28px rgba(0,0,0,0.5);
z-index: 10; display: flex; flex-direction: column; transition: box-shadow 0.2s;
}
.node:hover { box-shadow: 0 8px 28px rgba(0,0,0,0.5), 0 0 0 1px rgba(124,106,247,0.3); }
.node.fixed-height { height: 340px; }
.node-header {
background: var(--node-header); padding: 7px 12px;
border-bottom: 1px solid var(--node-border); border-radius: 9px 9px 0 0;
font-size: 11px; font-weight: 700; cursor: grab;
display: flex; justify-content: space-between; align-items: center;
flex-shrink: 0; user-select: none;
}
.node-header:active { cursor: grabbing; }
.node-header .id {
font-size: 10px; color: var(--muted);
background: rgba(255,255,255,0.04); padding: 2px 7px; border-radius: 4px;
}
.node-body { padding: 10px; display: flex; flex-direction: column; gap: 8px; flex: 1; overflow: hidden; }
/* ── Ports ── */
.port {
position: absolute; width: 11px; height: 11px;
background: var(--node-bg); border: 2px solid var(--port);
border-radius: 50%; z-index: 30;
}
.port.out { right: -6px; }
.port.in { left: -6px; }
/* ── Labels ── */
label {
font-size: 10px; color: var(--muted); font-weight: 600;
display: block; margin-bottom: 3px; letter-spacing: 0.07em; text-transform: uppercase;
}
input[type="file"] { display: none; }
/* ── Upload Zone ── */
.file-upload {
border: 1.5px dashed var(--node-border); border-radius: 7px; padding: 12px 10px;
text-align: center; cursor: pointer; font-size: 11px; color: var(--muted);
transition: border-color 0.2s, background 0.2s; background: rgba(255,255,255,0.01);
display: flex; flex-direction: column; align-items: center; gap: 5px;
}
.file-upload:hover { border-color: var(--accent); background: rgba(124,106,247,0.04); }
.file-upload svg { opacity: 0.5; transition: opacity 0.2s; }
.file-upload:hover svg { opacity: 0.9; }
/* ── Preview wrapper ── */
.preview-wrap {
display: none; position: relative; border-radius: 7px;
overflow: hidden; border: 1px solid var(--node-border); background: #000;
}
.preview-wrap.visible { display: block; }
.img-preview { width: 100%; height: 170px; object-fit: contain; display: block; }
/* ── Clear button ── */
.clear-btn {
position: absolute; top: 6px; right: 6px; width: 24px; height: 24px;
border-radius: 50%; background: rgba(13,13,15,0.80);
border: 1px solid var(--node-border); color: var(--accent3); cursor: pointer;
display: flex; align-items: center; justify-content: center;
transition: background 0.18s, border-color 0.18s, transform 0.12s;
z-index: 20; backdrop-filter: blur(6px);
}
.clear-btn:hover { background: rgba(255,107,107,0.18); border-color: var(--accent3); transform: scale(1.08); }
.clear-btn:active { transform: scale(0.95); }
.clear-btn svg { pointer-events: none; }
/* ── Filename chip ── */
.img-chip {
display: none; align-items: center; gap: 6px;
background: rgba(124,106,247,0.08); border: 1px solid rgba(124,106,247,0.22);
border-radius: 5px; padding: 4px 8px; font-size: 9px; color: var(--muted); overflow: hidden;
}
.img-chip.visible { display: flex; }
.img-chip .chip-dot { width: 5px; height: 5px; border-radius: 50%; background: var(--accent2); flex-shrink: 0; box-shadow: 0 0 4px var(--accent2); }
.img-chip .chip-name { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; flex: 1; color: var(--text); font-size: 9px; }
.img-chip .chip-size { color: var(--muted); flex-shrink: 0; font-size: 9px; }
select, textarea {
width: 100%; background: rgba(0,0,0,0.3); border: 1px solid var(--node-border);
color: var(--text); padding: 7px 9px; border-radius: 5px; outline: none;
font-size: 11px; font-family: 'JetBrains Mono', monospace;
resize: none; transition: border-color 0.2s;
}
select:focus, textarea:focus { border-color: var(--accent); }
select option { background: #1c1c26; }
button.run-btn {
background: linear-gradient(135deg, var(--accent), #9b59b6);
color: #fff; border: none; padding: 8px; border-radius: 6px;
font-weight: 700; font-size: 11px; font-family: 'JetBrains Mono', monospace;
cursor: pointer; transition: opacity 0.2s, transform 0.1s;
display: flex; justify-content: center; align-items: center; gap: 8px;
letter-spacing: 0.04em; flex-shrink: 0;
}
button.run-btn:hover { opacity: 0.9; }
button.run-btn:active { transform: scale(0.98); }
button.run-btn:disabled { background: var(--node-border); cursor: not-allowed; color: #555; }
/* ── Output node ── */
.output-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
.output-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
/* ── Icon buttons (copy / download) ── */
.icon-btn {
display: flex; align-items: center; gap: 5px;
background: rgba(124,106,247,0.10); border: 1px solid rgba(124,106,247,0.25);
border-radius: 5px; padding: 3px 8px;
font-size: 9px; font-weight: 700; font-family: 'JetBrains Mono', monospace;
color: var(--accent); cursor: pointer; letter-spacing: 0.05em;
transition: background 0.18s, border-color 0.18s, transform 0.1s; flex-shrink: 0;
text-decoration: none;
}
.icon-btn:hover { background: rgba(124,106,247,0.22); border-color: var(--accent); }
.icon-btn:active { transform: scale(0.95); }
.icon-btn.teal {
background: rgba(78,205,196,0.10); border-color: rgba(78,205,196,0.25); color: var(--accent2);
}
.icon-btn.teal:hover { background: rgba(78,205,196,0.22); border-color: var(--accent2); }
.icon-btn.copied { background: rgba(78,205,196,0.15); border-color: var(--accent2); color: var(--accent2); }
.icon-btn svg { pointer-events: none; flex-shrink: 0; }
.output-box {
background: rgba(0,0,0,0.4); border: 1px solid var(--node-border);
border-radius: 5px; padding: 10px; flex: 1; overflow-y: auto;
font-size: 11px; line-height: 1.6; color: #c8c8e0; white-space: pre-wrap;
user-select: text; font-family: 'JetBrains Mono', monospace; min-height: 0;
}
/* ── Grounding node ── */
.ground-node-body { padding: 10px; display: flex; flex-direction: column; gap: 6px; flex: 1; overflow: hidden; }
.ground-header-row { display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
.ground-canvas-wrap {
position: relative; flex: 1; border: 1px solid var(--node-border);
border-radius: 5px; overflow: hidden; background: #000; min-height: 0;
}
.ground-canvas-wrap canvas { width: 100%; height: 100%; object-fit: contain; display: block; }
.ground-placeholder {
position: absolute; inset: 0; display: flex; align-items: center;
justify-content: center; font-size: 11px; color: var(--muted); text-align: center; padding: 10px;
}
.loader {
width: 11px; height: 11px; border: 2px solid rgba(255,255,255,0.3);
border-top-color: #fff; border-radius: 50%;
animation: spin 0.7s linear infinite; display: none;
}
@keyframes spin { to { transform: rotate(360deg); } }
.status-dot { width: 6px; height: 6px; border-radius: 50%; background: var(--muted); display: inline-block; margin-right: 6px; }
.status-dot.active { background: var(--accent2); box-shadow: 0 0 5px var(--accent2); }
/* ── Model badges ── */
.model-badge {
display: inline-block; padding: 2px 7px; border-radius: 4px;
font-size: 9px; font-weight: 700; letter-spacing: 0.06em; text-transform: uppercase;
}
.model-badge.qvl2b { background: rgba(255,150,50,0.15); color: #ff9632; border: 1px solid rgba(255,150,50,0.35); }
.model-badge.qvl4b { background: rgba(255,100,80,0.15); color: #ff6450; border: 1px solid rgba(255,100,80,0.35); }
.model-badge.q4bunred { background: rgba(255,80,80,0.18); color: #ff5050; border: 1px solid rgba(255,80,80,0.40); }
.model-badge.q4b { background: rgba(255,200,80,0.15); color: #ffc850; border: 1px solid rgba(255,200,80,0.35); }
.model-badge.q2b { background: rgba(124,106,247,0.2); color: var(--accent); border: 1px solid rgba(124,106,247,0.3); }
.model-badge.lfm450 { background: rgba(78,205,196,0.15); color: var(--accent2); border: 1px solid rgba(78,205,196,0.3); }
.model-badge.g4e2b { background: rgba(66,197,107,0.15); color: #42c56b; border: 1px solid rgba(66,197,107,0.35); }
.model-badge.lfm16 { background: rgba(107,203,119,0.15); color: #6bcb77; border: 1px solid rgba(107,203,119,0.35); }
.model-badge.qunred { background: rgba(255,80,160,0.15); color: #ff50a0; border: 1px solid rgba(255,80,160,0.35); }
.model-badge.q25vl3b { background: rgba(80,180,255,0.15); color: #50b4ff; border: 1px solid rgba(80,180,255,0.35); }
.model-info-box { border-radius: 6px; padding: 9px; font-size: 10px; color: var(--muted); line-height: 1.55; flex-shrink: 0; }
.canvas-footer { height: 36px; }
</style>
</head>
<body>
<div class="top-bar">
<span class="logo">MULTIMODAL EDGE</span>
<span class="sep">|</span>
<span class="sub">Node-Based Inference Canvas</span>
<span class="badge">10x Vision Models</span>
</div>
<div id="canvas">
<svg class="wires">
<path id="wire-img-task" class="wire" />
<path id="wire-model-task" class="wire" />
<path id="wire-task-out" class="wire" />
<path id="wire-task-gnd" class="wire" />
</svg>
<!-- ─── ID 01 : Image Input ─── -->
<div class="node fixed-height" id="node-img" style="left:40px; top:52px;">
<div class="node-header">
<span><span class="status-dot" id="dot-img"></span>Input Image</span>
<span class="id">ID: 01</span>
</div>
<div class="node-body">
<div>
<label>Upload Image</label>
<div class="file-upload" id="dropZone">
<svg width="30" height="30" viewBox="0 0 24 24" fill="none"
stroke="#7c6af7" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round">
<rect x="3" y="3" width="18" height="18" rx="2" ry="2"/>
<circle cx="8.5" cy="8.5" r="1.5"/>
<polyline points="21 15 16 10 5 21"/>
</svg>
<span>Click or drop image here</span>
<input type="file" id="fileInput" accept="image/*">
</div>
<div class="preview-wrap" id="previewWrap">
<img id="imgPreview" class="img-preview" />
<button class="clear-btn" id="clearBtn" title="Remove image">
<svg width="12" height="12" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round">
<line x1="18" y1="6" x2="6" y2="18"/>
<line x1="6" y1="6" x2="18" y2="18"/>
</svg>
</button>
</div>
<div class="img-chip" id="imgChip" style="margin-top:6px;">
<span class="chip-dot"></span>
<span class="chip-name" id="chipName">—</span>
<span class="chip-size" id="chipSize"></span>
</div>
</div>
</div>
<div class="port out" id="port-img-out" style="top:50%;transform:translateY(-50%);"></div>
</div>
<!-- ─── ID 02 : Model Selector ─── -->
<div class="node fixed-height" id="node-model" style="left:40px; top:412px;">
<div class="node-header">
<span><span class="status-dot" id="dot-model"></span>Model Selector</span>
<span class="id">ID: 02</span>
</div>
<div class="node-body">
<div>
<label>Active Model</label>
<select id="modelSelect">
<option value="qwen_vl_2b">Qwen3-VL-2B-Instruct</option>
<option value="qwen_vl_4b">Qwen3-VL-4B-Instruct</option>
<option value="qwen_4b_unredacted">Qwen3.5-4B-Unredacted-MAX</option>
<option value="qwen_4b">Qwen3.5-4B</option>
<option value="qwen_2b">Qwen3.5-2B</option>
<option value="lfm_450">LFM2.5-VL-450M (LiquidAI)</option>
<option value="gemma4_e2b">Gemma4-E2B-it (Google)</option>
<option value="lfm_16">LFM2.5-VL-1.6B (LiquidAI)</option>
<option value="qwen_unredacted">Qwen3.5-2B-Unredacted-MAX</option>
<option value="qwen25_vl_3b">Qwen2.5-VL-3B-Instruct</option>
</select>
</div>
<div id="modelInfoBox" class="model-info-box"
style="background:rgba(255,150,50,0.07);border:1px solid rgba(255,150,50,0.3);">
<span class="model-badge qvl2b">QWEN3-VL · 2B</span><br><br>
Qwen3-VL-2B-Instruct — dedicated vision-language model by Alibaba Cloud.
Strong spatial grounding, OCR &amp; instruction-following.
</div>
<div style="flex:1;"></div>
</div>
<div class="port out" id="port-model-out" style="top:50%;transform:translateY(-50%);"></div>
</div>
<!-- ─── ID 03 : Task Config ─── -->
<div class="node fixed-height" id="node-task" style="left:425px; top:52px;">
<div class="port in" id="port-task-in" style="top:50%;transform:translateY(-50%);"></div>
<div class="node-header">
<span><span class="status-dot" id="dot-task"></span>Task Config</span>
<span class="id">ID: 03</span>
</div>
<div class="node-body">
<div>
<label>Task Category</label>
<select id="categorySelect">
<option value="Query">Query</option>
<option value="Caption">Caption</option>
<option value="Point">Point</option>
<option value="Detect">Detect</option>
</select>
</div>
<div>
<label>Prompt Directive</label>
<textarea id="promptInput" rows="4"
placeholder="e.g., Count the total number of boats and describe the environment."></textarea>
</div>
<button class="run-btn" id="runBtn">
<span>Execute</span>
<span class="loader" id="btnLoader"></span>
</button>
</div>
<div class="port out" id="port-task-out" style="top:50%;transform:translateY(-50%);"></div>
</div>
<!-- ─── ID 04 : Output Stream ─── -->
<div class="node fixed-height" id="node-out" style="left:810px; top:52px;">
<div class="port in" id="port-out-in" style="top:50%;transform:translateY(-50%);"></div>
<div class="node-header">
<span><span class="status-dot" id="dot-out"></span>Output Stream</span>
<span class="id">ID: 04</span>
</div>
<div class="output-node-body">
<div class="output-header-row">
<label style="margin-bottom:0;">Streamed Result</label>
<button class="icon-btn" id="copyBtn" title="Copy result to clipboard">
<svg width="11" height="11" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2.2" stroke-linecap="round" stroke-linejoin="round">
<rect x="9" y="9" width="13" height="13" rx="2" ry="2"/>
<path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/>
</svg>
COPY
</button>
</div>
<div class="output-box" id="outputBox">Results will stream here...</div>
</div>
</div>
<!-- ─── ID 05 : Grounding Visualiser ─── -->
<div class="node fixed-height" id="node-gnd" style="left:810px; top:412px;">
<div class="port in" id="port-gnd-in" style="top:50%;transform:translateY(-50%);"></div>
<div class="node-header">
<span><span class="status-dot" id="dot-gnd"></span>View Grounding</span>
<span class="id">ID: 05</span>
</div>
<div class="ground-node-body">
<div class="ground-header-row">
<label style="margin-bottom:0;">Point / Detect Overlay</label>
<a class="icon-btn teal" id="downloadBtn" title="Download overlay image" style="display:none;">
<svg width="11" height="11" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2.2" stroke-linecap="round" stroke-linejoin="round">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="7 10 12 15 17 10"/>
<line x1="12" y1="15" x2="12" y2="3"/>
</svg>
SAVE
</a>
</div>
<div class="ground-canvas-wrap">
<canvas id="groundCanvas"></canvas>
<div class="ground-placeholder" id="groundPlaceholder">
Active for Point / Detect tasks.<br>Run inference to visualise.
</div>
</div>
</div>
</div>
<div class="canvas-footer"></div>
</div>
<script>
// ══════════════════════════════════════════════
// WIRE DRAWING
// ══════════════════════════════════════════════
const canvasEl = document.getElementById('canvas');
function portCenter(id) {
const el = document.getElementById(id);
if (!el) return {x:0,y:0};
const er = el.getBoundingClientRect(), cr = canvasEl.getBoundingClientRect();
return { x: er.left + er.width/2 - cr.left, y: er.top + er.height/2 - cr.top };
}
/**
 * Build the SVG path data for a horizontal "S"-shaped cubic curve.
 *
 * Both control points sit at the same height as their endpoint and are pushed
 * inwards by 55% of the horizontal distance between the two points.
 *
 * @param {{x:number, y:number}} p1  Start point.
 * @param {{x:number, y:number}} p2  End point.
 * @returns {string} Value suitable for a <path> element's `d` attribute.
 */
function bezier(p1, p2) {
  const reach = Math.abs(p2.x - p1.x) * 0.55;
  const moveTo = `M ${p1.x} ${p1.y}`;
  const curveTo = `C ${p1.x + reach} ${p1.y}, ${p2.x - reach} ${p2.y}, ${p2.x} ${p2.y}`;
  return `${moveTo} ${curveTo}`;
}
/**
 * Recompute the path of every wire from the current position of the two
 * ports it connects. Wires whose <path> element is missing are skipped.
 */
function updateWires() {
  // wire <path> id -> [source port id, destination port id]
  const connections = {
    'wire-img-task':   ['port-img-out',   'port-task-in'],
    'wire-model-task': ['port-model-out', 'port-task-in'],
    'wire-task-out':   ['port-task-out',  'port-out-in'],
    'wire-task-gnd':   ['port-task-out',  'port-gnd-in'],
  };
  Object.entries(connections).forEach(([wireId, [fromPort, toPort]]) => {
    const path = document.getElementById(wireId);
    if (!path) return;
    path.setAttribute('d', bezier(portCenter(fromPort), portCenter(toPort)));
  });
}
// ══════════════════════════════════════════════
// DRAGGING
// ══════════════════════════════════════════════
document.querySelectorAll('.node').forEach(node => {
const header = node.querySelector('.node-header');
let drag = false, sx, sy, il, it;
header.addEventListener('mousedown', e => {
drag=true; sx=e.clientX; sy=e.clientY;
il=parseInt(node.style.left)||0; it=parseInt(node.style.top)||0;
node.style.zIndex=100; e.preventDefault();
});
document.addEventListener('mousemove', e => {
if (!drag) return;
node.style.left=`${il+e.clientX-sx}px`; node.style.top=`${it+e.clientY-sy}px`;
updateWires();
});
document.addEventListener('mouseup', () => { if(drag){drag=false;node.style.zIndex=10;} });
});
window.addEventListener('resize', updateWires);
window.addEventListener('scroll', updateWires);
document.addEventListener('scroll', updateWires, true);
requestAnimationFrame(updateWires);
// ══════════════════════════════════════════════
// FILE UPLOAD + CLEAR
// ══════════════════════════════════════════════
let currentFile = null;
const dropZone = document.getElementById('dropZone');
const fileInput = document.getElementById('fileInput');
const previewWrap = document.getElementById('previewWrap');
const imgPreview = document.getElementById('imgPreview');
const clearBtn = document.getElementById('clearBtn');
const imgChip = document.getElementById('imgChip');
const chipName = document.getElementById('chipName');
const chipSize = document.getElementById('chipSize');
const dotImg = document.getElementById('dot-img');
/**
 * Render a byte count as a short human-readable size.
 *
 * @param {number} b  Size in bytes.
 * @returns {string} Plain bytes below 1024 ("512 B"), otherwise one decimal
 *   place in KB (below 1048576) or MB.
 */
function formatBytes(b) {
  const KIB = 1024, MIB = 1048576;
  if (b < KIB) return `${b} B`;
  const [divisor, unit] = b < MIB ? [KIB, 'KB'] : [MIB, 'MB'];
  return `${(b / divisor).toFixed(1)} ${unit}`;
}
/**
 * Attach an image file to the Input Image node.
 *
 * Anything that is missing or whose MIME type does not start with "image/"
 * is ignored. Otherwise the file becomes `currentFile`, the preview and the
 * name/size chip are shown, the drop zone is hidden and the node's status
 * dot is switched on.
 *
 * @param {File} file  File picked or dropped by the user.
 */
function handleFile(file) {
  const isImage = Boolean(file) && file.type.startsWith('image/');
  if (!isImage) return;

  currentFile = file;
  imgPreview.src = URL.createObjectURL(file);

  previewWrap.classList.add('visible');
  dropZone.style.display = 'none';

  chipName.textContent = file.name;
  chipSize.textContent = formatBytes(file.size);
  imgChip.classList.add('visible');

  dotImg.classList.add('active');
  // The node changes height once the preview appears; redraw on next frame.
  requestAnimationFrame(updateWires);
}
/**
 * Detach the current image and return the Input Image node to its empty
 * state: preview and chip hidden, drop zone visible, file input reset and
 * status dot switched off.
 *
 * The blob URL backing the preview is revoked here; without that every
 * uploaded image would stay referenced until the page is closed.
 */
function clearImage() {
  // Read the raw attribute (not the resolved .src) so only a real blob: URL
  // created for the preview is ever revoked.
  const staleUrl = imgPreview.getAttribute('src') || '';

  currentFile = null;
  imgPreview.src = '';
  if (staleUrl.startsWith('blob:')) URL.revokeObjectURL(staleUrl);

  previewWrap.classList.remove('visible');
  dropZone.style.display = '';
  imgChip.classList.remove('visible');
  chipName.textContent = '—';
  chipSize.textContent = '';
  // Resetting the value lets the same file be selected again later.
  fileInput.value = '';
  dotImg.classList.remove('active');
  requestAnimationFrame(updateWires);
}
// Clicking the drop zone opens the native file picker.
dropZone.onclick = () => { fileInput.click(); };

// A file chosen through the picker goes through the same path as a drop.
fileInput.onchange = (event) => {
  handleFile(event.target.files[0]);
};

// The clear button sits on top of the preview; stop the click from reaching
// anything underneath it.
clearBtn.onclick = (event) => {
  event.stopPropagation();
  clearImage();
};

// Drag-and-drop: highlight the zone while a drag hovers over it and accept
// the first dropped file.
dropZone.ondragover = (event) => {
  event.preventDefault();
  dropZone.style.borderColor = 'var(--accent)';
};
dropZone.ondragleave = () => {
  dropZone.style.borderColor = '';
};
dropZone.ondrop = (event) => {
  event.preventDefault();
  dropZone.style.borderColor = '';
  const dropped = event.dataTransfer.files;
  if (dropped.length) handleFile(dropped[0]);
};
// ══════════════════════════════════════════════
// MODEL SELECTOR
// ══════════════════════════════════════════════
const modelSelect = document.getElementById('modelSelect');
const modelInfoBox = document.getElementById('modelInfoBox');
const dotModel = document.getElementById('dot-model');
dotModel.classList.add('active');
// Presentation data for the info box under the model <select>, keyed by the
// <option> value. `html` is injected as markup (so "&" is written as &amp;),
// `bg` and `border` are the colours applied to the box itself.
// The separator dots and dashes below are real "·" / "—" characters; they had
// been replaced by mis-decoded byte sequences.
const MODEL_INFO = {
  qwen_vl_2b: {
    html: `<span class="model-badge qvl2b">QWEN3-VL · 2B</span><br><br>
Qwen3-VL-2B-Instruct — dedicated vision-language model by Alibaba Cloud.
Strong spatial grounding, OCR &amp; instruction-following.`,
    bg: 'rgba(255,150,50,0.07)', border: 'rgba(255,150,50,0.30)',
  },
  qwen_vl_4b: {
    html: `<span class="model-badge qvl4b">QWEN3-VL · 4B</span><br><br>
Qwen3-VL-4B-Instruct — enhanced vision-language model by Alibaba Cloud.
Superior spatial grounding, richer OCR &amp; stronger multi-step reasoning.`,
    bg: 'rgba(255,100,80,0.07)', border: 'rgba(255,100,80,0.25)',
  },
  qwen_4b_unredacted: {
    html: `<span class="model-badge q4bunred">QWEN 3.5 · 4B UNREDACTED MAX</span><br><br>
Qwen3.5-4B-Unredacted-MAX by prithivMLmods. Uncensored fine-tune of Qwen3.5-4B
with extended instruction-following &amp; unrestricted reasoning.`,
    bg: 'rgba(255,80,80,0.07)', border: 'rgba(255,80,80,0.30)',
  },
  qwen_4b: {
    html: `<span class="model-badge q4b">QWEN 3.5 · 4B</span><br><br>
Qwen3.5 4B multimodal model by Alibaba Cloud.
Enhanced capacity — richer reasoning &amp; better instruction following.`,
    bg: 'rgba(255,200,80,0.07)', border: 'rgba(255,200,80,0.30)',
  },
  qwen_2b: {
    html: `<span class="model-badge q2b">QWEN 3.5 · 2B</span><br><br>
Qwen3.5 2B multimodal model by Alibaba Cloud.
Lightweight &amp; fast — ideal for quick Query, Caption, Point &amp; Detect tasks.`,
    bg: 'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.25)',
  },
  lfm_450: {
    html: `<span class="model-badge lfm450">LFM · 450M</span><br><br>
LFM2.5-VL 450M by LiquidAI. Ultra-lightweight edge model
with solid grounding capabilities.`,
    bg: 'rgba(78,205,196,0.07)', border: 'rgba(78,205,196,0.25)',
  },
  gemma4_e2b: {
    html: `<span class="model-badge g4e2b">GEMMA 4 · E2B</span><br><br>
Gemma4-E2B-it by Google DeepMind. Efficient 2B multimodal model
with strong vision-language understanding &amp; instruction-following.`,
    bg: 'rgba(66,197,107,0.07)', border: 'rgba(66,197,107,0.25)',
  },
  lfm_16: {
    html: `<span class="model-badge lfm16">LFM · 1.6B</span><br><br>
LFM2.5-VL 1.6B by LiquidAI. Larger liquid-state model offering
enhanced reasoning &amp; richer visual understanding.`,
    bg: 'rgba(107,203,119,0.07)', border: 'rgba(107,203,119,0.25)',
  },
  qwen_unredacted: {
    html: `<span class="model-badge qunred">QWEN 3.5 · 2B UNREDACTED MAX</span><br><br>
Qwen3.5-2B-Unredacted-MAX by prithivMLmods. Fine-tuned variant of Qwen3.5-2B
with uncensored &amp; extended instruction-following capabilities.`,
    bg: 'rgba(255,80,160,0.07)', border: 'rgba(255,80,160,0.25)',
  },
  qwen25_vl_3b: {
    html: `<span class="model-badge q25vl3b">QWEN 2.5-VL · 3B</span><br><br>
Qwen2.5-VL-3B-Instruct by Alibaba Cloud. Powerful 3B vision-language model
with strong grounding, OCR &amp; multi-task visual reasoning.`,
    bg: 'rgba(80,180,255,0.07)', border: 'rgba(80,180,255,0.25)',
  },
};
// Refresh the info box (text and tint) whenever a different model is picked.
// Values without a MODEL_INFO entry leave the box untouched.
modelSelect.onchange = () => {
  const selected = MODEL_INFO[modelSelect.value];
  if (selected) {
    const { html, bg, border } = selected;
    modelInfoBox.innerHTML = html;
    modelInfoBox.style.background = bg;
    modelInfoBox.style.border = `1px solid ${border}`;
  }
};
// ══════════════════════════════════════════════
// CATEGORY PLACEHOLDER
// ══════════════════════════════════════════════
const categorySelect = document.getElementById('categorySelect');
const promptInput = document.getElementById('promptInput');
const PLACEHOLDERS = {
Query: 'e.g., Count the total number of boats and describe the environment.',
Caption: 'e.g., short | normal | detailed',
Point: 'e.g., The gun held by the person.',
Detect: 'e.g., The headlight of the car.',
};
categorySelect.onchange = e => { promptInput.placeholder = PLACEHOLDERS[e.target.value]||''; };
// ══════════════════════════════════════════════
// ROBUST JSON EXTRACTOR
// ══════════════════════════════════════════════
function extractGroundingJSON(raw) {
// 1. Strip all <think>…</think> blocks (multi-pass)
let text = raw, prev = null;
while (prev !== text) {
prev = text;
text = text.replace(/<think>[\s\S]*?<\/think>/gi, '');
}
// 2. Strip markdown fences, keep inner content
text = text.replace(/```(?:json)?([\s\S]*?)```/gi, '$1');
text = text.replace(/```/g, '').trim();
// Balanced bracket extractor
function extractBalanced(str, startIdx, openCh, closeCh) {
let depth=0, inStr=false, esc=false;
for (let i=startIdx; i<str.length; i++) {
const c=str[i];
if (esc) { esc=false; continue; }
if (c==='\\\\') { esc=true; continue; }
if (c==='"') { inStr=!inStr; continue; }
if (inStr) continue;
if (c===openCh) depth++;
if (c===closeCh) {
depth--;
if (depth===0) {
try { return JSON.parse(str.slice(startIdx, i+1)); } catch(_) { return null; }
}
}
}
return null;
}
// Search from the END β€” models emit JSON after reasoning prose
for (let i=text.length-1; i>=0; i--) {
if (text[i]==='[') { const r=extractBalanced(text,i,'[',']'); if(r!==null) return r; }
}
for (let i=text.length-1; i>=0; i--) {
if (text[i]==='{') { const r=extractBalanced(text,i,'{','}'); if(r!==null) return r; }
}
// Fallback: search from start
const fa=text.indexOf('['); if(fa!==-1){const r=extractBalanced(text,fa,'[',']');if(r!==null)return r;}
const fo=text.indexOf('{'); if(fo!==-1){const r=extractBalanced(text,fo,'{','}');if(r!==null)return r;}
try { return JSON.parse(text); } catch(_) {}
return null;
}
// ══════════════════════════════════════════════
// GROUNDING VISUALIZER
// ══════════════════════════════════════════════
const groundCanvas = document.getElementById('groundCanvas');
const groundPlaceholder = document.getElementById('groundPlaceholder');
const gCtx = groundCanvas.getContext('2d');
const downloadBtn = document.getElementById('downloadBtn');
const PALETTE = ['#4ecdc4','#7c6af7','#ff6b6b','#ffd93d','#6bcb77','#ff922b','#cc5de8','#339af0'];
/**
 * Convert a "#rrggbb" colour into a CSS rgba() string.
 *
 * @param {string} hex    Colour as "#" followed by six hex digits.
 * @param {number} alpha  Opacity, copied into the result unchanged.
 * @returns {string} e.g. "rgba(78,205,196,0.2)".
 */
function hexToRgba(hex, alpha) {
  // Channel pairs start at offsets 1, 3 and 5 (offset 0 is the "#").
  const [red, green, blue] = [1, 3, 5].map(
    (start) => parseInt(hex.slice(start, start + 2), 16)
  );
  return `rgba(${red},${green},${blue},${alpha})`;
}
/**
 * Trace a rounded rectangle as the current path of `ctx` (no fill or stroke).
 *
 * The outline runs clockwise from the top edge; each corner is a quadratic
 * curve whose control point is the rectangle's sharp corner.
 *
 * @param {CanvasRenderingContext2D} ctx  Context to trace on.
 * @param {number} x  Left edge.
 * @param {number} y  Top edge.
 * @param {number} w  Width.
 * @param {number} h  Height.
 * @param {number} r  Corner radius.
 */
function roundRect(ctx, x, y, w, h, r) {
  const right = x + w;
  const bottom = y + h;

  ctx.beginPath();
  ctx.moveTo(x + r, y);
  // top edge, top-right corner
  ctx.lineTo(right - r, y);
  ctx.quadraticCurveTo(right, y, right, y + r);
  // right edge, bottom-right corner
  ctx.lineTo(right, bottom - r);
  ctx.quadraticCurveTo(right, bottom, right - r, bottom);
  // bottom edge, bottom-left corner
  ctx.lineTo(x + r, bottom);
  ctx.quadraticCurveTo(x, bottom, x, bottom - r);
  // left edge, top-left corner
  ctx.lineTo(x, y + r);
  ctx.quadraticCurveTo(x, y, x + r, y);
  ctx.closePath();
}
/**
 * Point the SAVE link at a PNG snapshot of the grounding canvas and reveal
 * it. The suggested file name carries the current UTC time down to the
 * second, e.g. "grounding_2025-01-31T12-34-56.png".
 */
function updateDownloadBtn() {
  const snapshot = groundCanvas.toDataURL('image/png');
  // "YYYY-MM-DDTHH:MM:SS" with the colons made file-name safe.
  const stamp = new Date().toISOString().slice(0, 19).replace(/[:.]/g, '-');
  Object.assign(downloadBtn, {
    href: snapshot,
    download: `grounding_${stamp}.png`,
  });
  downloadBtn.style.display = 'flex';
}
/**
 * Paint the image and the model's grounding annotations onto the canvas.
 *
 * The JSON found in `rawText` may be a single annotation or an array of
 * them. Each annotation is drawn as
 *   - a labelled box, when it has a 4-number `bbox_2d` / `bbox` or is itself
 *     a 4-number array, or
 *   - a labelled dot, when it has a 2-number `point_2d` / `point` or is
 *     itself a 2-number array.
 * Coordinates that are all <= 1 are treated as fractions of the image size,
 * anything else as pixels. A bare `[x, y]` or `[x1, y1, x2, y2]` result is
 * one annotation (it used to be iterated number by number and drew nothing),
 * and annotations with non-numeric coordinates are skipped.
 *
 * Drawing happens asynchronously once the image has loaded; the SAVE link is
 * refreshed afterwards.
 *
 * @param {string} imgSrc   URL of the image to annotate.
 * @param {string} rawText  Complete model output.
 */
function drawGrounding(imgSrc, rawText) {
  const parsed = extractGroundingJSON(rawText);
  if (!parsed) { console.warn('Grounding: no JSON found in:', rawText.slice(0,200)); return; }

  const isNumber = n => typeof n === 'number';
  const isBareCoords = Array.isArray(parsed)
    && (parsed.length === 2 || parsed.length === 4)
    && parsed.every(isNumber);
  const items = (Array.isArray(parsed) && !isBareCoords) ? parsed : [parsed];

  const img = new Image();
  img.onload = () => {
    const W = img.naturalWidth, H = img.naturalHeight;
    // The canvas takes the image's native resolution; CSS scales it to fit.
    groundCanvas.width = W; groundCanvas.height = H;
    gCtx.drawImage(img, 0, 0);
    groundPlaceholder.style.display = 'none';
    // Stroke width and font size grow with the image so they stay legible
    // after the canvas is scaled down.
    const lw = Math.max(2, W/200), fs = Math.max(12, W/40);
    gCtx.lineWidth = lw;
    gCtx.font = `bold ${fs}px JetBrains Mono, monospace`;

    items.forEach((item, i) => {
      const col = PALETTE[i % PALETTE.length];

      // ── Bounding box ─────────────────────────────
      let bbox = null;
      if (Array.isArray(item?.bbox_2d) && item.bbox_2d.length === 4) bbox = item.bbox_2d;
      else if (Array.isArray(item?.bbox) && item.bbox.length === 4) bbox = item.bbox;
      else if (Array.isArray(item) && item.length === 4 && item.every(isNumber)) bbox = item;
      if (bbox) {
        let [x1, y1, x2, y2] = bbox.map(Number);
        if (![x1, y1, x2, y2].every(Number.isFinite)) return;
        if (x1 <= 1 && y1 <= 1 && x2 <= 1 && y2 <= 1) { x1 *= W; y1 *= H; x2 *= W; y2 *= H; }
        // Accept corners given in either order.
        if (x2 < x1) [x1, x2] = [x2, x1];
        if (y2 < y1) [y1, y2] = [y2, y1];
        const bw = x2 - x1, bh = y2 - y1;
        const lbl = (item?.label ?? `obj ${i+1}`).toString();
        gCtx.fillStyle = hexToRgba(col, 0.20); gCtx.fillRect(x1, y1, bw, bh);
        gCtx.strokeStyle = col; gCtx.lineWidth = lw; gCtx.strokeRect(x1, y1, bw, bh);
        // Label tag sits above the box, clamped to the top of the image.
        const tw = gCtx.measureText(lbl).width, ph = fs * 1.45, pw = tw + 12;
        const lx = x1, ly = Math.max(0, y1 - ph);
        gCtx.fillStyle = col; roundRect(gCtx, lx, ly, pw, ph, 4); gCtx.fill();
        gCtx.fillStyle = '#fff'; gCtx.fillText(lbl, lx + 6, ly + ph * 0.76);
        return;
      }

      // ── Point ────────────────────────────────────
      let pt = null;
      if (Array.isArray(item?.point_2d) && item.point_2d.length === 2) pt = item.point_2d;
      else if (Array.isArray(item?.point) && item.point.length === 2) pt = item.point;
      else if (Array.isArray(item) && item.length === 2 && item.every(isNumber)) pt = item;
      if (pt) {
        let [x, y] = pt.map(Number);
        if (!Number.isFinite(x) || !Number.isFinite(y)) return;
        if (x <= 1 && y <= 1) { x *= W; y *= H; }
        const r = Math.max(8, W/60);
        const lbl = (item?.label ?? `pt ${i+1}`).toString();
        // Soft halo, then the solid dot with a white rim.
        gCtx.beginPath(); gCtx.arc(x, y, r * 1.8, 0, Math.PI * 2);
        gCtx.fillStyle = hexToRgba(col, 0.18); gCtx.fill();
        gCtx.beginPath(); gCtx.arc(x, y, r, 0, Math.PI * 2);
        gCtx.fillStyle = col; gCtx.fill();
        gCtx.strokeStyle = '#fff'; gCtx.lineWidth = Math.max(1.5, lw); gCtx.stroke();
        gCtx.fillStyle = '#fff'; gCtx.fillText(lbl, x + r + 5, y + fs * 0.4);
      }
    });

    // Enable download button after drawing
    updateDownloadBtn();
  };
  img.onerror = () => console.error('Grounding: failed to load image.');
  img.src = imgSrc;
}
// ══════════════════════════════════════════════
// COPY BUTTON
// ══════════════════════════════════════════════
const copyBtn = document.getElementById('copyBtn');
const outputBox = document.getElementById('outputBox');
let copyTimer = null;
function resetCopyBtn() {
copyBtn.classList.remove('copied');
copyBtn.innerHTML = `
<svg width="11" height="11" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2.2" stroke-linecap="round" stroke-linejoin="round">
<rect x="9" y="9" width="13" height="13" rx="2" ry="2"/>
<path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"/>
</svg> COPY`;
}
// Copy the streamed result to the clipboard.
//
// The async Clipboard API is used when the browser exposes it. It is missing
// on pages that are not served from a secure context, where calling it would
// throw before any promise exists, so its presence is checked first. If it is
// missing or rejects, a hidden <textarea> plus execCommand('copy') is used
// instead. Either way the button shows "COPIED" for two seconds on success.
copyBtn.onclick = () => {
  const txt = outputBox.innerText || '';
  if (!txt || txt === 'Results will stream here...') return;

  const showCopied = () => {
    copyBtn.classList.add('copied');
    copyBtn.innerHTML = `
<svg width="11" height="11" viewBox="0 0 24 24" fill="none"
stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round">
<polyline points="20 6 9 17 4 12"/>
</svg> COPIED`;
    clearTimeout(copyTimer);
    copyTimer = setTimeout(resetCopyBtn, 2000);
  };

  const legacyCopy = () => {
    const ta = document.createElement('textarea');
    ta.value = txt;
    ta.style.position = 'fixed';
    ta.style.opacity = '0';
    document.body.appendChild(ta);
    let copied = false;
    try {
      ta.select();
      copied = document.execCommand('copy');
    } catch (_) {
      copied = false;
    } finally {
      // Always remove the helper element, even when the copy fails.
      document.body.removeChild(ta);
    }
    if (copied) showCopied();
  };

  if (navigator.clipboard && typeof navigator.clipboard.writeText === 'function') {
    navigator.clipboard.writeText(txt).then(showCopied, legacyCopy);
  } else {
    legacyCopy();
  }
};
// ══════════════════════════════════════════════
// RUN INFERENCE
// ══════════════════════════════════════════════
const runBtn = document.getElementById('runBtn');
const btnLoader = document.getElementById('btnLoader');
const allWires = ['wire-img-task','wire-model-task','wire-task-out','wire-task-gnd'];
const dotTask = document.getElementById('dot-task');
const dotOut = document.getElementById('dot-out');
const dotGnd = document.getElementById('dot-gnd');
// Execute: send image, category, prompt and model id to /api/run, stream the
// answer into the output box and, for Point / Detect, draw the overlay.
//
// The response is read as a stream of frames separated by a blank line. Each
// frame of interest starts with "data: " and carries either JSON with a
// `chunk` string to append, or the literal [DONE].
//
// Differences from a naive implementation, all deliberate:
//   * the file and category are captured when the button is pressed, so
//     changing them while the answer streams cannot desynchronise the result;
//   * frames after [DONE] are ignored, and a final frame that arrives
//     without its terminating blank line is still processed;
//   * an error response whose body is not JSON still produces a readable
//     message;
//   * only one overlay blob URL is kept alive: the one from the previous run
//     is revoked before a new one is created.
runBtn.onclick = async () => {
  if (!currentFile) { alert('Please upload an image into the Input Node.'); return; }
  const promptStr = promptInput.value.trim();
  if (!promptStr) { alert('Please enter a prompt directive.'); return; }

  const file = currentFile;
  const category = categorySelect.value;

  // ── Reset the UI for a fresh run ──
  runBtn.disabled = true; btnLoader.style.display = 'inline-block';
  outputBox.innerText = ''; outputBox.style.color = '';
  groundPlaceholder.style.display = 'flex';
  gCtx.clearRect(0, 0, groundCanvas.width, groundCanvas.height);
  downloadBtn.style.display = 'none';
  dotTask.classList.add('active');
  dotOut.classList.remove('active'); dotGnd.classList.remove('active');
  allWires.forEach(id => document.getElementById(id)?.classList.add('active'));
  resetCopyBtn();

  const formData = new FormData();
  formData.append('image', file);
  formData.append('category', category);
  formData.append('prompt', promptStr);
  formData.append('model_id', modelSelect.value);

  // Frame separator = two line feeds. Built from char codes so this script
  // needs no backslash escape inside the Python string that embeds it.
  const FRAME_SEP = String.fromCharCode(10, 10);
  const DATA_PREFIX = 'data: ';

  let fullText = '';
  let finished = false;   // set once [DONE] has been seen

  const consumeFrame = (frame) => {
    if (finished || !frame.startsWith(DATA_PREFIX)) return;
    const payload = frame.slice(DATA_PREFIX.length);
    if (payload === '[DONE]') { finished = true; return; }
    let data;
    try { data = JSON.parse(payload); } catch (_) { return; }   // skip malformed frames
    if (data && data.chunk) {
      fullText += data.chunk;
      outputBox.innerText = fullText;
      outputBox.scrollTop = outputBox.scrollHeight;
    }
  };

  try {
    const response = await fetch('/api/run', { method: 'POST', body: formData });
    if (!response.ok) {
      let message = 'Execution failed.';
      try {
        const err = await response.json();
        if (err && err.error) message = err.error;
      } catch (_) {
        // Body was not JSON; keep the generic message.
      }
      throw new Error(message);
    }

    const reader = response.body.getReader(), decoder = new TextDecoder('utf-8');
    let buffer = '';
    while (true) {
      const { value, done } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const frames = buffer.split(FRAME_SEP);
      buffer = frames.pop();   // last piece may be an incomplete frame
      frames.forEach(consumeFrame);
    }
    // Flush the decoder and whatever was left without a trailing separator.
    buffer += decoder.decode();
    if (buffer.trim()) consumeFrame(buffer.trim());

    dotOut.classList.add('active');

    // Grounding overlay for Point / Detect
    if ((category === 'Point' || category === 'Detect') && fullText.trim()) {
      if (extractGroundingJSON(fullText) !== null) {
        // The previous overlay is already painted on the canvas, so its blob
        // URL is no longer needed.
        if (runBtn.dataset.overlayUrl) URL.revokeObjectURL(runBtn.dataset.overlayUrl);
        const overlayUrl = URL.createObjectURL(file);
        runBtn.dataset.overlayUrl = overlayUrl;
        dotGnd.classList.add('active');
        drawGrounding(overlayUrl, fullText);
      } else {
        console.warn('No grounding JSON found in output.');
      }
    }
  } catch (err) {
    outputBox.innerText = `[Error] ${err.message}`; outputBox.style.color = '#ff6b6b';
  } finally {
    runBtn.disabled = false; btnLoader.style.display = 'none';
    dotTask.classList.remove('active');
    allWires.forEach(id => document.getElementById(id)?.classList.remove('active'));
  }
};
</script>
</body>
</html>
"""
if __name__ == "__main__":
    # Launch the app only when this file is run as a script, not when it is
    # imported; show_error=True is passed through to launch().
    app.launch(show_error=True)