iris / core /vlm.py
Marcus Ramalho
Iris: hands-free live mode, money/bill reading, accessible UI, Qwen3-VL-2B
df6b3ac
"""Vision-language: describe / answer about an image, in PT/EN.
Generic loader (AutoModelForImageTextToText) — supports:
- Qwen/Qwen3-VL-2B-Instruct (default — light, fast, strong OCR)
- Qwen/Qwen2.5-VL-3B-Instruct, openbmb/MiniCPM-V-4.6, etc.
Swappable via IRIS_VLM_MODEL. The VLM IS the text generator for speech.
"""
import os
# Lazily populated cache: _load() fills both names on first use and returns
# the same pair on every later call.
_model = None  # AutoModelForImageTextToText instance once loaded
_aux = None  # matching AutoProcessor once loaded
# Hugging Face model id; override with the IRIS_VLM_MODEL environment variable.
MODEL_ID = os.environ.get("IRIS_VLM_MODEL", "Qwen/Qwen3-VL-2B-Instruct")
# Forwarded as `downsample_mode` by describe() only when the MiniCPM family is
# selected; override with the IRIS_DOWNSAMPLE environment variable.
DOWNSAMPLE = os.environ.get("IRIS_DOWNSAMPLE", "4x") # MiniCPM only (detail for OCR)
# Default system prompt for Brazilian-Portuguese replies. It asks for at most
# two short sentences, verbatim reading of important text, per-item values plus
# the total for money, and total amount plus due date for bills/documents.
# This text is sent to the model at runtime, so edits change model behaviour.
SYSTEM_PT = (
"Você é os olhos de uma pessoa cega. RESPONDA OBRIGATORIAMENTE EM PORTUGUÊS "
"DO BRASIL, em no máximo duas frases curtas, dizendo só o que é relevante e "
"útil sobre a cena. Não comece com 'a imagem mostra'. "
"Se houver texto importante (rótulo, placa, remédio), leia-o exatamente como está. "
"Se houver DINHEIRO (cédulas ou moedas de real), identifique cada valor e diga o TOTAL. "
"Se for uma CONTA, boleto ou documento, leia o VALOR TOTAL e a DATA DE VENCIMENTO."
)
# Default system prompt for English replies, structured like SYSTEM_PT: at most
# two short sentences, verbatim reading of important text, per-item values plus
# the total for money, and total amount plus due date for bills/documents.
# This text is sent to the model at runtime, so edits change model behaviour.
SYSTEM_EN = (
"You are the eyes of a blind person. ALWAYS REPLY IN ENGLISH, in at most two "
"short sentences, saying only what is relevant and useful about the scene. Do "
"not start with 'the image shows'. "
"If there is important text (label, sign, medicine), read it exactly as written. "
"If there is MONEY (banknotes or coins), identify each value and state the TOTAL. "
"If it is a BILL or document, read the TOTAL AMOUNT and the DUE DATE."
)
def _prompt(lang):
    """Return (system_prompt, default_question) for the language.

    ``lang`` is matched on its primary subtag, ignoring case, surrounding
    whitespace and any region suffix, so "en", "EN", "en-US" and "en_GB" all
    select English. Anything else (including ``None`` or "") falls back to
    Brazilian Portuguese.
    """
    # Normalise "en_GB" / " EN-us " style tags down to the bare language code;
    # an exact == "en" check silently routed those to the Portuguese prompt.
    primary = str(lang or "").strip().lower().replace("_", "-").split("-", 1)[0]
    if primary == "en":
        return SYSTEM_EN, "What is in front of me?"
    return SYSTEM_PT, "O que há à minha frente?"
def _family() -> str:
    """Classify MODEL_ID as "minicpm" or "qwen".

    Any id containing "minicpm" (case-insensitive) is MiniCPM; every other id
    is treated as the Qwen family.
    """
    if "minicpm" in MODEL_ID.lower():
        return "minicpm"
    return "qwen"
def _load():
    """Load the VLM and its processor once and return ``(model, processor)``.

    The first call imports torch/transformers and loads ``MODEL_ID`` through
    ``from_pretrained`` as float16 on ``cuda:0`` in eval mode; later calls
    return the cached pair. MiniCPM checkpoints are loaded with
    ``trust_remote_code=True``.

    Raises:
        Whatever ``from_pretrained`` raises. On failure the cache is left
        untouched, so the next call retries from scratch.
    """
    global _model, _aux
    if _model is None or _aux is None:
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor

        kw = {"trust_remote_code": True} if _family() == "minicpm" else {}
        # Build both objects in locals and publish them together. Previously
        # _model was assigned first, so a processor failure left the cache
        # half-initialised and every later call returned (model, None).
        # The processor goes first: it is cheap and fails before any GPU
        # memory is committed to the weights.
        processor = AutoProcessor.from_pretrained(MODEL_ID, **kw)
        model = AutoModelForImageTextToText.from_pretrained(
            MODEL_ID, torch_dtype=torch.float16, device_map="cuda:0",
            low_cpu_mem_usage=True, **kw,
        ).eval()
        _model, _aux = model, processor
    return _model, _aux
def _to_pil(image):
    """Normalise *image* to an RGB ``PIL.Image`` that fits within 1024x1024.

    Args:
        image: A filesystem path (``str`` or ``os.PathLike``), a
            ``PIL.Image.Image``, or anything ``Image.fromarray`` accepts
            (e.g. a numpy frame from the webcam).

    Returns:
        The RGB image produced by ``convert("RGB")``, shrunk in place by
        ``thumbnail`` (aspect ratio preserved, never upscaled).
    """
    from PIL import Image

    if isinstance(image, (str, os.PathLike)):
        # pathlib.Path used to fall through to fromarray() and crash.
        # The context manager releases the file handle once the pixels
        # have been decoded by convert().
        with Image.open(image) as opened:
            rgb = opened.convert("RGB")
    else:
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)  # numpy frame from the webcam
        rgb = image.convert("RGB")
    rgb.thumbnail((1024, 1024))  # fewer vision tokens -> faster; still good OCR
    return rgb
from .gpu import gpu
@gpu(duration=60)
def describe(image, question: str = "", lang: str = "pt", system: str = None) -> str:
    """Ask the VLM about *image* and return its reply as plain text.

    Args:
        image: Path, PIL image or array; normalised by ``_to_pil``.
        question: What to ask. Blank or ``None`` falls back to the default
            question that ``_prompt`` returns for *lang*.
        lang: Language code forwarded to ``_prompt`` to pick the prompts.
        system: Optional system prompt; when truthy it replaces the default.

    Returns:
        The decoded continuation (prompt tokens removed), whitespace-stripped.
    """
    import torch

    picture = _to_pil(image)
    default_system, default_question = _prompt(lang)
    system_text = system if system else default_system
    user_text = (question or "").strip() or default_question
    model, processor = _load()

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": picture},
            {"type": "text", "text": user_text},
        ],
    }
    conversation = [{"role": "system", "content": system_text}, user_turn]

    # Only the MiniCPM family receives the extra slicing/downsample options.
    is_minicpm = _family() == "minicpm"
    template_extra = (
        {"downsample_mode": DOWNSAMPLE, "max_slice_nums": 36} if is_minicpm else {}
    )
    generate_extra = {"downsample_mode": DOWNSAMPLE} if is_minicpm else {}

    batch = processor.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
        **template_extra,
    )
    batch = batch.to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **batch, max_new_tokens=96, do_sample=False, **generate_extra
        )

    # Each output row starts with its prompt; keep only what follows it.
    continuations = []
    for prompt_ids, full_ids in zip(batch.input_ids, output_ids):
        continuations.append(full_ids[len(prompt_ids):])

    replies = processor.batch_decode(continuations, skip_special_tokens=True)
    return replies[0].strip()