"""Vision-language: describe / answer about an image, in PT/EN. Generic loader (AutoModelForImageTextToText) — supports: - Qwen/Qwen3-VL-2B-Instruct (default — light, fast, strong OCR) - Qwen/Qwen2.5-VL-3B-Instruct, openbmb/MiniCPM-V-4.6, etc. Swappable via IRIS_VLM_MODEL. The VLM IS the text generator for speech. """ import os _model = None _aux = None MODEL_ID = os.environ.get("IRIS_VLM_MODEL", "Qwen/Qwen3-VL-2B-Instruct") DOWNSAMPLE = os.environ.get("IRIS_DOWNSAMPLE", "4x") # MiniCPM only (detail for OCR) SYSTEM_PT = ( "Você é os olhos de uma pessoa cega. RESPONDA OBRIGATORIAMENTE EM PORTUGUÊS " "DO BRASIL, em no máximo duas frases curtas, dizendo só o que é relevante e " "útil sobre a cena. Não comece com 'a imagem mostra'. " "Se houver texto importante (rótulo, placa, remédio), leia-o exatamente como está. " "Se houver DINHEIRO (cédulas ou moedas de real), identifique cada valor e diga o TOTAL. " "Se for uma CONTA, boleto ou documento, leia o VALOR TOTAL e a DATA DE VENCIMENTO." ) SYSTEM_EN = ( "You are the eyes of a blind person. ALWAYS REPLY IN ENGLISH, in at most two " "short sentences, saying only what is relevant and useful about the scene. Do " "not start with 'the image shows'. " "If there is important text (label, sign, medicine), read it exactly as written. " "If there is MONEY (banknotes or coins), identify each value and state the TOTAL. " "If it is a BILL or document, read the TOTAL AMOUNT and the DUE DATE." ) def _prompt(lang): """Return (system_prompt, default_question) for the language.""" if lang == "en": return SYSTEM_EN, "What is in front of me?" return SYSTEM_PT, "O que há à minha frente?" 
def _family() -> str:
    """Return the model family key derived from ``MODEL_ID``.

    ``"minicpm"`` when the id contains "minicpm" (case-insensitive); every
    other id is treated as ``"qwen"``.
    """
    return "minicpm" if "minicpm" in MODEL_ID.lower() else "qwen"


def _load():
    """Load the model and its processor on first use and cache them.

    Returns:
        The ``(model, processor)`` pair stored in the module globals
        ``_model`` and ``_aux``.

    Both objects are built into locals and published together, so a failure
    while loading either one never leaves the cache half-initialised (a model
    without its processor).
    """
    global _model, _aux
    if _model is None or _aux is None:
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor

        # MiniCPM checkpoints are loaded with trust_remote_code enabled.
        kw = {"trust_remote_code": True} if _family() == "minicpm" else {}
        # Processor first: it is the cheaper of the two, so a bad MODEL_ID
        # fails before the model weights are placed on the GPU.
        processor = AutoProcessor.from_pretrained(MODEL_ID, **kw)
        model = AutoModelForImageTextToText.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            device_map="cuda:0",
            low_cpu_mem_usage=True,
            **kw,
        ).eval()
        _model, _aux = model, processor
    return _model, _aux


def _to_pil(image):
    """Convert ``image`` to an RGB PIL image no larger than 1024x1024.

    Args:
        image: A file path (``str`` or ``os.PathLike``), a ``PIL.Image.Image``,
            or an array accepted by ``PIL.Image.fromarray``.

    Returns:
        A new RGB ``PIL.Image.Image``, downscaled in place by ``thumbnail``
        when it exceeds the bound.
    """
    import os

    from PIL import Image

    if isinstance(image, (str, os.PathLike)):
        # Close the file deterministically; convert() yields a loaded copy.
        with Image.open(image) as opened:
            rgb = opened.convert("RGB")
    else:
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)  # numpy frame from the webcam
        rgb = image.convert("RGB")
    rgb.thumbnail((1024, 1024))  # fewer vision tokens -> faster; still good OCR
    return rgb


from .gpu import gpu


@gpu(duration=60)
def describe(image, question: str = "", lang: str = "pt", system: str = None) -> str:
    """Describe ``image`` or answer ``question`` about it.

    Args:
        image: Anything accepted by ``_to_pil`` (path, PIL image or array).
        question: User question; when empty or whitespace-only, the default
            question for ``lang`` is used.
        lang: Language selector passed to ``_prompt``.
        system: Optional system prompt that replaces the language default
            when truthy.

    Returns:
        The decoded model reply with special tokens skipped and surrounding
        whitespace stripped.

    NOTE(review): ``gpu`` comes from the project-local ``.gpu`` module, which
    is not visible here; its exact semantics should be confirmed there.
    """
    import torch

    image = _to_pil(image)
    sys_prompt, default_q = _prompt(lang)
    if system:
        sys_prompt = system
    user = (question or "").strip() or default_q

    model, aux = _load()
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": user},
        ]},
    ]

    # Only the MiniCPM family receives the extra slicing/downsample options.
    tmpl_kw, gen_kw = {}, {}
    if _family() == "minicpm":
        tmpl_kw = {"downsample_mode": DOWNSAMPLE, "max_slice_nums": 36}
        gen_kw = {"downsample_mode": DOWNSAMPLE}

    inputs = aux.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
        **tmpl_kw,
    ).to(model.device)

    with torch.no_grad():
        generated = model.generate(**inputs, max_new_tokens=96, do_sample=False, **gen_kw)

    # Drop the leading prompt-length tokens from each output sequence so only
    # the newly generated text is decoded.
    trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, generated)]
    return aux.batch_decode(trimmed, skip_special_tokens=True)[0].strip()