"""Modal backend for the third-eye app.

Defines three GPU-backed Modal functions on the ``third-eye-backend`` app:

* ``describe_scene``   - answer a question about an image with MiniCPM-V-2.
* ``speak``            - synthesize speech (WAV bytes) with VoxCPM.
* ``transcribe_audio`` - speech-to-text via the local ``cohere_stt`` module.

The ``@app.local_entrypoint`` functions at the bottom are manual smoke tests
and benchmarks that call those remote functions.
"""
from __future__ import annotations

import io

import modal

APP_NAME = "third-eye-backend"
VISION_MODEL_ID = "openbmb/MiniCPM-V-2"
TTS_MODEL_ID = "openbmb/VoxCPM2"

# Upper bound on the number of characters handed to the TTS model per call.
MAX_TTS_CHARS = 500

app = modal.App(APP_NAME)

# Shared volume mounted at /cache in every function; the Hugging Face cache
# environment variables below point into it.
model_cache = modal.Volume.from_name("third-eye-model-cache", create_if_missing=True)
cache_mount = {"/cache": model_cache}
cache_env = {
    "HF_HOME": "/cache/huggingface",
    "TRANSFORMERS_CACHE": "/cache/huggingface",
}

vision_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "torch==2.1.2",
        "torchvision==0.16.2",
        "transformers==4.36.2",
        "accelerate>=0.25",
        "sentencepiece>=0.1.99",
        "timm==0.9.10",
        "pillow>=10",
        "peft==0.9.0",
        "numpy<2",
    )
    .env(cache_env)
)

tts_image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.8.1-cudnn-runtime-ubuntu22.04",
        add_python="3.11",
    )
    .apt_install("ffmpeg", "libsox-dev", "build-essential")
    .pip_install(
        "torch>=2.5",
        "voxcpm>=0.1.0",
        "misaki[zh]>=0.9",
        "soundfile>=0.12",
        "numpy>=1.26",
    )
    .env({**cache_env, "TORCHDYNAMO_DISABLE": "1"})
)

stt_image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.8.1-cudnn-runtime-ubuntu22.04",
        add_python="3.11",
    )
    .pip_install(
        "torch>=2.5",
        "transformers>=5.4",
        "accelerate>=1.0",
        "librosa>=0.10",
        "sentencepiece",
        "protobuf",
        "soundfile>=0.12",
    )
    .add_local_file("cohere_stt.py", "/root/cohere_stt.py", copy=True)
    .env(cache_env)
)

# Per-process caches so each model is loaded at most once per container.
_vision_model = None
_vision_tokenizer = None
_tts_model = None


def _read_file_bytes(path: str) -> bytes:
    """Return the full contents of *path*, closing the file handle promptly."""
    with open(path, "rb") as handle:
        return handle.read()


def _load_vision():
    """Load the vision model and tokenizer on first use and cache them.

    Returns:
        The ``(model, tokenizer)`` pair held in the module-level globals.
    """
    global _vision_model, _vision_tokenizer
    if _vision_model is None:
        import torch
        from transformers import AutoModel, AutoTokenizer

        # The MiniCPM-V-2 model card recommends bfloat16; it is more numerically
        # stable than float16 and reduces OCR drift on small text.
        _vision_model = (
            AutoModel.from_pretrained(
                VISION_MODEL_ID,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
            )
            .to(device="cuda", dtype=torch.bfloat16)
            .eval()
        )
        _vision_tokenizer = AutoTokenizer.from_pretrained(
            VISION_MODEL_ID,
            trust_remote_code=True,
        )
        param = next(_vision_model.parameters())
        print(
            f"[third-eye VISION] loaded {VISION_MODEL_ID} "
            f"| device={param.device} | dtype={param.dtype}",
            flush=True,
        )
    return _vision_model, _vision_tokenizer


def _chat_once(model, tokenizer, image, prompt: str) -> str:
    """Run a single-turn ``model.chat`` call and return the stripped answer."""
    # Low temperature (0.2) for faithful, repeatable output while avoiding
    # the refusals that greedy (sampling=False) sometimes triggers on
    # MiniCPM-V-2. Keeps hallucination low (no "$20" for an "$18" item).
    answer, _, _ = model.chat(
        image=image,
        msgs=[{"role": "user", "content": prompt}],
        context=None,
        tokenizer=tokenizer,
        sampling=True,
        temperature=0.2,
    )
    return answer.strip()


def stitch_overlapping_text(parts: list[str], max_overlap_words: int = 12) -> str:
    """Join OCR results from overlapping image bands, removing the duplicated region.

    For each new part, finds the longest run of words (at most
    ``max_overlap_words``) that is both a suffix of the running text and a
    prefix of the new part, compared case-insensitively, and drops it from the
    new part before appending. Empty and whitespace-only parts are ignored.

    Pure function - unit tested.

    Args:
        parts: Text fragments in reading order.
        max_overlap_words: Longest overlap, in words, that is searched for.

    Returns:
        The stitched text with words separated by single spaces, or ``""``
        when no non-blank part is given.
    """
    parts = [p.strip() for p in parts if p and p.strip()]
    if not parts:
        return ""

    words = parts[0].split()
    # Lowercased mirror of `words`, maintained incrementally so each candidate
    # overlap length is a plain slice comparison instead of re-lowercasing.
    lowered = [w.lower() for w in words]

    for nxt in parts[1:]:
        next_words = nxt.split()
        next_lowered = [w.lower() for w in next_words]
        limit = min(len(words), len(next_words), max_overlap_words)

        overlap = 0
        # Longest candidate first, so the first match is the longest overlap.
        for k in range(limit, 0, -1):
            if lowered[-k:] == next_lowered[:k]:
                overlap = k
                break

        words += next_words[overlap:]
        lowered += next_lowered[overlap:]

    return " ".join(words)


@app.function(
    gpu="A10G",
    image=vision_image,
    timeout=300,
    volumes=cache_mount,
)
def describe_scene(
    image_bytes: bytes, question: str, lang: str = "en", tile: bool = False
) -> str:
    """Answer *question* about an image using the vision model.

    Args:
        image_bytes: Encoded image data readable by ``PIL.Image.open``.
        question: Prompt for the model; when blank, a default
            "describe everything" prompt is used.
        lang: When ``"zh"``, " Answer in Chinese." is appended to the prompt.
        tile: When true, the image is split into two overlapping horizontal
            bands, each band is queried separately, and the answers are
            merged with ``stitch_overlapping_text``.

    Returns:
        The model's answer (stitched across bands when ``tile`` is true).
    """
    import time

    from PIL import Image

    model, tokenizer = _load_vision()
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    prompt = question.strip() or "Describe everything visible for a blind user."
    if lang == "zh":
        prompt += " Answer in Chinese."

    start = time.time()
    if not tile:
        answer = _chat_once(model, tokenizer, image, prompt)
        print(f"[third-eye VISION] chat: {time.time() - start:.2f}s", flush=True)
        return answer

    # Tiled OCR for verbatim Read Text mode: small text on a full image exceeds a
    # 2.8B VLM's OCR resolution (it merged "MANGO LASSI" -> "MANGOLAISSI"). Splitting
    # into overlapping top/bottom bands enlarges the relative text per call; the
    # overlap is stitched away. Automatic - no box-drawing, which a blind user can't do.
    w, h = image.size
    bands = [(0, 0, w, int(h * 0.55)), (0, int(h * 0.45), w, h)]
    parts = [_chat_once(model, tokenizer, image.crop(box), prompt) for box in bands]
    answer = stitch_overlapping_text(parts)
    print(
        f"[third-eye VISION] tiled chat ({len(bands)} bands): "
        f"{time.time() - start:.2f}s",
        flush=True,
    )
    return answer


def _load_tts():
    """Load the TTS model on first use and cache it in a module global."""
    global _tts_model
    if _tts_model is None:
        from voxcpm import VoxCPM

        _tts_model = VoxCPM.from_pretrained(TTS_MODEL_ID, load_denoiser=False)
    return _tts_model


@app.function(
    gpu="A10G",
    image=tts_image,
    timeout=300,
    volumes=cache_mount,
)
def speak(text: str, lang: str = "en") -> bytes:
    """Synthesize *text* to speech and return it as WAV-encoded bytes.

    Args:
        text: Text to speak. It is stripped and truncated to
            ``MAX_TTS_CHARS`` characters before synthesis.
        lang: Accepted for interface symmetry; not used by the synthesis call.

    Returns:
        The generated audio serialized as a WAV file.

    Raises:
        ValueError: If *text* is empty or whitespace-only.
    """
    import numpy as np
    import soundfile as sf

    if not text.strip():
        raise ValueError("Cannot synthesize empty text.")
    text = text.strip()[:MAX_TTS_CHARS]

    model = _load_tts()
    waveform = model.generate(
        text=text,
        cfg_value=2.0,
        inference_timesteps=10,
    )

    output = io.BytesIO()
    sf.write(
        output,
        np.asarray(waveform, dtype=np.float32),
        model.tts_model.sample_rate,
        format="WAV",
    )
    return output.getvalue()


@app.function(
    gpu="A10G",
    image=stt_image,
    timeout=300,
    volumes=cache_mount,
    secrets=[modal.Secret.from_name("third-eye-hf")],
)
def transcribe_audio(audio_bytes: bytes, language: str = "en") -> str:
    """Transcribe audio by delegating to ``cohere_stt.transcribe_wav_bytes``.

    ``cohere_stt`` is the local module copied into the STT image; it is
    imported lazily because it only exists inside that container.
    """
    from cohere_stt import transcribe_wav_bytes

    return transcribe_wav_bytes(audio_bytes, language)


@app.local_entrypoint()
def smoke_test(image_path: str = "assets/sample_menu.jpg"):
    """Describe a sample image, speak the answer, and save it to ``out.wav``."""
    image_bytes = _read_file_bytes(image_path)
    answer = describe_scene.remote(
        image_bytes,
        "Read the menu and summarize the available items.",
        "en",
    )
    print(answer)

    audio = speak.remote(answer, "en")
    with open("out.wav", "wb") as output:
        output.write(audio)
    print("Saved out.wav")


@app.local_entrypoint()
def read_test(image_path: str = "assets/sample_menu.jpg", prompt: str = ""):
    """Test Read Text (verbatim OCR) mode for transcription distortion."""
    read_prompt = prompt or (
        "Read every word and number in this image exactly as written. "
        "Include all text, labels, prices, dates, directions, and signs. "
        "Do not interpret or explain - just read the text verbatim."
    )
    image_bytes = _read_file_bytes(image_path)
    answer = describe_scene.remote(image_bytes, read_prompt, "en", True)
    print(f"READ [{image_path}]:\n{answer}")


@app.local_entrypoint()
def ask_test(
    image_path: str = "assets/sample_menu.jpg",
    question_text: str = "What is the cheapest item on the menu and how much does it cost?",
):
    """End-to-end 'Ask' pipeline test: speak a question -> STT -> vision answer.

    Synthesizes the question to audio so we can compare what was SPOKEN vs
    HEARD, then checks whether the vision model actually ANSWERS that question.
    """
    print(f"SPOKEN: {question_text!r}")
    q_audio = speak.remote(question_text, "en")
    heard = transcribe_audio.remote(q_audio, "en")
    print(f"HEARD: {heard!r}")

    image_bytes = _read_file_bytes(image_path)
    answer = describe_scene.remote(image_bytes, heard, "en")
    print(f"ANSWER: {answer!r}")


@app.local_entrypoint()
def stt_benchmark(audio_path: str = "test_speech.wav"):
    """Time two back-to-back ``transcribe_audio`` calls on the same audio file.

    The first call is reported as COLD and the second as WARM; each figure is
    the full client-side round-trip time.
    """
    import time

    audio_bytes = _read_file_bytes(audio_path)
    print(f"Benchmarking STT on {audio_path} ({len(audio_bytes)} bytes)")

    t0 = time.time()
    text1 = transcribe_audio.remote(audio_bytes, "en")
    cold = time.time() - t0
    print(f"\n[COLD] total round-trip: {cold:.1f}s")
    print(f"[COLD] transcript: {text1!r}")

    t1 = time.time()
    text2 = transcribe_audio.remote(audio_bytes, "en")
    warm = time.time() - t1
    print(f"\n[WARM] total round-trip: {warm:.1f}s")
    print(f"[WARM] transcript: {text2!r}")