Spaces:
Sleeping
Sleeping
| """In-process inference backend for Hugging Face ZeroGPU Spaces. | |
| All three stages run in one Python environment on a ZeroGPU slice, exposed as | |
| plain functions (``describe_scene``, ``transcribe_audio``, ``speak``) so the | |
| Gradio app can call them exactly like the Modal backend. | |
| Model stack (single, Transformers >= 5.4 compatible environment): | |
| * Vision / OCR -> Qwen/Qwen2.5-VL-3B-Instruct (bilingual EN/ZH, < 4B) | |
| * Speech-to-text -> CohereLabs/cohere-transcribe-03-2026 (via cohere_stt) | |
| * Text-to-speech -> openbmb/VoxCPM2 | |
| Models are lazy-loaded once and cached; loading happens inside the GPU context | |
| so it works under ZeroGPU's on-demand allocation. | |
| """ | |
| from __future__ import annotations | |
| import io | |
| # ``spaces`` only exists on a Hugging Face Space. Fall back to a no-op decorator | |
| # so this module still imports in a plain environment (e.g. for unit tests). | |
try:
    import spaces

    GPU = spaces.GPU
except Exception:  # pragma: no cover - exercised only off-Space

    def GPU(*args, **kwargs):
        """No-op replacement for ``spaces.GPU`` when ``spaces`` is unavailable.

        Works both as a bare decorator (``@GPU``) and as a decorator factory
        (``@GPU(...)``); in either form the decorated function is returned
        unchanged.
        """
        used_bare = not kwargs and len(args) == 1 and callable(args[0])
        if used_bare:
            # ``@GPU`` directly on a function: hand the function straight back.
            return args[0]
        # ``@GPU(...)`` with options: return a pass-through decorator.
        return lambda fn: fn
# Hugging Face model id handed to ``from_pretrained`` for the vision / OCR stage.
VISION_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
# Hugging Face model id handed to ``VoxCPM.from_pretrained`` for text-to-speech.
TTS_MODEL_ID = "openbmb/VoxCPM2"
# Lazy caches: both start empty and are filled on first use by the loaders.
_vision = None  # becomes a (model, processor) tuple once ``_load_vision`` runs
_tts = None  # becomes the VoxCPM instance once ``_load_tts`` runs
| # --------------------------------------------------------------------------- # | |
| # Pure helper (unit-testable, no GPU): stitch overlapping OCR bands. | |
| # --------------------------------------------------------------------------- # | |
def stitch_overlapping_text(parts: list[str], max_overlap_words: int = 12) -> str:
    """Merge OCR text from overlapping image bands into a single string.

    Blank or falsy entries are skipped. For every part after the first, the
    longest run of words (at most ``max_overlap_words``) that both ends the
    text gathered so far and begins the new part is treated as the duplicated
    overlap and is not appended a second time. Words are compared with
    ``str.lower``; the casing of the earlier occurrence is the one kept.

    Returns ``""`` when no usable part is supplied, otherwise the merged
    words joined by single spaces.
    """
    cleaned = [chunk.strip() for chunk in parts if chunk and chunk.strip()]
    if not cleaned:
        return ""

    first, *rest = cleaned
    merged = first.split()
    for chunk in rest:
        incoming = chunk.split()
        window = min(len(merged), len(incoming), max_overlap_words)
        # Lower-case the widest candidate window once; every shorter candidate
        # is just a slice of these two lists.
        tail = [word.lower() for word in merged[len(merged) - window:]]
        head = [word.lower() for word in incoming[:window]]
        # Try the longest overlap first and fall back to "no overlap" (0).
        shared = next(
            (k for k in range(window, 0, -1) if tail[window - k:] == head[:k]),
            0,
        )
        merged.extend(incoming[shared:])
    return " ".join(merged)
| # --------------------------------------------------------------------------- # | |
| # Vision / OCR — Qwen2.5-VL | |
| # --------------------------------------------------------------------------- # | |
def _load_vision():
    """Return the cached ``(model, processor)`` pair, loading it on first use.

    The first call imports torch/transformers, loads ``VISION_MODEL_ID`` onto
    CUDA in eval mode, builds the matching processor, logs the device and
    dtype, and stores the pair in the module-level ``_vision`` cache. Later
    calls return the cached pair without touching the model again.
    """
    global _vision
    if _vision is not None:
        return _vision

    import torch
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

    # Use bfloat16 where CUDA reports support for it, float16 otherwise.
    if torch.cuda.is_bf16_supported():
        dtype = torch.bfloat16
    else:
        dtype = torch.float16

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        VISION_MODEL_ID, torch_dtype=dtype
    )
    model = model.to("cuda").eval()

    # Cap visual tokens so menus/labels stay fast without losing legible text.
    pixel_unit = 28 * 28
    processor = AutoProcessor.from_pretrained(
        VISION_MODEL_ID,
        min_pixels=256 * pixel_unit,
        max_pixels=1280 * pixel_unit,
    )

    first_param = next(model.parameters())
    print(
        f"[third-eye VISION] loaded {VISION_MODEL_ID} "
        f"| device={first_param.device} | dtype={first_param.dtype}",
        flush=True,
    )

    _vision = (model, processor)
    return _vision
def _chat_once(model, processor, image, prompt: str) -> str:
    """Run a single image + text chat turn and return the stripped reply.

    Builds a one-message user conversation containing ``image`` and
    ``prompt``, renders it through the processor's chat template, generates
    up to 512 new tokens with sampling at temperature 0.2, and decodes only
    the tokens that follow the prompt.
    """
    import torch

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},
        ],
    }
    chat_text = processor.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )
    batch = processor(
        text=[chat_text],
        images=[image],
        padding=True,
        return_tensors="pt",
    )
    batch = batch.to(model.device)

    generation_args = {
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.2,
    }
    with torch.inference_mode():
        sequences = model.generate(**batch, **generation_args)

    # Drop the leading prompt-length tokens from each sequence so that only
    # the newly generated reply is decoded.
    reply_tokens = []
    for prompt_ids, full_ids in zip(batch.input_ids, sequences):
        reply_tokens.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        reply_tokens,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0].strip()
def describe_scene(
    image_bytes: bytes, question: str, lang: str = "en", tile: bool = False
) -> str:
    """Answer ``question`` about an image, optionally using tiled OCR.

    Args:
        image_bytes: Encoded image data; it is opened with Pillow and
            converted to RGB.
        question: The user's question. A blank or ``None`` question falls
            back to a generic "describe everything" prompt.
        lang: When ``"zh"``, an instruction to answer in Chinese is appended
            to the prompt; any other value leaves the prompt unchanged.
        tile: When true, the image is split into overlapping top and bottom
            bands, each band is queried separately, and the answers are
            merged with ``stitch_overlapping_text``.

    Returns:
        The model's answer (stitched across bands in tiled mode).

    Raises:
        Whatever Pillow raises when ``image_bytes`` cannot be decoded.
    """
    import time

    from PIL import Image

    model, processor = _load_vision()
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    # Tolerate a missing question (None) as well as a blank one.
    prompt = (question or "").strip() or "Describe everything visible for a blind user."
    if lang == "zh":
        prompt += " Answer in Chinese."

    # perf_counter is monotonic, so the logged duration cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    if not tile:
        answer = _chat_once(model, processor, image, prompt)
        print(
            f"[third-eye VISION] chat: {time.perf_counter() - start:.2f}s",
            flush=True,
        )
        return answer

    # Tiled OCR for verbatim Read Text mode: splitting into overlapping
    # top/bottom bands enlarges the relative text per call; the overlap is
    # stitched away afterwards.
    w, h = image.size
    candidate_boxes = [(0, 0, w, int(h * 0.55)), (0, int(h * 0.45), w, h)]
    # For a 1-pixel-high image the top band rounds down to zero height; skip
    # it rather than send an empty crop to the model.
    bands = [box for box in candidate_boxes if box[3] > box[1]]
    parts = [_chat_once(model, processor, image.crop(box), prompt) for box in bands]
    answer = stitch_overlapping_text(parts)
    print(
        f"[third-eye VISION] tiled chat ({len(bands)} bands): "
        f"{time.perf_counter() - start:.2f}s",
        flush=True,
    )
    return answer
| # --------------------------------------------------------------------------- # | |
| # Speech-to-text — Cohere Transcribe (shared with the Modal backend) | |
| # --------------------------------------------------------------------------- # | |
def transcribe_audio(audio_bytes: bytes, language: str = "en") -> str:
    """Transcribe audio by delegating to ``cohere_stt.transcribe_wav_bytes``.

    The helper is imported at call time, so this module can be imported
    without ``cohere_stt`` being importable. ``audio_bytes`` and ``language``
    are forwarded unchanged.
    NOTE(review): the helper's name suggests WAV-encoded input — confirm
    against ``cohere_stt``.
    """
    from cohere_stt import transcribe_wav_bytes as _transcribe

    transcript = _transcribe(audio_bytes, language)
    return transcript
| # --------------------------------------------------------------------------- # | |
| # Text-to-speech — VoxCPM2 | |
| # --------------------------------------------------------------------------- # | |
def _load_tts():
    """Return the cached VoxCPM model, creating it on the first call.

    The first call imports ``voxcpm``, loads ``TTS_MODEL_ID`` with the
    denoiser disabled, and stores the instance in the module-level ``_tts``
    cache; later calls return that same instance.
    """
    global _tts
    if _tts is not None:
        return _tts

    from voxcpm import VoxCPM

    _tts = VoxCPM.from_pretrained(TTS_MODEL_ID, load_denoiser=False)
    return _tts
def speak(text: str, lang: str = "en") -> bytes:
    """Synthesize ``text`` to speech and return WAV-encoded bytes.

    Args:
        text: Text to speak. Surrounding whitespace is stripped and only the
            first 500 characters of the result are synthesized.
        lang: Accepted for interface compatibility; this implementation does
            not read it.

    Returns:
        The generated waveform written as a WAV file, as raw bytes.

    Raises:
        ValueError: If ``text`` is ``None``, empty, or whitespace-only.
    """
    # Validate first, before the heavy imports and the model load, so bad
    # input fails fast and None gets the same ValueError as a blank string.
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise ValueError("Cannot synthesize empty text.")

    import numpy as np
    import soundfile as sf

    model = _load_tts()
    waveform = model.generate(
        text=cleaned[:500],
        cfg_value=2.0,
        inference_timesteps=10,
    )

    output = io.BytesIO()
    sf.write(
        output,
        np.asarray(waveform, dtype=np.float32),
        model.tts_model.sample_rate,
        format="WAV",
    )
    return output.getvalue()