"""
scene_captioner.py
──────────────────
Lightweight captioner that works reliably on HF Spaces free-tier CPU.

Model ladder (tries fastest/smallest first):
  1. nlpconnect/vit-gpt2-image-captioning   ~330 MB  — default, CPU-fast
  2. Salesforce/blip-image-captioning-base  ~990 MB  — better quality
  3. Mock captions                                   — last resort (no crash)
"""

import io
import hashlib
import logging
import os

logger = logging.getLogger(__name__)

# ── Safe torch import ─────────────────────────────────────────────────────────
# The catch is deliberately broad: any failure while importing or probing torch
# must degrade to mock captions instead of crashing the module import.
try:
    import torch
    TORCH_OK = True
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"PyTorch {torch.__version__} on {DEVICE}")
except Exception as e:
    TORCH_OK = False
    DEVICE = "cpu"
    logger.error(f"PyTorch unavailable: {e}")

from PIL import Image, ImageStat

# NOTE(review): USER_PROMPT is not referenced by any code in this module;
# presumably kept for prompt-driven models or external callers — confirm
# before removing.
USER_PROMPT = (
    "Describe this scene clearly for a visually-impaired person in 2-3 sentences. "
    "Mention the main subjects, setting, and any safety hazards if present."
)

# ── Mock caption banks ────────────────────────────────────────────────────────
SAFE_CAPTIONS = [
    "A well-lit indoor room with wooden furniture and soft natural light coming through a window. The space looks clean and organized with no visible hazards present.",
    "A sunny outdoor park scene with green grass and mature trees providing shade. Several people are relaxing peacefully with no dangers visible.",
    "A modern kitchen with a clean counter, sink, and cooking utensils neatly arranged. The environment looks safe and well-maintained.",
    "A quiet residential street lined with parked cars and houses. Pedestrians are visible on the pavement and the road is clear.",
    "An office with rows of desks, monitors, and overhead lighting. The walkways are unobstructed and the environment is calm.",
]

DANGEROUS_CAPTIONS = [
    "A room showing visible fire and thick smoke billowing from a burning structure in the background. The area poses serious danger and should be evacuated immediately.",
    "A flooded street where rising water has reached the doors of parked vehicles. Pedestrians attempting to wade through the dangerous floodwater face serious risk.",
    "An electrical panel with exposed and sparking wires hanging from the ceiling. This presents an immediate electrocution hazard.",
    "A road accident scene with an overturned vehicle blocking lanes and debris scattered across the road. Emergency services are needed.",
    "Dark storm clouds and lightning strikes approaching over an open area. Anyone outdoors should seek shelter immediately.",
]

# Thresholds used by the mock captioner to pick a caption bank.
_MOCK_DARK_BRIGHTNESS = 80   # mean RGB brightness below this selects the dangerous bank
_MOCK_RED_MARGIN = 30        # mean red exceeding mean green by more than this does too


class SceneCaptioner:
    """Caption a PIL image using a lightweight transformer pipeline.

    On construction the models are tried in order (ViT-GPT2, then BLIP); the
    first one that loads is kept in ``self.pipe``. If PyTorch is unavailable
    or every model fails to load, ``self.pipe`` stays ``None`` and
    ``describe`` returns deterministic mock captions instead.

    Attributes:
        pipe: The loaded ``transformers`` pipeline, or ``None`` in mock mode.
        _backend: ``"vitgpt2"``, ``"blip"`` or ``"mock"``.
    """

    def __init__(self) -> None:
        """Load the first available captioning model, else stay in mock mode."""
        self.pipe = None
        self._backend = "mock"

        if not TORCH_OK:
            logger.warning("PyTorch not available — using mock captions.")
            return

        # Try models smallest → larger
        candidates = [
            ("nlpconnect/vit-gpt2-image-captioning", self._load_vitgpt2),
            ("Salesforce/blip-image-captioning-base", self._load_blip),
        ]
        for model_id, loader in candidates:
            try:
                loader(model_id)
            except Exception as exc:
                # Any load failure (missing package, download error, ...) just
                # moves on to the next candidate.
                logger.warning(f"Failed to load {model_id}: {exc}")
            else:
                logger.info(f"✅ Captioner ready: {model_id} [{self._backend}]")
                break

        if self._backend == "mock":
            logger.warning("All models failed — using mock captions.")

    # ── Loaders ───────────────────────────────────────────────────────────────
    def _load_pipeline(self, model_id: str, backend: str, max_new_tokens: int) -> None:
        """Build an ``image-to-text`` pipeline on CPU and record its backend tag.

        ``self.pipe`` and ``self._backend`` are only updated once the pipeline
        has been constructed, so a failed load leaves the instance untouched.
        """
        # Imported lazily so the module still imports without transformers.
        from transformers import pipeline

        self.pipe = pipeline(
            "image-to-text",
            model=model_id,
            device=-1,  # CPU
            max_new_tokens=max_new_tokens,
        )
        self._backend = backend

    def _load_vitgpt2(self, model_id: str) -> None:
        """Load the ViT-GPT2 captioner (64 new tokens max)."""
        self._load_pipeline(model_id, "vitgpt2", 64)

    def _load_blip(self, model_id: str) -> None:
        """Load the BLIP captioner (100 new tokens max)."""
        self._load_pipeline(model_id, "blip", 100)

    # ── Inference ─────────────────────────────────────────────────────────────
    def describe(self, image: Image.Image) -> str:
        """Return a caption for *image*.

        The image is converted to RGB first. If a pipeline is loaded, its
        first ``generated_text`` is returned when non-empty. On an empty
        caption, an inference error, or when no pipeline is loaded, a mock
        caption is returned instead; inference errors are logged, not raised.
        """
        image = image.convert("RGB")

        if self.pipe is not None:
            try:
                result = self.pipe(image)
                caption = result[0]["generated_text"].strip()
                if caption:
                    return caption
            except Exception as exc:
                # Best-effort: keep the fallback, but record the traceback so
                # the failure can actually be diagnosed.
                logger.exception(f"Inference error ({self._backend}): {exc}")

        # Fallback to mock
        return self._mock_caption(image)

    # ── Deterministic mock ────────────────────────────────────────────────────
    def _mock_caption(self, image: Image.Image) -> str:
        """Pick a canned caption from the image's colour statistics.

        The bank is chosen from the mean channel values: a dark image, or one
        whose mean red clearly exceeds its mean green, gets a caption from
        ``DANGEROUS_CAPTIONS``; anything else from ``SAFE_CAPTIONS``. The
        entry within the bank is selected by an MD5 digest of the PNG bytes
        of a 32x32 thumbnail.
        """
        # Normalise here as well: with fewer than three bands (e.g. mode "L")
        # the r/g/b unpacking below would raise ValueError on a direct call.
        if image.mode != "RGB":
            image = image.convert("RGB")

        red, green, blue = ImageStat.Stat(image).mean[:3]
        brightness = (red + green + blue) / 3

        thumb = io.BytesIO()
        image.resize((32, 32)).save(thumb, format="PNG")
        digest = int(hashlib.md5(thumb.getvalue()).hexdigest(), 16)

        looks_dangerous = (
            brightness < _MOCK_DARK_BRIGHTNESS or red > green + _MOCK_RED_MARGIN
        )
        bank = DANGEROUS_CAPTIONS if looks_dangerous else SAFE_CAPTIONS
        return bank[digest % len(bank)]