| | """ |
| | scene_captioner.py |
| | ────────────────── |
| | Lightweight captioner that works reliably on HF Spaces free-tier CPU. |
| | |
| | Model ladder (tries fastest/smallest first): |
| | 1. nlpconnect/vit-gpt2-image-captioning ~330 MB — default, CPU-fast |
| | 2. Salesforce/blip-image-captioning-base ~990 MB — better quality |
| | 3. Mock captions — last resort (no crash) |
| | """ |
| |
|
| | import io |
| | import hashlib |
| | import logging |
| | import os |
| |
|
# Module-wide logger for load-time and inference diagnostics.
logger = logging.getLogger(__name__)


# Probe PyTorch exactly once at import time. TORCH_OK gates whether any
# transformers pipeline is attempted; DEVICE records the detected target.
# Both the import and the CUDA probe live inside the try so that any
# failure degrades to CPU/mock mode instead of crashing the module import.
try:
    import torch

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    TORCH_OK = True
    logger.info(f"PyTorch {torch.__version__} on {DEVICE}")
except Exception as e:
    TORCH_OK, DEVICE = False, "cpu"
    logger.error(f"PyTorch unavailable: {e}")
| |
|
| | from PIL import Image, ImageStat |
| |
|
# Intended instruction for prompt-conditioned captioners.
# NOTE(review): neither pipeline built below ("image-to-text" with
# vit-gpt2 / blip-base) is passed this prompt anywhere in this file —
# verify whether an external caller consumes it, otherwise it is dead.
USER_PROMPT = (
    "Describe this scene clearly for a visually-impaired person in 2-3 sentences. "
    "Mention the main subjects, setting, and any safety hazards if present."
)
| |
|
| | |
# Canned fallback captions describing hazard-free scenes.
# _mock_caption() selects one deterministically (image-hash modulo length)
# when no model is loaded or inference fails.
SAFE_CAPTIONS = [
    "A well-lit indoor room with wooden furniture and soft natural light coming through a window. The space looks clean and organized with no visible hazards present.",
    "A sunny outdoor park scene with green grass and mature trees providing shade. Several people are relaxing peacefully with no dangers visible.",
    "A modern kitchen with a clean counter, sink, and cooking utensils neatly arranged. The environment looks safe and well-maintained.",
    "A quiet residential street lined with parked cars and houses. Pedestrians are visible on the pavement and the road is clear.",
    "An office with rows of desks, monitors, and overhead lighting. The walkways are unobstructed and the environment is calm.",
]
| |
|
# Canned fallback captions describing hazardous scenes.
# _mock_caption() routes dark or red-dominant images here as a crude
# visual-hazard heuristic; selection is deterministic per image.
DANGEROUS_CAPTIONS = [
    "A room showing visible fire and thick smoke billowing from a burning structure in the background. The area poses serious danger and should be evacuated immediately.",
    "A flooded street where rising water has reached the doors of parked vehicles. Pedestrians attempting to wade through the dangerous floodwater face serious risk.",
    "An electrical panel with exposed and sparking wires hanging from the ceiling. This presents an immediate electrocution hazard.",
    "A road accident scene with an overturned vehicle blocking lanes and debris scattered across the road. Emergency services are needed.",
    "Dark storm clouds and lightning strikes approaching over an open area. Anyone outdoors should seek shelter immediately.",
]
| |
|
| |
|
class SceneCaptioner:
    """Caption a PIL image using a lightweight transformer pipeline.

    Tries a ladder of image-to-text models (smallest/fastest first) and
    falls back to deterministic mock captions when no model can be loaded
    or inference fails, so callers never see a crash.
    """

    def __init__(self):
        # self.pipe: transformers pipeline, or None when running in mock mode.
        # self._backend: "vitgpt2" | "blip" | "mock" — which path describe() uses.
        self.pipe = None
        self._backend = "mock"

        if not TORCH_OK:
            logger.warning("PyTorch not available — using mock captions.")
            return

        # Model ladder: first successful load wins; failures fall through
        # to the next candidate and ultimately to mock captions.
        for model_id, loader in [
            ("nlpconnect/vit-gpt2-image-captioning", self._load_vitgpt2),
            ("Salesforce/blip-image-captioning-base", self._load_blip),
        ]:
            try:
                loader(model_id)
                logger.info(f"✅ Captioner ready: {model_id} [{self._backend}]")
                break
            except Exception as exc:
                logger.warning(f"Failed to load {model_id}: {exc}")

        if self._backend == "mock":
            logger.warning("All models failed — using mock captions.")

    def _load_pipeline(self, model_id: str, backend: str, max_new_tokens: int):
        """Shared loader: build an image-to-text pipeline and record its backend tag.

        BUGFIX: the device was previously hard-coded to -1 (CPU) in both
        loaders even when CUDA had been detected at import time; the
        pipeline now follows the module-level DEVICE.
        """
        from transformers import pipeline
        self.pipe = pipeline(
            "image-to-text",
            model=model_id,
            device=0 if DEVICE == "cuda" else -1,
            max_new_tokens=max_new_tokens,
        )
        self._backend = backend

    def _load_vitgpt2(self, model_id: str):
        """Load the ViT-GPT2 captioner (~330 MB, CPU-fast default)."""
        self._load_pipeline(model_id, "vitgpt2", max_new_tokens=64)

    def _load_blip(self, model_id: str):
        """Load the BLIP-base captioner (~990 MB, better quality)."""
        self._load_pipeline(model_id, "blip", max_new_tokens=100)

    def describe(self, image: Image.Image) -> str:
        """Return a natural-language caption for *image*.

        Falls back to a deterministic mock caption when the model pipeline
        is unavailable, returns nothing usable, or raises; this method
        itself never raises.
        """
        image = image.convert("RGB")

        if self.pipe is not None:
            try:
                result = self.pipe(image)
                # Guard explicitly against an empty result list and a
                # missing or blank "generated_text" field instead of
                # relying on the broad except below.
                if result:
                    caption = result[0].get("generated_text", "").strip()
                    if caption:
                        return caption
            except Exception as exc:
                logger.error(f"Inference error ({self._backend}): {exc}")

        return self._mock_caption(image)

    def _mock_caption(self, image: Image.Image) -> str:
        """Deterministic heuristic caption used when no model is available.

        Selects a canned caption keyed on an MD5 digest of a 32x32 PNG
        thumbnail, so the same image always yields the same caption.
        Dark (mean brightness < 80) or red-dominant images are routed to
        the dangerous-caption pool as a crude hazard heuristic.
        """
        stat = ImageStat.Stat(image)
        brightness = sum(stat.mean[:3]) / 3
        r, g, _ = stat.mean[:3]
        buf = io.BytesIO()
        image.resize((32, 32)).save(buf, format="PNG")
        # MD5 here is a stable non-cryptographic bucket hash, not security.
        h = int(hashlib.md5(buf.getvalue()).hexdigest(), 16)
        if brightness < 80 or r > g + 30:
            return DANGEROUS_CAPTIONS[h % len(DANGEROUS_CAPTIONS)]
        return SAFE_CAPTIONS[h % len(SAFE_CAPTIONS)]