"""
scene_captioner.py
──────────────────
Lightweight captioner that works reliably on HF Spaces free-tier CPU.
Model ladder (tries fastest/smallest first):
1. nlpconnect/vit-gpt2-image-captioning ~330 MB — default, CPU-fast
2. Salesforce/blip-image-captioning-base ~990 MB — better quality
3. Mock captions — last resort (no crash)
"""
import io
import hashlib
import logging
import os
# Module-level logger for this captioner.
logger = logging.getLogger(__name__)

# ── Safe torch import ─────────────────────────────────────────────────────────
# Import torch defensively: on constrained hosts (e.g. HF Spaces free tier)
# torch may be absent or broken, and this module must still import so the
# mock-caption fallback can run. TORCH_OK gates model loading in
# SceneCaptioner.__init__; DEVICE records the best available device.
try:
    import torch

    TORCH_OK = True
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"PyTorch {torch.__version__} on {DEVICE}")
except Exception as e:
    # Catching broad Exception deliberately: a partially-installed torch can
    # raise more than ImportError (e.g. OSError on missing shared libs).
    TORCH_OK = False
    DEVICE = "cpu"
    logger.error(f"PyTorch unavailable: {e}")
from PIL import Image, ImageStat
# Instructional prompt describing the desired caption style.
# NOTE(review): not referenced anywhere in this module — presumably consumed
# by a prompt-capable caller or a future instruction-following model; verify
# against call sites before removing.
USER_PROMPT = (
    "Describe this scene clearly for a visually-impaired person in 2-3 sentences. "
    "Mention the main subjects, setting, and any safety hazards if present."
)
# ── Mock caption banks ────────────────────────────────────────────────────────
# Canned captions returned when no model could be loaded or inference fails.
# SceneCaptioner._mock_caption() selects one deterministically from an image
# hash, choosing DANGEROUS_CAPTIONS for dark or red-dominant images and
# SAFE_CAPTIONS otherwise. Keep both lists non-empty (indexing is modulo len).
SAFE_CAPTIONS = [
    "A well-lit indoor room with wooden furniture and soft natural light coming through a window. The space looks clean and organized with no visible hazards present.",
    "A sunny outdoor park scene with green grass and mature trees providing shade. Several people are relaxing peacefully with no dangers visible.",
    "A modern kitchen with a clean counter, sink, and cooking utensils neatly arranged. The environment looks safe and well-maintained.",
    "A quiet residential street lined with parked cars and houses. Pedestrians are visible on the pavement and the road is clear.",
    "An office with rows of desks, monitors, and overhead lighting. The walkways are unobstructed and the environment is calm.",
]
DANGEROUS_CAPTIONS = [
    "A room showing visible fire and thick smoke billowing from a burning structure in the background. The area poses serious danger and should be evacuated immediately.",
    "A flooded street where rising water has reached the doors of parked vehicles. Pedestrians attempting to wade through the dangerous floodwater face serious risk.",
    "An electrical panel with exposed and sparking wires hanging from the ceiling. This presents an immediate electrocution hazard.",
    "A road accident scene with an overturned vehicle blocking lanes and debris scattered across the road. Emergency services are needed.",
    "Dark storm clouds and lightning strikes approaching over an open area. Anyone outdoors should seek shelter immediately.",
]
class SceneCaptioner:
    """Caption a PIL image using a lightweight transformer pipeline.

    At construction time a ladder of image-to-text models is tried,
    smallest first; if none loads (or PyTorch itself is unavailable),
    the captioner degrades to deterministic mock captions so the app
    never crashes on free-tier CPU hosts.
    """

    def __init__(self):
        # pipe: the transformers image-to-text pipeline, or None when mocked.
        # _backend: "vitgpt2" | "blip" | "mock" — which engine is active.
        self.pipe = None
        self._backend = "mock"
        if not TORCH_OK:
            logger.warning("PyTorch not available — using mock captions.")
            return
        # Try models smallest → larger; stop at the first that loads.
        for model_id, loader in [
            ("nlpconnect/vit-gpt2-image-captioning", self._load_vitgpt2),
            ("Salesforce/blip-image-captioning-base", self._load_blip),
        ]:
            try:
                loader(model_id)
                logger.info(f"✅ Captioner ready: {model_id} [{self._backend}]")
                break
            except Exception as exc:
                logger.warning(f"Failed to load {model_id}: {exc}")
        if self._backend == "mock":
            logger.warning("All models failed — using mock captions.")

    # ── Loaders ───────────────────────────────────────────────────────────────
    def _load_vitgpt2(self, model_id: str) -> None:
        """Load the small ViT-GPT2 captioner on CPU (raises on failure)."""
        from transformers import pipeline

        self.pipe = pipeline(
            "image-to-text",
            model=model_id,
            device=-1,  # CPU
            max_new_tokens=64,
        )
        self._backend = "vitgpt2"

    def _load_blip(self, model_id: str) -> None:
        """Load the larger BLIP-base captioner on CPU (raises on failure)."""
        from transformers import pipeline

        self.pipe = pipeline(
            "image-to-text",
            model=model_id,
            device=-1,
            max_new_tokens=100,
        )
        self._backend = "blip"

    # ── Inference ─────────────────────────────────────────────────────────────
    def describe(self, image: Image.Image) -> str:
        """Return a caption for *image*.

        Runs the loaded pipeline when available; on any inference error or
        an empty/missing caption, falls back to a deterministic mock caption
        so a string is always returned.
        """
        image = image.convert("RGB")
        if self.pipe is not None:
            try:
                result = self.pipe(image)
                # Guard the result shape explicitly instead of letting an
                # IndexError/KeyError fall through to the except clause.
                if result and "generated_text" in result[0]:
                    caption = result[0]["generated_text"].strip()
                    if caption:
                        return caption
            except Exception as exc:
                logger.error(f"Inference error ({self._backend}): {exc}")
        # Fallback to mock
        return self._mock_caption(image)

    # ── Deterministic mock ────────────────────────────────────────────────────
    def _mock_caption(self, image: Image.Image) -> str:
        """Deterministically pick a canned caption from image statistics.

        The same image always yields the same caption (stable MD5 of a 32×32
        thumbnail); dark or strongly red-dominant images select from the
        "dangerous" bank, everything else from the "safe" bank.
        """
        # Fix: be safe when called directly with a non-RGB image ("L", "LA",
        # "P", ...) — stat.mean would then have fewer than 3 channels and the
        # unpack below would raise ValueError.
        if image.mode != "RGB":
            image = image.convert("RGB")
        stat = ImageStat.Stat(image)
        r, g, b = stat.mean[:3]
        brightness = (r + g + b) / 3
        buf = io.BytesIO()
        image.resize((32, 32)).save(buf, format="PNG")
        h = int(hashlib.md5(buf.getvalue()).hexdigest(), 16)
        # Heuristic: dark scenes or red-dominant scenes (fire-like) → hazard.
        if brightness < 80 or r > g + 30:
            return DANGEROUS_CAPTIONS[h % len(DANGEROUS_CAPTIONS)]
        return SAFE_CAPTIONS[h % len(SAFE_CAPTIONS)]