File size: 5,949 Bytes
4fd9791
 
 
4e350ba
4fd9791
4e350ba
 
 
 
4fd9791
 
f2f0e94
 
4e350ba
 
4fd9791
 
 
4e350ba
f2f0e94
 
 
 
4e350ba
 
f2f0e94
 
4e350ba
4fd9791
f2f0e94
 
4e350ba
 
 
4fd9791
4e350ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4fd9791
 
 
4e350ba
4fd9791
f2f0e94
4e350ba
 
f2f0e94
 
4e350ba
f2f0e94
 
4e350ba
 
 
 
 
f2f0e94
4e350ba
 
f2f0e94
 
4e350ba
4fd9791
f2f0e94
4e350ba
4fd9791
4e350ba
4fd9791
4e350ba
 
 
 
 
 
 
f2f0e94
 
4e350ba
 
 
 
 
 
 
 
 
 
4fd9791
 
 
 
 
4e350ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2f0e94
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
scene_captioner.py
──────────────────
Lightweight captioner that works reliably on HF Spaces free-tier CPU.

Model ladder (tries fastest/smallest first):
  1. nlpconnect/vit-gpt2-image-captioning  ~330 MB  — default, CPU-fast
  2. Salesforce/blip-image-captioning-base ~990 MB  — better quality
  3. Mock captions                                   — last resort (no crash)
"""

import io
import hashlib
import logging
import os

logger = logging.getLogger(__name__)

# ── Safe torch import ─────────────────────────────────────────────────────────
# Import torch defensively: on minimal deployments (e.g. HF Spaces free tier,
# per the module docstring) torch may be missing or broken, and the module
# must still import so the mock-caption fallback can run.
try:
    import torch
    TORCH_OK = True  # flag checked by SceneCaptioner.__init__ before loading models
    DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"PyTorch {torch.__version__} on {DEVICE}")
except Exception as e:
    # Broad except is deliberate: torch can fail with more than ImportError
    # (e.g. a bad native-library load). Fall back to CPU/mock mode.
    TORCH_OK = False
    DEVICE   = "cpu"
    logger.error(f"PyTorch unavailable: {e}")

from PIL import Image, ImageStat

# Instruction text for prompt-driven captioning backends.
# NOTE(review): not referenced anywhere in this file's visible code — presumably
# consumed by an external caller or a future model backend; confirm before removal.
USER_PROMPT = (
    "Describe this scene clearly for a visually-impaired person in 2-3 sentences. "
    "Mention the main subjects, setting, and any safety hazards if present."
)

# ── Mock caption banks ────────────────────────────────────────────────────────
# Used by SceneCaptioner._mock_caption when no model is available. One entry is
# chosen deterministically via an MD5 hash of a downscaled copy of the image.
SAFE_CAPTIONS = [
    "A well-lit indoor room with wooden furniture and soft natural light coming through a window. The space looks clean and organized with no visible hazards present.",
    "A sunny outdoor park scene with green grass and mature trees providing shade. Several people are relaxing peacefully with no dangers visible.",
    "A modern kitchen with a clean counter, sink, and cooking utensils neatly arranged. The environment looks safe and well-maintained.",
    "A quiet residential street lined with parked cars and houses. Pedestrians are visible on the pavement and the road is clear.",
    "An office with rows of desks, monitors, and overhead lighting. The walkways are unobstructed and the environment is calm.",
]

# Counterpart bank selected when the image is dark or strongly red-dominant
# (see SceneCaptioner._mock_caption for the exact heuristic).
DANGEROUS_CAPTIONS = [
    "A room showing visible fire and thick smoke billowing from a burning structure in the background. The area poses serious danger and should be evacuated immediately.",
    "A flooded street where rising water has reached the doors of parked vehicles. Pedestrians attempting to wade through the dangerous floodwater face serious risk.",
    "An electrical panel with exposed and sparking wires hanging from the ceiling. This presents an immediate electrocution hazard.",
    "A road accident scene with an overturned vehicle blocking lanes and debris scattered across the road. Emergency services are needed.",
    "Dark storm clouds and lightning strikes approaching over an open area. Anyone outdoors should seek shelter immediately.",
]


class SceneCaptioner:
    """Caption a PIL image using a lightweight transformer pipeline.

    Tries the smallest model first, falls back to a larger one, and finally
    to deterministic mock captions — so construction and ``describe`` never
    raise, even with no working PyTorch install.
    """

    def __init__(self):
        # `pipe` stays None and `_backend` stays "mock" until a model loads.
        self.pipe     = None
        self._backend = "mock"

        if not TORCH_OK:
            logger.warning("PyTorch not available — using mock captions.")
            return

        # Try models smallest → larger; first successful load wins.
        for model_id, loader in [
            ("nlpconnect/vit-gpt2-image-captioning", self._load_vitgpt2),
            ("Salesforce/blip-image-captioning-base", self._load_blip),
        ]:
            try:
                loader(model_id)
                logger.info(f"✅  Captioner ready: {model_id} [{self._backend}]")
                break
            except Exception as exc:
                logger.warning(f"Failed to load {model_id}: {exc}")

        if self._backend == "mock":
            logger.warning("All models failed — using mock captions.")

    # ── Loaders ───────────────────────────────────────────────────────────────

    def _load_pipeline(self, model_id: str, backend: str, max_new_tokens: int):
        """Shared loader: build a CPU image-to-text pipeline for *model_id*.

        Deduplicates the previously copy-pasted per-model loaders. The
        backend tag is assigned only after the pipeline constructed
        successfully, so a failed load leaves the instance in mock mode.
        """
        from transformers import pipeline
        self.pipe = pipeline(
            "image-to-text",
            model=model_id,
            device=-1,            # CPU — free-tier Spaces have no GPU
            max_new_tokens=max_new_tokens,
        )
        self._backend = backend

    def _load_vitgpt2(self, model_id: str):
        # Smallest model: short captions are enough, keep generation cheap.
        self._load_pipeline(model_id, "vitgpt2", 64)

    def _load_blip(self, model_id: str):
        # Larger model: allow a longer caption budget.
        self._load_pipeline(model_id, "blip", 100)

    # ── Inference ─────────────────────────────────────────────────────────────

    def describe(self, image: Image.Image) -> str:
        """Return a text caption for *image*.

        Uses the loaded model when available; on any inference failure or an
        empty model output, falls back to a deterministic mock caption.
        Never raises.
        """
        image = image.convert("RGB")

        if self.pipe is not None:
            try:
                result = self.pipe(image)
                # Guard explicitly against an empty result list or a missing
                # key instead of relying on the broad except below, which
                # would log a misleading "Inference error".
                if result:
                    caption = result[0].get("generated_text", "").strip()
                    if caption:
                        return caption
            except Exception as exc:
                logger.error(f"Inference error ({self._backend}): {exc}")

        # Model missing, failed, or returned nothing usable → mock caption.
        return self._mock_caption(image)

    # ── Deterministic mock ────────────────────────────────────────────────────

    def _mock_caption(self, image: Image.Image) -> str:
        """Pick a canned caption deterministically from image statistics.

        Dark (mean brightness < 80) or strongly red-dominant (R > G + 30)
        images are treated as "dangerous"; the exact caption within a bank
        is chosen by hashing a 32×32 PNG rendering of the image.
        """
        # Convert defensively: callers via describe() pass RGB, but a direct
        # call with a single-channel image would break the 3-channel unpack.
        image = image.convert("RGB")
        stat = ImageStat.Stat(image)
        r, g, b = stat.mean[:3]
        brightness = (r + g + b) / 3
        buf = io.BytesIO()
        image.resize((32, 32)).save(buf, format="PNG")
        h = int(hashlib.md5(buf.getvalue()).hexdigest(), 16)
        if brightness < 80 or r > g + 30:
            return DANGEROUS_CAPTIONS[h % len(DANGEROUS_CAPTIONS)]
        return SAFE_CAPTIONS[h % len(SAFE_CAPTIONS)]