| | """ |
| | scene_captioner.py |
| | ────────────────── |
| | Lightweight captioner that works reliably on HF Spaces free-tier CPU. |
| | |
| | Model ladder (tries fastest/smallest first): |
| | 1. nlpconnect/vit-gpt2-image-captioning ~330 MB — default, CPU-fast |
| | 2. Salesforce/blip-image-captioning-base ~990 MB — better quality |
| | 3. Mock captions — last resort (no crash) |
| | """ |
| |
|
| | import io |
| | import hashlib |
| | import logging |
| | import os |
| |
|
# Module-wide logger for load-time and inference diagnostics.
logger = logging.getLogger(__name__)


# Probe PyTorch exactly once at import time. TORCH_OK gates whether any
# transformers pipeline is attempted; DEVICE records the detected target.
# Both the import and the CUDA probe live inside the try so that any
# failure degrades to CPU/mock mode instead of crashing the module import.
try:
    import torch

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    TORCH_OK = True
    logger.info(f"PyTorch {torch.__version__} on {DEVICE}")
except Exception as e:
    TORCH_OK, DEVICE = False, "cpu"
    logger.error(f"PyTorch unavailable: {e}")
| |
|
| | from PIL import Image, ImageStat |
| |
|
# Intended instruction for prompt-conditioned captioners.
# NOTE(review): neither pipeline built below ("image-to-text" with
# vit-gpt2 / blip-base) is passed this prompt anywhere in this file —
# verify whether an external caller consumes it, otherwise it is dead.
USER_PROMPT = (
    "Describe this scene clearly for a visually-impaired person in 2-3 sentences. "
    "Mention the main subjects, setting, and any safety hazards if present."
)
| |
|
| | |
# Canned fallback captions describing hazard-free scenes.
# _mock_caption() selects one deterministically (image-hash modulo length)
# when no model is loaded or inference fails.
SAFE_CAPTIONS = [
    "A well-lit indoor room with wooden furniture and soft natural light coming through a window. The space looks clean and organized with no visible hazards present.",
    "A sunny outdoor park scene with green grass and mature trees providing shade. Several people are relaxing peacefully with no dangers visible.",
    "A modern kitchen with a clean counter, sink, and cooking utensils neatly arranged. The environment looks safe and well-maintained.",
    "A quiet residential street lined with parked cars and houses. Pedestrians are visible on the pavement and the road is clear.",
    "An office with rows of desks, monitors, and overhead lighting. The walkways are unobstructed and the environment is calm.",
]
| |
|
# Canned fallback captions describing hazardous scenes.
# _mock_caption() routes dark or red-dominant images here as a crude
# visual-hazard heuristic; selection is deterministic per image.
DANGEROUS_CAPTIONS = [
    "A room showing visible fire and thick smoke billowing from a burning structure in the background. The area poses serious danger and should be evacuated immediately.",
    "A flooded street where rising water has reached the doors of parked vehicles. Pedestrians attempting to wade through the dangerous floodwater face serious risk.",
    "An electrical panel with exposed and sparking wires hanging from the ceiling. This presents an immediate electrocution hazard.",
    "A road accident scene with an overturned vehicle blocking lanes and debris scattered across the road. Emergency services are needed.",
    "Dark storm clouds and lightning strikes approaching over an open area. Anyone outdoors should seek shelter immediately.",
]
| |
|
| |
|
class SceneCaptioner:
    """Caption a PIL image using a lightweight transformer pipeline.

    Tries a ladder of image-to-text models (smallest/fastest first) and
    falls back to deterministic mock captions when no model can be loaded
    or inference fails, so callers never see a crash.
    """

    def __init__(self):
        # self.pipe: transformers pipeline, or None when running in mock mode.
        # self._backend: "vitgpt2" | "blip" | "mock" — which path describe() uses.
        self.pipe = None
        self._backend = "mock"

        if not TORCH_OK:
            logger.warning("PyTorch not available — using mock captions.")
            return

        # Model ladder: first successful load wins; failures fall through
        # to the next candidate and ultimately to mock captions.
        for model_id, loader in [
            ("nlpconnect/vit-gpt2-image-captioning", self._load_vitgpt2),
            ("Salesforce/blip-image-captioning-base", self._load_blip),
        ]:
            try:
                loader(model_id)
                logger.info(f"✅ Captioner ready: {model_id} [{self._backend}]")
                break
            except Exception as exc:
                logger.warning(f"Failed to load {model_id}: {exc}")

        if self._backend == "mock":
            logger.warning("All models failed — using mock captions.")

    def _load_pipeline(self, model_id: str, backend: str, max_new_tokens: int):
        """Shared loader: build an image-to-text pipeline and record its backend tag.

        BUGFIX: the device was previously hard-coded to -1 (CPU) in both
        loaders even when CUDA had been detected at import time; the
        pipeline now follows the module-level DEVICE.
        """
        from transformers import pipeline
        self.pipe = pipeline(
            "image-to-text",
            model=model_id,
            device=0 if DEVICE == "cuda" else -1,
            max_new_tokens=max_new_tokens,
        )
        self._backend = backend

    def _load_vitgpt2(self, model_id: str):
        """Load the ViT-GPT2 captioner (~330 MB, CPU-fast default)."""
        self._load_pipeline(model_id, "vitgpt2", max_new_tokens=64)

    def _load_blip(self, model_id: str):
        """Load the BLIP-base captioner (~990 MB, better quality)."""
        self._load_pipeline(model_id, "blip", max_new_tokens=100)

    def describe(self, image: Image.Image) -> str:
        """Return a natural-language caption for *image*.

        Falls back to a deterministic mock caption when the model pipeline
        is unavailable, returns nothing usable, or raises; this method
        itself never raises.
        """
        image = image.convert("RGB")

        if self.pipe is not None:
            try:
                result = self.pipe(image)
                # Guard explicitly against an empty result list and a
                # missing or blank "generated_text" field instead of
                # relying on the broad except below.
                if result:
                    caption = result[0].get("generated_text", "").strip()
                    if caption:
                        return caption
            except Exception as exc:
                logger.error(f"Inference error ({self._backend}): {exc}")

        return self._mock_caption(image)

    def _mock_caption(self, image: Image.Image) -> str:
        """Deterministic heuristic caption used when no model is available.

        Selects a canned caption keyed on an MD5 digest of a 32x32 PNG
        thumbnail, so the same image always yields the same caption.
        Dark (mean brightness < 80) or red-dominant images are routed to
        the dangerous-caption pool as a crude hazard heuristic.
        """
        stat = ImageStat.Stat(image)
        brightness = sum(stat.mean[:3]) / 3
        r, g, _ = stat.mean[:3]
        buf = io.BytesIO()
        image.resize((32, 32)).save(buf, format="PNG")
        # MD5 here is a stable non-cryptographic bucket hash, not security.
        h = int(hashlib.md5(buf.getvalue()).hexdigest(), 16)
        if brightness < 80 or r > g + 30:
            return DANGEROUS_CAPTIONS[h % len(DANGEROUS_CAPTIONS)]
        return SAFE_CAPTIONS[h % len(SAFE_CAPTIONS)]