Update scene_captioner.py
Browse files- scene_captioner.py +78 -153
scene_captioner.py
CHANGED
|
@@ -1,206 +1,131 @@
|
|
| 1 |
"""
|
| 2 |
scene_captioner.py
|
| 3 |
──────────────────
|
| 4 |
-
|
| 5 |
|
| 6 |
-
Model
|
| 7 |
-
1.
|
| 8 |
-
2. Salesforce/
|
| 9 |
-
3.
|
| 10 |
-
|
| 11 |
-
HF Spaces free tier = CPU only, 16 GB RAM.
|
| 12 |
"""
|
| 13 |
|
| 14 |
-
import os
|
| 15 |
import io
|
| 16 |
-
import logging
|
| 17 |
import hashlib
|
|
|
|
|
|
|
| 18 |
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
-
# ──
|
| 22 |
try:
|
| 23 |
import torch
|
| 24 |
TORCH_OK = True
|
| 25 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 26 |
-
logger.info(f"PyTorch {torch.__version__}
|
| 27 |
-
except Exception as
|
| 28 |
TORCH_OK = False
|
| 29 |
DEVICE = "cpu"
|
| 30 |
-
logger.error(f"PyTorch
|
| 31 |
|
| 32 |
from PIL import Image, ImageStat
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
"
|
| 37 |
-
"Describe the scene clearly in 2–3 sentences covering: "
|
| 38 |
-
"(1) main subjects and actions, (2) setting/environment, "
|
| 39 |
-
"(3) any safety hazards if visible."
|
| 40 |
)
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
class SceneCaptioner:
|
| 45 |
-
"""
|
| 46 |
-
Loads a vision-language model once and exposes `describe(image: PIL.Image) -> str`.
|
| 47 |
-
Falls back gracefully through 3 model tiers if earlier ones fail.
|
| 48 |
-
"""
|
| 49 |
|
| 50 |
def __init__(self):
|
| 51 |
-
self.
|
| 52 |
-
self.
|
| 53 |
-
self._backend = "mock"
|
| 54 |
|
| 55 |
if not TORCH_OK:
|
| 56 |
-
logger.
|
| 57 |
return
|
| 58 |
|
| 59 |
-
# Try models
|
| 60 |
-
for loader in
|
|
|
|
|
|
|
|
|
|
| 61 |
try:
|
| 62 |
-
loader()
|
|
|
|
| 63 |
break
|
| 64 |
except Exception as exc:
|
| 65 |
-
logger.warning(f"
|
| 66 |
|
| 67 |
if self._backend == "mock":
|
| 68 |
-
logger.warning("All models failed —
|
| 69 |
-
|
| 70 |
-
# ── Model loaders ─────────────────────────────────────────────────────────
|
| 71 |
|
| 72 |
-
|
| 73 |
-
"""Qwen2-VL-2B-Instruct — best quality."""
|
| 74 |
-
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
| 75 |
-
model_id = "Qwen/Qwen2-VL-2B-Instruct"
|
| 76 |
-
logger.info(f"Loading {model_id} …")
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
model_id,
|
| 86 |
-
torch_dtype=torch.float32, # CPU — float32 required
|
| 87 |
-
trust_remote_code=True,
|
| 88 |
-
low_cpu_mem_usage=True,
|
| 89 |
-
)
|
| 90 |
-
self.model.eval()
|
| 91 |
-
self._backend = "qwen"
|
| 92 |
-
logger.info(f"✅ Loaded Qwen2-VL on {DEVICE}")
|
| 93 |
-
|
| 94 |
-
def _try_blip2(self):
|
| 95 |
-
"""BLIP-2 OPT-2.7B — fallback."""
|
| 96 |
-
from transformers import Blip2Processor, Blip2ForConditionalGeneration
|
| 97 |
-
model_id = "Salesforce/blip2-opt-2.7b"
|
| 98 |
-
logger.info(f"Loading {model_id} …")
|
| 99 |
-
|
| 100 |
-
# use_fast=False avoids BlipImageProcessorFast/torch mismatch
|
| 101 |
-
self.processor = Blip2Processor.from_pretrained(
|
| 102 |
-
model_id,
|
| 103 |
-
use_fast=False,
|
| 104 |
-
)
|
| 105 |
-
self.model = Blip2ForConditionalGeneration.from_pretrained(
|
| 106 |
-
model_id,
|
| 107 |
-
torch_dtype=torch.float32,
|
| 108 |
-
low_cpu_mem_usage=True,
|
| 109 |
)
|
| 110 |
-
self.model.eval()
|
| 111 |
-
self._backend = "blip2"
|
| 112 |
-
logger.info(f"✅ Loaded BLIP-2 on {DEVICE}")
|
| 113 |
-
|
| 114 |
-
def _try_vitgpt2(self):
|
| 115 |
-
"""ViT-GPT2 — tiny and fast, CPU-friendly (~1 GB)."""
|
| 116 |
-
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
|
| 117 |
-
model_id = "nlpconnect/vit-gpt2-image-captioning"
|
| 118 |
-
logger.info(f"Loading {model_id} …")
|
| 119 |
-
|
| 120 |
-
self._vit_processor = ViTImageProcessor.from_pretrained(model_id)
|
| 121 |
-
self._vit_tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 122 |
-
self.model = VisionEncoderDecoderModel.from_pretrained(model_id)
|
| 123 |
-
self.model.eval()
|
| 124 |
self._backend = "vitgpt2"
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
# ── Inference ─────────────────────────────────────────────────────────────
|
| 128 |
|
| 129 |
def describe(self, image: Image.Image) -> str:
|
| 130 |
image = image.convert("RGB")
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
)
|
| 148 |
-
image_inputs, video_inputs = process_vision_info(messages)
|
| 149 |
-
inputs = self.processor(
|
| 150 |
-
text=[text], images=image_inputs, videos=video_inputs,
|
| 151 |
-
padding=True, return_tensors="pt",
|
| 152 |
-
)
|
| 153 |
-
with torch.no_grad():
|
| 154 |
-
gen = self.model.generate(**inputs, max_new_tokens=200, do_sample=False)
|
| 155 |
-
trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], gen)]
|
| 156 |
-
return self.processor.batch_decode(trimmed, skip_special_tokens=True)[0].strip()
|
| 157 |
-
|
| 158 |
-
def _infer_blip2(self, image: Image.Image) -> str:
|
| 159 |
-
prompt = f"Question: {USER_PROMPT} Answer:"
|
| 160 |
-
inputs = self.processor(image, text=prompt, return_tensors="pt")
|
| 161 |
-
with torch.no_grad():
|
| 162 |
-
ids = self.model.generate(**inputs, max_new_tokens=200, do_sample=False)
|
| 163 |
-
return self.processor.decode(ids[0], skip_special_tokens=True).strip()
|
| 164 |
-
|
| 165 |
-
def _infer_vitgpt2(self, image: Image.Image) -> str:
|
| 166 |
-
pixel_values = self._vit_processor(
|
| 167 |
-
images=[image], return_tensors="pt"
|
| 168 |
-
).pixel_values
|
| 169 |
-
with torch.no_grad():
|
| 170 |
-
ids = self.model.generate(
|
| 171 |
-
pixel_values, max_length=64, num_beams=4,
|
| 172 |
-
)
|
| 173 |
-
caption = self._vit_tokenizer.decode(ids[0], skip_special_tokens=True)
|
| 174 |
-
return caption.strip()
|
| 175 |
-
|
| 176 |
-
# ── Mock fallback (no model needed) ──────────────────────────────────────
|
| 177 |
-
|
| 178 |
-
def _infer_mock(self, image: Image.Image) -> str:
|
| 179 |
-
"""
|
| 180 |
-
Deterministic mock based on image brightness + colour.
|
| 181 |
-
Used when all model loads fail (e.g. OOM or no network).
|
| 182 |
-
"""
|
| 183 |
-
SAFE_CAPTIONS = [
|
| 184 |
-
"A well-lit indoor space with furniture and soft natural light. The area appears clean and organised with no visible hazards.",
|
| 185 |
-
"A sunny outdoor park with green grass, trees and a paved path. People are relaxing peacefully with no dangers present.",
|
| 186 |
-
"A modern office with rows of desks and computers. The walkways are clear and the environment is calm.",
|
| 187 |
-
"A kitchen counter with fresh vegetables and cooking utensils. The area is tidy and safe.",
|
| 188 |
-
"A quiet residential street lined with parked cars and houses. The road is clear with pedestrians on the pavement.",
|
| 189 |
-
]
|
| 190 |
-
DANGEROUS_CAPTIONS = [
|
| 191 |
-
"A building interior showing visible fire and thick smoke billowing from a burning structure. The area should be evacuated immediately.",
|
| 192 |
-
"A flooded street where rising water has reached parked vehicles. Pedestrians are wading through dangerous floodwater.",
|
| 193 |
-
"An electrical panel with exposed sparking wires visible. This presents a serious electrocution hazard.",
|
| 194 |
-
"A road accident scene with an overturned vehicle blocking traffic and debris scattered across the road.",
|
| 195 |
-
"Dark storm clouds and visible lightning strikes approaching over an open field. Immediate shelter is required.",
|
| 196 |
-
]
|
| 197 |
stat = ImageStat.Stat(image)
|
| 198 |
brightness = sum(stat.mean[:3]) / 3
|
| 199 |
r, g, b = stat.mean[:3]
|
| 200 |
buf = io.BytesIO()
|
| 201 |
image.resize((32, 32)).save(buf, format="PNG")
|
| 202 |
h = int(hashlib.md5(buf.getvalue()).hexdigest(), 16)
|
| 203 |
-
|
| 204 |
if brightness < 80 or r > g + 30:
|
| 205 |
return DANGEROUS_CAPTIONS[h % len(DANGEROUS_CAPTIONS)]
|
| 206 |
return SAFE_CAPTIONS[h % len(SAFE_CAPTIONS)]
|
|
|
|
| 1 |
"""
scene_captioner.py
──────────────────
Lightweight captioner that works reliably on HF Spaces free-tier CPU.

Model ladder (tries fastest/smallest first):
  1. nlpconnect/vit-gpt2-image-captioning  ~330 MB — default, CPU-fast
  2. Salesforce/blip-image-captioning-base ~990 MB — better quality
  3. Mock captions — last resort (no crash)
"""

import hashlib
import io
import logging
import os

logger = logging.getLogger(__name__)

# ── Safe torch import ─────────────────────────────────────────────────────────
# torch may be missing or broken on a constrained host; degrade to CPU/mock
# mode at import time instead of crashing the whole app.
try:
    import torch

    TORCH_OK = True
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    # Lazy %-style args: the message is only formatted if the level is enabled.
    logger.info("PyTorch %s on %s", torch.__version__, DEVICE)
except Exception as e:
    TORCH_OK = False
    DEVICE = "cpu"
    logger.error("PyTorch unavailable: %s", e)

from PIL import Image, ImageStat
|
| 31 |
|
| 32 |
+
# Instruction prompt for caption models that accept free-form text guidance.
USER_PROMPT = (
    "Describe this scene clearly for a visually-impaired person in 2-3 sentences. "
    "Mention the main subjects, setting, and any safety hazards if present."
)

# ── Mock caption banks ────────────────────────────────────────────────────────
# Canned fallback texts used when no real model could be loaded.  One entry
# per bank is selected deterministically from a hash of the input image, so
# repeated calls on the same image always produce the same caption.

# Scenes with nothing alarming in them.
SAFE_CAPTIONS = [
    "A well-lit indoor room with wooden furniture and soft natural light coming through a window. The space looks clean and organized with no visible hazards present.",
    "A sunny outdoor park scene with green grass and mature trees providing shade. Several people are relaxing peacefully with no dangers visible.",
    "A modern kitchen with a clean counter, sink, and cooking utensils neatly arranged. The environment looks safe and well-maintained.",
    "A quiet residential street lined with parked cars and houses. Pedestrians are visible on the pavement and the road is clear.",
    "An office with rows of desks, monitors, and overhead lighting. The walkways are unobstructed and the environment is calm.",
]

# Scenes describing an active hazard the listener should react to.
DANGEROUS_CAPTIONS = [
    "A room showing visible fire and thick smoke billowing from a burning structure in the background. The area poses serious danger and should be evacuated immediately.",
    "A flooded street where rising water has reached the doors of parked vehicles. Pedestrians attempting to wade through the dangerous floodwater face serious risk.",
    "An electrical panel with exposed and sparking wires hanging from the ceiling. This presents an immediate electrocution hazard.",
    "A road accident scene with an overturned vehicle blocking lanes and debris scattered across the road. Emergency services are needed.",
    "Dark storm clouds and lightning strikes approaching over an open area. Anyone outdoors should seek shelter immediately.",
]
|
| 53 |
|
| 54 |
|
| 55 |
class SceneCaptioner:
    """Caption a PIL image using a lightweight transformers pipeline.

    Tries the model ladder from the module docstring (smallest model first).
    If every load fails — or torch itself is unavailable — it falls back to
    deterministic mock captions so ``describe`` never raises.
    """

    def __init__(self):
        # `pipe` stays None and `_backend` stays "mock" until a model loads.
        self.pipe = None
        self._backend = "mock"

        if not TORCH_OK:
            logger.warning("PyTorch not available — using mock captions.")
            return

        # Try models smallest → larger; first successful load wins.
        for model_id, loader in [
            ("nlpconnect/vit-gpt2-image-captioning", self._load_vitgpt2),
            ("Salesforce/blip-image-captioning-base", self._load_blip),
        ]:
            try:
                loader(model_id)
                logger.info("✅ Captioner ready: %s [%s]", model_id, self._backend)
                break
            except Exception as exc:
                logger.warning("Failed to load %s: %s", model_id, exc)

        if self._backend == "mock":
            logger.warning("All models failed — using mock captions.")

    # ── Loaders ───────────────────────────────────────────────────────────────

    def _load_vitgpt2(self, model_id: str) -> None:
        """Load ViT-GPT2 (~330 MB) — the fastest CPU-friendly captioner."""
        from transformers import pipeline

        self.pipe = pipeline(
            "image-to-text",
            model=model_id,
            device=-1,  # -1 = CPU
            max_new_tokens=64,
        )
        self._backend = "vitgpt2"

    def _load_blip(self, model_id: str) -> None:
        """Load BLIP base (~990 MB) — better quality, slower on CPU."""
        from transformers import pipeline

        self.pipe = pipeline(
            "image-to-text",
            model=model_id,
            device=-1,  # -1 = CPU
            max_new_tokens=100,
        )
        self._backend = "blip"

    # ── Inference ─────────────────────────────────────────────────────────────

    def describe(self, image: Image.Image) -> str:
        """Return a caption for *image*.

        Never raises and never returns an empty string: any model failure
        (load-time or inference-time) falls through to the mock captions.
        """
        image = image.convert("RGB")

        if self.pipe is not None:
            try:
                result = self.pipe(image)
                # image-to-text pipelines return [{"generated_text": "..."}];
                # guard against an empty list or missing key explicitly rather
                # than letting IndexError/KeyError surface as an "error" log.
                if result:
                    caption = result[0].get("generated_text", "").strip()
                    if caption:
                        return caption
            except Exception as exc:
                logger.error("Inference error (%s): %s", self._backend, exc)

        # Fallback to deterministic mock captions.
        return self._mock_caption(image)

    # ── Deterministic mock ────────────────────────────────────────────────────

    def _mock_caption(self, image: Image.Image) -> str:
        """Pick a canned caption deterministically from image statistics.

        Dark images (mean brightness < 80) or strongly red-dominant images
        map to the "dangerous" bank; everything else to the "safe" bank.
        The entry within the bank is chosen from an MD5 hash of a 32×32
        thumbnail, so identical images always yield identical captions.
        """
        stat = ImageStat.Stat(image)
        red, green, _blue = stat.mean[:3]  # per-channel means (image is RGB here)
        brightness = (red + green + _blue) / 3

        # Content hash of a small thumbnail → stable, image-dependent index.
        buf = io.BytesIO()
        image.resize((32, 32)).save(buf, format="PNG")
        h = int(hashlib.md5(buf.getvalue()).hexdigest(), 16)

        if brightness < 80 or red > green + 30:
            return DANGEROUS_CAPTIONS[h % len(DANGEROUS_CAPTIONS)]
        return SAFE_CAPTIONS[h % len(SAFE_CAPTIONS)]