"""
app.py — ClearPath: Real-Time Scene Description for Visually-Impaired People

Pipeline: Upload Image → ViT-GPT2 Caption → Regex Safety Classifier → SAFE / DANGEROUS
"""

import gradio as gr
import numpy as np
import logging
import time
import cv2
from PIL import Image

from scene_captioner import SceneCaptioner
from safety_classifier import SafetyClassifier, ClassificationResult

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# ── Load pipeline once at startup ─────────────────────────────────────────────
logger.info("🚀 Starting ClearPath — loading captioner …")
captioner = SceneCaptioner()
classifier = SafetyClassifier()
logger.info(f"✅ Pipeline ready — captioner backend: {captioner._backend}")

# Newest-first log of analyses; each entry is a dict with the keys
# "time", "label", "hazards" and "caption".
history_log: list[dict] = []

# Only this many rows are ever rendered, so the log is trimmed to the same
# size instead of growing for the lifetime of the server process.
_HISTORY_LIMIT = 10

# Captions longer than this are truncated (with an ellipsis) in the history table.
_CAPTION_PREVIEW_CHARS = 70


# ── Core pipeline function ────────────────────────────────────────────────────
def analyse(image: np.ndarray):
    """
    Main pipeline:
      1. Convert numpy array → PIL Image
      2. SceneCaptioner.describe()  → caption string
      3. SafetyClassifier.classify() → SAFE / DANGEROUS
      4. Return results to Gradio UI

    Args:
        image: Image as a numpy array, or ``None`` when nothing was uploaded.

    Returns:
        A 3-tuple ``(banner_html, caption, history_markdown)``.

    If captioning raises, the failure is logged, recorded in the history with
    the label ``"ERROR"`` and reported to the user as "status unknown". The
    placeholder sentence is deliberately *not* classified: doing so could
    present a failed analysis as a SAFE environment.
    """
    if image is None:
        return (
            _info_html("⬆️ Please upload an image first.", "#6366f1"),
            "",
            _build_history_md(),
        )

    t0 = time.time()
    pil = Image.fromarray(image).convert("RGB")

    # ── Step 2: Caption ───────────────────────────────────────────────────────
    try:
        caption = captioner.describe(pil)
    except Exception as exc:
        # Boundary handler: keep the UI alive, but never guess a safety label.
        logger.exception("Caption error: %s", exc)
        caption = "Unable to generate caption for this image."
        _record_history("ERROR", [], caption)
        return (
            _info_html("⚠️ Could not analyse this image — safety status unknown.", "#f59e0b"),
            caption,
            _build_history_md(),
        )

    # ── Step 3: Classify ──────────────────────────────────────────────────────
    result = classifier.classify(caption)
    elapsed = round(time.time() - t0, 2)

    # ── Build banner HTML ─────────────────────────────────────────────────────
    banner_html = _build_banner(result, elapsed)

    # ── Log to history ────────────────────────────────────────────────────────
    _record_history(result.label, result.hazards, caption)

    return banner_html, caption, _build_history_md()


def _build_banner(result, elapsed: float) -> str:
    """Return the result banner text for a classification result.

    ``result`` is whatever ``classifier.classify`` returned; this function
    reads its ``label``, ``hazards`` and (for DANGEROUS results) ``matches``
    attributes. At most the first 8 matched tokens are shown.
    """
    footer = f"⏱ Analysed in {elapsed}s | Backend: {captioner._backend}\n\n"

    if result.label == "DANGEROUS":
        hazard_str = " | ".join(result.hazards)
        token_str = ", ".join(result.matches[:8])
        return (
            "\n\n⚠️\n\nDANGER DETECTED\n\n"
            f"Categories: {hazard_str}\n\n"
            f"              Matched tokens: {token_str}\n"
            "            \n\n"
            + footer
        )

    return (
        "\n\n✅\n\nSAFE ENVIRONMENT\n\n"
        "No hazards detected by the 16-category regex engine.\n\n"
        + footer
    )


def _record_history(label: str, hazards: list, caption: str) -> None:
    """Insert one analysis at the front of ``history_log`` and trim the log."""
    history_log.insert(0, {
        "time": time.strftime("%H:%M:%S"),
        "label": label,
        "hazards": ", ".join(hazards) if hazards else "—",
        "caption": caption,
    })
    # Drop everything beyond what the history table can display.
    del history_log[_HISTORY_LIMIT:]


def _info_html(msg: str, color: str) -> str:
    """Return the informational banner text for ``msg``.

    NOTE(review): ``color`` is accepted but not interpolated by the string
    visible in this file — presumably the surrounding markup that used it was
    lost; confirm and restore the styling if so.
    """
    return f"\n\n{msg}\n\n"


def _md_cell(text: str) -> str:
    """Make ``text`` safe to embed in a single Markdown table cell.

    Line breaks would end the table row and an unescaped ``|`` would start a
    new column, so both are neutralised.
    """
    return str(text).replace("\r", " ").replace("\n", " ").replace("|", "\\|")


def _build_history_md() -> str:
    """Render the most recent analyses as a Markdown table.

    Returns a placeholder sentence when no analysis has been recorded yet.
    """
    if not history_log:
        return "_No analyses yet — upload an image above._"

    rows = [
        "| Time | Result | Hazards | Caption |",
        "|------|--------|---------|---------|",
    ]
    for h in history_log[:_HISTORY_LIMIT]:
        caption = h["caption"]
        # Truncate first, escape second, so an escape sequence is never cut in half.
        if len(caption) > _CAPTION_PREVIEW_CHARS:
            caption = caption[:_CAPTION_PREVIEW_CHARS] + "…"
        if h["label"] == "DANGEROUS":
            icon = "⚠️"
        elif h["label"] == "SAFE":
            icon = "✅"
        else:
            # e.g. "ERROR": must not be decorated with the "safe" check mark.
            icon = "❔"
        rows.append(
            f"| `{h['time']}` | {icon} **{h['label']}** "
            f"| {_md_cell(h['hazards'])} | {_md_cell(caption)} |"
        )
    return "\n".join(rows)


# ── Custom CSS ────────────────────────────────────────────────────────────────
CSS = """
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;600;800&family=JetBrains+Mono:wght@500;700&display=swap');

body, .gradio-container {
    background: #0a0a10 !important;
    color: #e2e8f0 !important;
    font-family: 'DM Sans', sans-serif !important;
}
.gradio-container { max-width: 1100px !important; margin: 0 auto !important; }
gradio-app { background: #0a0a10 !important; }

/* Header */
.app-header {
    text-align: center;
    padding: 2rem 1rem 1.25rem;
    border-bottom: 1px solid rgba(99,102,241,.2);
    margin-bottom: 1.25rem;
    background: linear-gradient(180deg,rgba(99,102,241,.07) 0%,transparent 100%);
}
.app-title {
    font-size: 2.5rem;
    font-weight: 800;
    letter-spacing: -.03em;
    margin: 0;
    background: linear-gradient(135deg,#a5b4fc,#e879f9);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}
.app-sub { color: #64748b; font-size: .9rem; margin-top: .4rem; }

/* Pipeline bar */
.pipe-bar {
    display: flex;
    align-items: center;
    justify-content: center;
    flex-wrap: wrap;
    gap: .4rem;
    padding: .75rem;
    margin-bottom: 1.25rem;
    background: rgba(99,102,241,.04);
    border: 1px solid rgba(99,102,241,.15);
    border-radius: 12px;
    font-family: 'JetBrains Mono', monospace;
    font-size: .75rem;
}
.pipe-node {
    background: rgba(99,102,241,.14);
    border: 1px solid rgba(99,102,241,.3);
    color: #a5b4fc;
    padding: .25rem .75rem;
    border-radius: 7px;
    font-weight: 700;
}
.pipe-arrow { color: #334155; font-size: .9rem; }

/* Panels */
.gr-block, .gr-box, .panel {
    background: #13131e !important;
    border: 1px solid rgba(99,102,241,.2) !important;
    border-radius: 14px !important;
}

/* Upload widget */
.gr-image { border-radius: 12px !important; }

/* Caption textbox */
.gr-textbox textarea {
    background: rgba(255,255,255,.03) !important;
    border: 1px solid rgba(99,102,241,.2) !important;
    border-radius: 10px !important;
    color: #e2e8f0 !important;
    font-family: 'DM Sans', sans-serif !important;
    font-size: .95rem !important;
    line-height: 1.75 !important;
}

/* Buttons */
.gr-button-primary, button[variant=primary] {
    background: linear-gradient(135deg,#6366f1,#8b5cf6) !important;
    border: none !important;
    border-radius: 10px !important;
    color: white !important;
    font-weight: 700 !important;
    font-family: 'DM Sans', sans-serif !important;
    font-size: .95rem !important;
    transition: opacity .2s !important;
}
.gr-button-primary:hover { opacity: .85 !important; }

/* History table */
.history-box table { width: 100%; border-collapse: collapse; font-size: .8rem; }
.history-box th {
    background: rgba(99,102,241,.1);
    color: #a5b4fc;
    padding: .4rem .65rem;
    text-align: left;
    border-bottom: 1px solid rgba(99,102,241,.2);
}
.history-box td {
    padding: .4rem .65rem;
    color: #64748b;
    border-bottom: 1px solid rgba(255,255,255,.04);
    vertical-align: top;
}

/* Tabs */
.tab-nav button {
    font-family: 'DM Sans', sans-serif !important;
    font-weight: 600 !important;
    color: #64748b !important;
}
.tab-nav button.selected { color: #a5b4fc !important; }

@keyframes fadeIn { from {opacity:0;transform:translateY(-6px)} to {opacity:1;transform:translateY(0)} }
"""


# ── Build Gradio UI ───────────────────────────────────────────────────────────
def build_ui():
    """Build and return the Gradio ``Blocks`` app (upload, webcam, video tabs + history).

    Each analyse button is wired exactly once, after the history component
    exists, so a click runs the pipeline a single time and refreshes the
    history table from that same run.
    """
    with gr.Blocks(css=CSS, title="ClearPath — Scene Description") as demo:

        # ── Header ────────────────────────────────────────────────────────────
        gr.HTML(
            "\n\n📥 Image Input → 🧠 ViT-GPT2 / BLIP Captioning → "
            "🔍 Regex Safety Classifier → 🏷️ SAFE / DANGEROUS\n\n"
        )

        with gr.Tabs():

            # ── Tab 1: Image Upload ───────────────────────────────────────────
            with gr.TabItem("📁 Upload Image"):
                with gr.Row():
                    with gr.Column(scale=1):
                        img_input = gr.Image(
                            label="Upload or drag an image",
                            type="numpy",
                            height=300,
                        )
                        analyse_btn = gr.Button(
                            "🔍 Analyse Scene",
                            variant="primary",
                            size="lg",
                        )
                    with gr.Column(scale=1):
                        result_banner = gr.HTML(
                            value="\n\n⬆️ Upload an image and click Analyse Scene\n\n"
                        )
                        caption_out = gr.Textbox(
                            label="🔊 Scene Description (generated caption)",
                            lines=5,
                            interactive=False,
                            placeholder="The AI-generated scene description will appear here…",
                        )

            # ── Tab 2: Webcam ─────────────────────────────────────────────────
            with gr.TabItem("📷 Webcam"):
                with gr.Row():
                    with gr.Column(scale=1):
                        cam_input = gr.Image(
                            label="Webcam — capture a snapshot",
                            sources=["webcam"],
                            type="numpy",
                            height=300,
                        )
                        cam_btn = gr.Button(
                            "📸 Capture & Analyse",
                            variant="primary",
                            size="lg",
                        )
                    with gr.Column(scale=1):
                        cam_banner = gr.HTML(
                            value="\n\n📷 Point your camera and click Capture & Analyse\n\n"
                        )
                        cam_caption = gr.Textbox(
                            label="🔊 Scene Description",
                            lines=5,
                            interactive=False,
                        )

            # ── Tab 3: Video ──────────────────────────────────────────────────
            with gr.TabItem("🎬 Video"):
                gr.Markdown("Upload a video — ClearPath samples one frame every N seconds.")
                with gr.Row():
                    vid_input = gr.Video(label="Upload Video")
                    interval = gr.Slider(1, 10, value=3, step=1, label="Interval (seconds)")
                vid_btn = gr.Button("▶ Analyse Video", variant="primary")
                vid_out = gr.Dataframe(
                    headers=["Frame", "Time (s)", "Label", "Hazards", "Caption"],
                    datatype=["number", "number", "str", "str", "str"],
                    visible=False,
                )

                def analyse_video(path, secs):
                    """Caption and classify one frame every ``secs`` seconds of the video.

                    Returns a Gradio update that hides the table when no video
                    was supplied, or fills it with one row per sampled frame.
                    """
                    if path is None:
                        return gr.update(visible=False)

                    cap = cv2.VideoCapture(path)
                    try:
                        # Fall back to 25 fps when the container reports 0.
                        fps = cap.get(cv2.CAP_PROP_FPS) or 25
                        step = max(1, int(fps * secs))
                        rows, idx, n = [], 0, 0
                        while True:
                            ret, frame = cap.read()
                            if not ret:
                                break
                            if idx % step == 0:
                                # OpenCV decodes to BGR; PIL expects RGB.
                                pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                                try:
                                    cap_text = captioner.describe(pil)
                                    res = classifier.classify(cap_text)
                                except Exception as e:
                                    logger.exception("Video frame %d failed: %s", idx, e)
                                    # presumably (label, hazards, matches) — confirm
                                    # against safety_classifier.ClassificationResult
                                    cap_text, res = str(e), ClassificationResult("ERROR", [], [])
                                rows.append([
                                    n + 1,
                                    round(idx / fps, 1),
                                    res.label,
                                    ", ".join(res.hazards) or "—",
                                    cap_text,
                                ])
                                n += 1
                            idx += 1
                    finally:
                        # Release the capture handle even if a frame blows up.
                        cap.release()
                    return gr.update(value=rows, visible=True)

                vid_btn.click(
                    fn=analyse_video,
                    inputs=[vid_input, interval],
                    outputs=[vid_out],
                )

        # ── History ───────────────────────────────────────────────────────────
        with gr.Accordion("📋 Analysis History", open=False):
            history_out = gr.Markdown(
                "_No analyses yet._",
                elem_classes=["history-box"],
            )

        # Wire each button once. `analyse` already returns the refreshed
        # history markdown as its third value, so it feeds `history_out` directly.
        analyse_btn.click(
            fn=analyse,
            inputs=[img_input],
            outputs=[result_banner, caption_out, history_out],
        )
        cam_btn.click(
            fn=analyse,
            inputs=[cam_input],
            outputs=[cam_banner, cam_caption, history_out],
        )

    return demo


if __name__ == "__main__":
    demo = build_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860)

# NOTE(review): the original file ended with the stray text "👁 ClearPath" after
# the launch call; it looks like a header title that got detached from its
# markup — confirm and restore it in the header HTML if so.