Spaces:

A7med-Ame3
/

Real_Time_Image_Captioning

Sleeping

App Files Files Community

A7med-Ame3 committed 7 days ago

Commit

bf990aa

verified ·

1 Parent(s): f2f0e94

Update app.py

Browse files

Files changed (1) hide show

app.py +247 -204

app.py CHANGED Viewed

@@ -1,213 +1,260 @@
 """
-╔══════════════════════════════════════════════════════════════════╗
-║         ClearPath — Real-Time Scene Description System          ║
-║         For Visually-Impaired People                            ║
-║                                                                  ║
-║  Pipeline:  Input → Qwen2-VL Captioning → Regex Classifier      ║
-║             → SAFE / DANGEROUS + TTS Output                      ║
-╚══════════════════════════════════════════════════════════════════╝
 """
 import gradio as gr
-from scene_captioner import SceneCaptioner
-from safety_classifier import SafetyClassifier, ClassificationResult
-from tts_engine import TTSEngine
-import cv2
 import numpy as np
-from PIL import Image
-import time
 import logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 logger = logging.getLogger(__name__)
-# ── Global singletons (loaded once) ──────────────────────────────────────────
 captioner  = SceneCaptioner()
 classifier = SafetyClassifier()
-tts        = TTSEngine()
 history_log: list[dict] = []
-# ── Core pipeline ─────────────────────────────────────────────────────────────
-def run_pipeline(image: Image.Image) -> tuple[str, ClassificationResult]:
-    """Full pipeline: caption → classify."""
-    if image is None:
-        raise ValueError("No image provided.")
-    logger.info("▶  Running captioning model …")
-    caption = captioner.describe(image)
-    logger.info(f"   Caption: {caption}")
-    logger.info("▶  Running safety classifier …")
-    result  = classifier.classify(caption)
-    logger.info(f"   Classification: {result.label}  |  Hazards: {result.hazards}")
-    return caption, result
-def process_image(image: np.ndarray | None):
-    """Gradio callback for image upload / webcam snapshot."""
     if image is None:
         return (
-            gr.update(value="⚠️ No image provided.", visible=True),
-            gr.update(value="", visible=False),
-            gr.update(visible=False),
-            _build_history_markdown(),
         )
-    pil_image = Image.fromarray(image).convert("RGB")
     try:
-        caption, result = run_pipeline(pil_image)
     except Exception as exc:
-        logger.error(f"Pipeline error: {exc}")
-        return (
-            gr.update(value=f"❌ Error: {exc}", visible=True),
-            gr.update(value="", visible=False),
-            gr.update(visible=False),
-            _build_history_markdown(),
-        )
-    # ── TTS ───────────────────────────────────────────────────────────────────
-    prefix = "⚠️ DANGER DETECTED! " if result.label == "DANGEROUS" else "Safe environment. "
-    tts.speak_async(prefix + caption)
-    # ── Build HTML banner ─────────────────────────────────────────────────────
     if result.label == "DANGEROUS":
-        banner_html = (
-            '<div style="background:rgba(239,68,68,0.12);border:1px solid rgba(239,68,68,0.4);'
-            'border-radius:12px;padding:1rem 1.25rem;display:flex;align-items:flex-start;gap:.85rem;">'
-            '<span style="font-size:2rem;">⚠️</span>'
-            '<div><strong style="color:#fca5a5;font-size:1.05rem;letter-spacing:.04em;">DANGER DETECTED</strong>'
-            f'<br><span style="color:#f87171;font-size:.82rem;">Hazards: {", ".join(result.hazards)}</span>'
-            f'<br><span style="color:#94a3b8;font-size:.75rem;font-family:monospace;">Tokens: {", ".join(result.matches)}</span>'
-            '</div></div>'
-        )
     else:
-        banner_html = (
-            '<div style="background:rgba(34,197,94,0.1);border:1px solid rgba(34,197,94,0.35);'
-            'border-radius:12px;padding:1rem 1.25rem;display:flex;align-items:flex-start;gap:.85rem;">'
-            '<span style="font-size:2rem;">✅</span>'
-            '<div><strong style="color:#86efac;font-size:1.05rem;letter-spacing:.04em;">SAFE ENVIRONMENT</strong>'
-            '<br><span style="color:#4ade80;font-size:.82rem;">No hazards detected by the regex engine.</span>'
-            '</div></div>'
-        )
-    # ── History ───────────────────────────────────────────────────────────────
     history_log.insert(0, {
         "time"   : time.strftime("%H:%M:%S"),
-        "caption": caption,
         "label"  : result.label,
         "hazards": ", ".join(result.hazards) if result.hazards else "—",
     })
     return (
-        gr.update(value=banner_html,  visible=True),
-        gr.update(value=caption,      visible=True),
-        gr.update(visible=True),
-        _build_history_markdown(),
     )
-def _build_history_markdown() -> str:
     if not history_log:
-        return "_No analyses yet._"
-    rows = ["| Time | Label | Hazards | Caption |",
-            "|------|-------|---------|---------|"]
-    for h in history_log[:8]:
-        short = h["caption"][:65] + "…" if len(h["caption"]) > 65 else h["caption"]
-        emoji = "⚠️" if h["label"] == "DANGEROUS" else "✅"
-        rows.append(f"| {h['time']} | {emoji} {h['label']} | {h['hazards']} | {short} |")
     return "\n".join(rows)
 # ── Custom CSS ────────────────────────────────────────────────────────────────
 CSS = """
 @import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;600;800&family=JetBrains+Mono:wght@500;700&display=swap');
-:root {
-    --bg:#0a0a10; --surface:#13131e; --border:rgba(99,102,241,.22);
-    --accent:#6366f1; --text:#e2e8f0; --muted:#64748b;
-}
 body, .gradio-container {
-    background: var(--bg) !important;
-    color: var(--text) !important;
     font-family: 'DM Sans', sans-serif !important;
 }
-.gradio-container { max-width: 1180px !important; margin: 0 auto !important; }
 .app-header {
-    text-align: center; padding: 2rem 1rem 1.25rem;
-    background: linear-gradient(180deg, rgba(99,102,241,0.08) 0%, transparent 100%);
-    border-bottom: 1px solid var(--border); margin-bottom: 1.5rem;
 }
 .app-title {
-    font-size: 2.4rem; font-weight: 800; letter-spacing: -.03em;
-    background: linear-gradient(135deg, #a5b4fc, #e879f9);
     -webkit-background-clip: text; -webkit-text-fill-color: transparent;
-    margin: 0;
 }
-.app-subtitle { color: var(--muted); font-size: .92rem; margin-top: .4rem; }
-.pipeline-bar {
     display: flex; align-items: center; justify-content: center;
-    gap: .4rem; flex-wrap: wrap; padding: .8rem 1rem;
-    background: rgba(99,102,241,.04); border-bottom: 1px solid var(--border);
     font-family: 'JetBrains Mono', monospace; font-size: .75rem;
-    margin-bottom: 1.5rem;
 }
 .pipe-node {
-    padding: .28rem .75rem; border-radius: 7px;
-    background: rgba(99,102,241,.12); border: 1px solid var(--border);
-    color: #a5b4fc; font-weight: 700;
 }
-.pipe-arrow { color: #334155; }
-.panel {
-    background: var(--surface) !important;
-    border: 1px solid var(--border) !important;
     border-radius: 14px !important;
 }
-.caption-output textarea {
     background: rgba(255,255,255,.03) !important;
-    border: 1px solid var(--border) !important;
-    border-radius: 10px !important; color: var(--text) !important;
-    font-size: .95rem !important; line-height: 1.75 !important;
     font-family: 'DM Sans', sans-serif !important;
 }
-button.primary-btn {
-    background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
-    border: none !important; color: white !important;
-    font-weight: 700 !important; border-radius: 10px !important;
 }
-button.primary-btn:hover { opacity: .85 !important; }
-.history-md table { width: 100%; border-collapse: collapse; font-size: .8rem; }
-.history-md th { background: rgba(99,102,241,.1); color: #a5b4fc; padding: .4rem .6rem; border-bottom: 1px solid var(--border); }
-.history-md td { padding: .4rem .6rem; border-bottom: 1px solid rgba(255,255,255,.04); color: var(--muted); }
-.tab-nav button { font-family: 'DM Sans', sans-serif !important; font-weight: 600 !important; }
 """
-# ── Gradio UI ─────────────────────────────────────────────────────────────────
-def build_ui() -> gr.Blocks:
     with gr.Blocks(css=CSS, title="ClearPath — Scene Description") as demo:
         gr.HTML("""
         <div class="app-header">
           <h1 class="app-title">👁 ClearPath</h1>
-          <p class="app-subtitle">Real-Time Scene Description for Visually-Impaired People</p>
         </div>
-        <div class="pipeline-bar">
-          <span class="pipe-node">📥 Input</span>
           <span class="pipe-arrow">→</span>
-          <span class="pipe-node">🧠 Qwen2-VL</span>
           <span class="pipe-arrow">→</span>
-          <span class="pipe-node">🔍 Regex Classifier</span>
           <span class="pipe-arrow">→</span>
           <span class="pipe-node">🏷️ SAFE / DANGEROUS</span>
-          <span class="pipe-arrow">→</span>
-          <span class="pipe-node">🔊 TTS</span>
         </div>
         """)
@@ -217,135 +264,131 @@ def build_ui() -> gr.Blocks:
             with gr.TabItem("📁 Upload Image"):
                 with gr.Row():
                     with gr.Column(scale=1):
-                        upload_input = gr.Image(
-                            label="Upload or drag an image here",
                             type="numpy",
-                            elem_classes=["panel"],
                         )
-                        upload_btn = gr.Button(
-                            "🔍  Describe Scene",
                             variant="primary",
-                            elem_classes=["primary-btn"],
                         )
                     with gr.Column(scale=1):
-                        upload_banner  = gr.HTML(visible=False)
-                        upload_caption = gr.Textbox(
-                            label="🔊 Scene Description",
                             lines=5,
                             interactive=False,
-                            placeholder="Scene description will appear here after analysis…",
-                            elem_classes=["caption-output"],
-                            visible=False,
                         )
-                        upload_speak_btn = gr.Button(
-                            "▶ Read Aloud Again",
-                            visible=False,
-                            elem_classes=["primary-btn"],
-                        )
-                history_md_upload = gr.Markdown("", elem_classes=["history-md"])
-                upload_btn.click(
-                    fn=process_image,
-                    inputs=[upload_input],
-                    outputs=[upload_banner, upload_caption, upload_speak_btn, history_md_upload],
-                )
-                upload_speak_btn.click(
-                    fn=lambda cap: tts.speak_async(cap),
-                    inputs=[upload_caption],
                 )
             # ── Tab 2: Webcam ─────────────────────────────────────────────────
             with gr.TabItem("📷 Webcam"):
                 with gr.Row():
                     with gr.Column(scale=1):
-                        webcam_input = gr.Image(
-                            label="Webcam — click the camera button to capture",
                             sources=["webcam"],
                             type="numpy",
-                            elem_classes=["panel"],
                         )
                         cam_btn = gr.Button(
-                            "📸  Capture & Describe",
                             variant="primary",
-                            elem_classes=["primary-btn"],
                         )
                     with gr.Column(scale=1):
-                        cam_banner  = gr.HTML(visible=False)
                         cam_caption = gr.Textbox(
                             label="🔊 Scene Description",
                             lines=5,
                             interactive=False,
-                            placeholder="Point your camera at a scene and click Capture…",
-                            elem_classes=["caption-output"],
-                            visible=False,
-                        )
-                        cam_speak_btn = gr.Button(
-                            "▶ Read Aloud Again",
-                            visible=False,
-                            elem_classes=["primary-btn"],
                         )
-                history_md_cam = gr.Markdown("", elem_classes=["history-md"])
                 cam_btn.click(
-                    fn=process_image,
-                    inputs=[webcam_input],
-                    outputs=[cam_banner, cam_caption, cam_speak_btn, history_md_cam],
-                )
-                cam_speak_btn.click(
-                    fn=lambda cap: tts.speak_async(cap),
-                    inputs=[cam_caption],
                 )
             # ── Tab 3: Video ──────────────────────────────────────────────────
             with gr.TabItem("🎬 Video"):
-                gr.Markdown("Upload a video — ClearPath extracts one frame every N seconds and describes each.")
                 with gr.Row():
-                    video_input    = gr.Video(label="Upload Video")
-                    interval_input = gr.Slider(1, 10, value=3, step=1, label="Capture interval (seconds)")
-                video_btn = gr.Button("▶  Analyse Video", variant="primary", elem_classes=["primary-btn"])
-                video_captions = gr.Dataframe(
-                    headers=["Frame #", "Time (s)", "Label", "Hazards", "Caption"],
                     datatype=["number", "number", "str", "str", "str"],
                     visible=False,
                 )
-                def process_video(video_path, interval):
-                    if video_path is None:
                         return gr.update(visible=False)
-                    cap  = cv2.VideoCapture(video_path)
-                    fps  = cap.get(cv2.CAP_PROP_FPS) or 25
-                    step = max(1, int(fps * interval))
-                    rows, idx, sample_no = [], 0, 0
                     while True:
                         ret, frame = cap.read()
                         if not ret:
                             break
                         if idx % step == 0:
-                            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                            pil = Image.fromarray(rgb)
                             try:
-                                caption, result = run_pipeline(pil)
                             except Exception as e:
-                                caption, result = str(e), ClassificationResult("ERROR", [], [])
-                            rows.append([sample_no + 1, round(idx / fps, 1),
-                                         result.label, ", ".join(result.hazards) or "—", caption])
-                            sample_no += 1
                         idx += 1
                     cap.release()
                     return gr.update(value=rows, visible=True)
-                video_btn.click(
-                    fn=process_video,
-                    inputs=[video_input, interval_input],
-                    outputs=[video_captions],
-                )
-        # ── Analysis History ──────────────────────────────────────────────────
         with gr.Accordion("📋 Analysis History", open=False):
-            gr.Markdown("Recent analyses appear here after each run.", elem_classes=["history-md"])
     return demo

 """
+app.py — ClearPath: Real-Time Scene Description for Visually-Impaired People
+Pipeline: Upload Image → ViT-GPT2 Caption → Regex Safety Classifier → SAFE / DANGEROUS
 """
 import gradio as gr
 import numpy as np
 import logging
+import time
+import cv2
+from PIL import Image
+from scene_captioner   import SceneCaptioner
+from safety_classifier import SafetyClassifier, ClassificationResult
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 logger = logging.getLogger(__name__)
+# ── Load pipeline once at startup ─────────────────────────────────────────────
+logger.info("🚀 Starting ClearPath — loading captioner …")
 captioner  = SceneCaptioner()
 classifier = SafetyClassifier()
+logger.info(f"✅ Pipeline ready — captioner backend: {captioner._backend}")
 history_log: list[dict] = []
+# ── Core pipeline function ────────────────────────────────────────────────────
def analyse(image: np.ndarray):
    """
    Caption one image, classify the caption, and render the result for the UI.

    Pipeline:
      1. Convert the NumPy array to an RGB PIL image.
      2. ``captioner.describe()`` produces the caption string.
      3. ``classifier.classify()`` labels the caption SAFE / DANGEROUS.
      4. Build the HTML banner and record the run in ``history_log``.

    Args:
        image: Image array handed over by the Gradio image component, or
            ``None`` when nothing was uploaded or captured.

    Returns:
        A 3-tuple ``(banner_html, caption, history_markdown)``. When ``image``
        is ``None`` the banner is an informational prompt, the caption is an
        empty string, and nothing is added to the history.
    """
    import html  # local import so this block does not depend on a file-level change

    # Only the newest rows are ever displayed; cap the log so a long-running
    # Space does not accumulate entries forever.
    history_limit = 100

    if image is None:
        return (
            _info_html("⬆️ Please upload an image first.", "#6366f1"),
            "",
            _build_history_md(),
        )

    t0  = time.time()
    pil = Image.fromarray(image).convert("RGB")

    # ── Step 2: Caption ───────────────────────────────────────────────────────
    # Captioning is best-effort: on failure we fall back to a fixed sentence so
    # the classifier and the UI still receive a string.
    try:
        caption = captioner.describe(pil)
    except Exception as exc:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("Caption error: %s", exc)
        caption = "Unable to generate caption for this image."

    # ── Step 3: Classify ──────────────────────────────────────────────────────
    result  = classifier.classify(caption)
    elapsed = round(time.time() - t0, 2)

    # Everything interpolated into the banner is escaped: matched tokens come
    # from model output, and category/backend names may contain "&" or "<".
    backend = html.escape(str(captioner._backend))

    # ── Build banner HTML ─────────────────────────────────────────────────────
    if result.label == "DANGEROUS":
        hazard_str  = " &nbsp;|&nbsp; ".join(html.escape(str(h)) for h in result.hazards)
        token_str   = ", ".join(html.escape(str(m)) for m in result.matches[:8])
        banner_html = f"""
        <div style="
            background:rgba(239,68,68,0.12);
            border:2px solid rgba(239,68,68,0.45);
            border-radius:14px; padding:1.1rem 1.4rem;
            display:flex; align-items:flex-start; gap:1rem;
            animation: fadeIn .3s ease;
        ">
          <span style="font-size:2.5rem; line-height:1;">⚠️</span>
          <div>
            <div style="font-weight:800; font-size:1.15rem; color:#fca5a5;
                        letter-spacing:.04em; margin-bottom:.3rem;">
              DANGER DETECTED
            </div>
            <div style="font-size:.85rem; color:#f87171; margin-bottom:.25rem;">
              <strong>Categories:</strong> {hazard_str}
            </div>
            <div style="font-size:.75rem; color:#94a3b8; font-family:monospace;">
              <strong>Matched tokens:</strong> {token_str}
            </div>
            <div style="font-size:.7rem; color:#64748b; margin-top:.3rem;">
              ⏱ Analysed in {elapsed}s &nbsp;|&nbsp; Backend: {backend}
            </div>
          </div>
        </div>"""
    else:
        banner_html = f"""
        <div style="
            background:rgba(34,197,94,0.1);
            border:2px solid rgba(34,197,94,0.4);
            border-radius:14px; padding:1.1rem 1.4rem;
            display:flex; align-items:flex-start; gap:1rem;
        ">
          <span style="font-size:2.5rem; line-height:1;">✅</span>
          <div>
            <div style="font-weight:800; font-size:1.15rem; color:#86efac;
                        letter-spacing:.04em; margin-bottom:.3rem;">
              SAFE ENVIRONMENT
            </div>
            <div style="font-size:.85rem; color:#4ade80;">
              No hazards detected by the 16-category regex engine.
            </div>
            <div style="font-size:.7rem; color:#64748b; margin-top:.3rem;">
              ⏱ Analysed in {elapsed}s &nbsp;|&nbsp; Backend: {backend}
            </div>
          </div>
        </div>"""

    # ── Log to history (newest first) ─────────────────────────────────────────
    history_log.insert(0, {
        "time"   : time.strftime("%H:%M:%S"),
        "label"  : result.label,
        "hazards": ", ".join(result.hazards) if result.hazards else "—",
        "caption": caption,
    })
    del history_log[history_limit:]

    return banner_html, caption, _build_history_md()
+def _info_html(msg: str, color: str) -> str:
     return (
+        f'<div style="background:rgba(99,102,241,.08);border:1px solid {color}33;'
+        f'border-radius:12px;padding:1rem 1.25rem;color:#94a3b8;font-size:.9rem;">'
+        f'{msg}</div>'
     )
def _build_history_md() -> str:
    """
    Render the ten most recent ``history_log`` entries as a Markdown table.

    Returns:
        A placeholder sentence when the log is empty; otherwise a table with
        Time / Result / Hazards / Caption columns, newest entry first.
        Captions longer than 70 characters are cut and suffixed with "…".
    """
    if not history_log:
        return "_No analyses yet — upload an image above._"

    def _cell(text) -> str:
        # A raw "|" would start a new column; escape it after any truncation
        # so the backslash is never separated from its pipe.
        return str(text).replace("|", "\\|")

    def _one_line(text) -> str:
        # A newline would terminate the table row, so fold whitespace runs
        # (including line breaks) into single spaces.
        return " ".join(str(text).split())

    rows = ["| Time | Result | Hazards | Caption |",
            "|------|--------|---------|---------|"]
    for h in history_log[:10]:
        caption = _one_line(h["caption"])
        short = (caption[:70] + "…") if len(caption) > 70 else caption
        icon  = "⚠️" if h["label"] == "DANGEROUS" else "✅"
        rows.append(
            f"| `{h['time']}` | {icon} **{h['label']}** "
            f"| {_cell(_one_line(h['hazards']))} | {_cell(short)} |"
        )
    return "\n".join(rows)
 # ── Custom CSS ────────────────────────────────────────────────────────────────
 CSS = """
 @import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;600;800&family=JetBrains+Mono:wght@500;700&display=swap');
 body, .gradio-container {
+    background: #0a0a10 !important;
+    color: #e2e8f0 !important;
     font-family: 'DM Sans', sans-serif !important;
 }
+.gradio-container { max-width: 1100px !important; margin: 0 auto !important; }
+gradio-app { background: #0a0a10 !important; }
+/* Header */
 .app-header {
+    text-align: center;
+    padding: 2rem 1rem 1.25rem;
+    border-bottom: 1px solid rgba(99,102,241,.2);
+    margin-bottom: 1.25rem;
+    background: linear-gradient(180deg,rgba(99,102,241,.07) 0%,transparent 100%);
 }
 .app-title {
+    font-size: 2.5rem; font-weight: 800; letter-spacing: -.03em; margin: 0;
+    background: linear-gradient(135deg,#a5b4fc,#e879f9);
     -webkit-background-clip: text; -webkit-text-fill-color: transparent;
 }
+.app-sub { color: #64748b; font-size: .9rem; margin-top: .4rem; }
+/* Pipeline bar */
+.pipe-bar {
     display: flex; align-items: center; justify-content: center;
+    flex-wrap: wrap; gap: .4rem;
+    padding: .75rem; margin-bottom: 1.25rem;
+    background: rgba(99,102,241,.04);
+    border: 1px solid rgba(99,102,241,.15); border-radius: 12px;
     font-family: 'JetBrains Mono', monospace; font-size: .75rem;
 }
 .pipe-node {
+    background: rgba(99,102,241,.14); border: 1px solid rgba(99,102,241,.3);
+    color: #a5b4fc; padding: .25rem .75rem; border-radius: 7px; font-weight: 700;
 }
+.pipe-arrow { color: #334155; font-size: .9rem; }
+/* Panels */
+.gr-block, .gr-box, .panel {
+    background: #13131e !important;
+    border: 1px solid rgba(99,102,241,.2) !important;
     border-radius: 14px !important;
 }
+/* Upload widget */
+.gr-image { border-radius: 12px !important; }
+/* Caption textbox */
+.gr-textbox textarea {
     background: rgba(255,255,255,.03) !important;
+    border: 1px solid rgba(99,102,241,.2) !important;
+    border-radius: 10px !important;
+    color: #e2e8f0 !important;
     font-family: 'DM Sans', sans-serif !important;
+    font-size: .95rem !important;
+    line-height: 1.75 !important;
 }
+/* Buttons */
+.gr-button-primary, button[variant=primary] {
+    background: linear-gradient(135deg,#6366f1,#8b5cf6) !important;
+    border: none !important; border-radius: 10px !important;
+    color: white !important; font-weight: 700 !important;
+    font-family: 'DM Sans', sans-serif !important;
+    font-size: .95rem !important;
+    transition: opacity .2s !important;
+}
+.gr-button-primary:hover { opacity: .85 !important; }
+/* History table */
+.history-box table { width: 100%; border-collapse: collapse; font-size: .8rem; }
+.history-box th {
+    background: rgba(99,102,241,.1); color: #a5b4fc;
+    padding: .4rem .65rem; text-align: left;
+    border-bottom: 1px solid rgba(99,102,241,.2);
+}
+.history-box td {
+    padding: .4rem .65rem; color: #64748b;
+    border-bottom: 1px solid rgba(255,255,255,.04);
+    vertical-align: top;
 }
+/* Tabs */
+.tab-nav button {
+    font-family: 'DM Sans', sans-serif !important;
+    font-weight: 600 !important; color: #64748b !important;
+}
+.tab-nav button.selected { color: #a5b4fc !important; }
+@keyframes fadeIn { from {opacity:0;transform:translateY(-6px)} to {opacity:1;transform:translateY(0)} }
 """
+# ── Build Gradio UI ───────────────────────────────────────────────────────────
def build_ui():
    """
    Assemble the ClearPath Gradio interface.

    Layout: header + pipeline bar, three tabs (image upload, webcam snapshot,
    video sampling) and a collapsible history panel shared by the two
    single-image tabs.

    Returns:
        The constructed ``gr.Blocks`` app; launching it is left to the caller.
    """
    with gr.Blocks(css=CSS, title="ClearPath — Scene Description") as demo:
        # ── Header ────────────────────────────────────────────────────────────
        gr.HTML("""
        <div class="app-header">
          <h1 class="app-title">👁 ClearPath</h1>
          <p class="app-sub">Real-Time Scene Description for Visually-Impaired People</p>
        </div>
        <div class="pipe-bar">
          <span class="pipe-node">📥 Image Input</span>
          <span class="pipe-arrow">→</span>
          <span class="pipe-node">🧠 ViT-GPT2 / BLIP Captioning</span>
          <span class="pipe-arrow">→</span>
          <span class="pipe-node">🔍 Regex Safety Classifier</span>
          <span class="pipe-arrow">→</span>
          <span class="pipe-node">🏷️ SAFE / DANGEROUS</span>
        </div>
        """)

        # NOTE(review): the tabs container line falls in a gap between the two
        # diff hunks and is reconstructed from the indentation of the tab items
        # below — confirm its arguments against the real app.py.
        with gr.Tabs():
            # ── Tab 1: Upload ─────────────────────────────────────────────────
            with gr.TabItem("📁 Upload Image"):
                with gr.Row():
                    with gr.Column(scale=1):
                        img_input = gr.Image(
                            label="Upload or drag an image",
                            type="numpy",
                            height=300,
                        )
                        analyse_btn = gr.Button(
                            "🔍  Analyse Scene",
                            variant="primary",
                            size="lg",
                        )
                    with gr.Column(scale=1):
                        result_banner = gr.HTML(
                            value='<div style="background:rgba(99,102,241,.06);border:1px solid rgba(99,102,241,.2);'
                                  'border-radius:12px;padding:1.25rem;color:#475569;text-align:center;">'
                                  '⬆️ Upload an image and click <strong>Analyse Scene</strong></div>'
                        )
                        caption_out = gr.Textbox(
                            label="🔊 Scene Description (generated caption)",
                            lines=5,
                            interactive=False,
                            placeholder="The AI-generated scene description will appear here…",
                        )

            # ── Tab 2: Webcam ─────────────────────────────────────────────────
            with gr.TabItem("📷 Webcam"):
                with gr.Row():
                    with gr.Column(scale=1):
                        cam_input = gr.Image(
                            label="Webcam — capture a snapshot",
                            sources=["webcam"],
                            type="numpy",
                            height=300,
                        )
                        cam_btn = gr.Button(
                            "📸  Capture & Analyse",
                            variant="primary",
                            size="lg",
                        )
                    with gr.Column(scale=1):
                        cam_banner  = gr.HTML(
                            value='<div style="background:rgba(99,102,241,.06);border:1px solid rgba(99,102,241,.2);'
                                  'border-radius:12px;padding:1.25rem;color:#475569;text-align:center;">'
                                  '📷 Point your camera and click <strong>Capture & Analyse</strong></div>'
                        )
                        cam_caption = gr.Textbox(
                            label="🔊 Scene Description",
                            lines=5,
                            interactive=False,
                        )

            # ── Tab 3: Video ──────────────────────────────────────────────────
            with gr.TabItem("🎬 Video"):
                gr.Markdown("Upload a video — ClearPath samples one frame every N seconds.")
                with gr.Row():
                    vid_input = gr.Video(label="Upload Video")
                    interval  = gr.Slider(1, 10, value=3, step=1, label="Interval (seconds)")
                vid_btn = gr.Button("▶  Analyse Video", variant="primary")
                vid_out = gr.Dataframe(
                    headers=["Frame", "Time (s)", "Label", "Hazards", "Caption"],
                    datatype=["number", "number", "str", "str", "str"],
                    visible=False,
                )

                def analyse_video(path, secs):
                    """Caption and classify one frame every ``secs`` seconds of the video at ``path``."""
                    if path is None:
                        return gr.update(visible=False)
                    cap   = cv2.VideoCapture(path)
                    # Some containers report 0 fps; fall back to 25.
                    fps   = cap.get(cv2.CAP_PROP_FPS) or 25
                    step  = max(1, int(fps * secs))
                    rows, idx, n = [], 0, 0
                    # try/finally so the capture handle is released even when a
                    # frame conversion raises part-way through the file.
                    try:
                        while True:
                            ret, frame = cap.read()
                            if not ret:
                                break
                            if idx % step == 0:
                                pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                                # A failure on one frame becomes an ERROR row
                                # instead of aborting the whole video.
                                try:
                                    cap_text = captioner.describe(pil)
                                    res      = classifier.classify(cap_text)
                                except Exception as e:
                                    cap_text, res = str(e), ClassificationResult("ERROR", [], [])
                                rows.append([n + 1, round(idx / fps, 1),
                                             res.label, ", ".join(res.hazards) or "—", cap_text])
                                n += 1
                            idx += 1
                    finally:
                        cap.release()
                    return gr.update(value=rows, visible=True)

                vid_btn.click(fn=analyse_video, inputs=[vid_input, interval], outputs=[vid_out])

        # ── History ───────────────────────────────────────────────────────────
        with gr.Accordion("📋 Analysis History", open=False):
            history_out = gr.Markdown(
                "_No analyses yet._",
                elem_classes=["history-box"],
            )

        # Each button gets exactly ONE click listener. Registering a second
        # listener does not replace the first: both would fire, running the
        # captioning model twice per click and logging two history rows.
        # The wiring sits here, after the accordion, because it needs
        # `history_out`; `analyse` already returns the refreshed history
        # Markdown as its third value, so no wrapper is required.
        analyse_btn.click(
            fn=analyse,
            inputs=[img_input],
            outputs=[result_banner, caption_out, history_out],
        )
        cam_btn.click(
            fn=analyse,
            inputs=[cam_input],
            outputs=[cam_banner, cam_caption, history_out],
        )
    return demo