"""
app.py — ClearPath: Real-Time Scene Description for Visually-Impaired People
Pipeline: Image (upload, webcam snapshot, or sampled video frame) → ViT-GPT2 / BLIP Caption → Regex Safety Classifier → SAFE / DANGEROUS
"""
import gradio as gr
import numpy as np
import logging
import time
import cv2
from PIL import Image
from scene_captioner import SceneCaptioner
from safety_classifier import SafetyClassifier, ClassificationResult
# Configure root logging before anything else so the startup messages below
# are emitted with timestamps and levels.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
# ── Load pipeline once at startup ─────────────────────────────────────────────
# Both objects are built at import time and reused by every handler in this
# module (analyse and the video tab), so they are constructed exactly once.
logger.info("🚀 Starting ClearPath — loading captioner …")
captioner = SceneCaptioner()
classifier = SafetyClassifier()
# NOTE(review): `_backend` is a private attribute of SceneCaptioner; a public
# accessor would be preferable if one exists — confirm in scene_captioner.
logger.info(f"✅ Pipeline ready — captioner backend: {captioner._backend}")
# Module-level record of past analyses, newest first (analyse() inserts at
# index 0). Being a module global, it is shared by everything in this process.
history_log: list[dict] = []
# ── Core pipeline function ────────────────────────────────────────────────────
def analyse(image: np.ndarray):
    """
    Run the full ClearPath pipeline on one image and build the UI outputs.

    Steps:
      1. Convert the numpy array to an RGB PIL image.
      2. ``captioner.describe()`` produces the caption string.
      3. ``classifier.classify()`` labels the caption; a ``"DANGEROUS"`` label
         yields the danger banner, any other label the safe banner.
      4. The result is recorded at the front of ``history_log``.

    Args:
        image: Image array supplied by the Gradio image component, or ``None``
            when nothing has been uploaded or captured yet.

    Returns:
        A ``(banner_html, caption, history_markdown)`` tuple of strings.
        If the image cannot be converted or captioned, the banner reports the
        failure instead of a verdict and nothing is added to the history.
    """
    if image is None:
        return (
            _info_html("⬆️ Please upload an image first.", "#6366f1"),
            "",
            _build_history_md(),
        )

    # Only the newest entries are ever rendered, so cap what is retained to
    # keep a long-running server from accumulating history without bound.
    max_history = 100

    t0 = time.time()

    # ── Steps 1–2: Convert + caption ──────────────────────────────────────────
    try:
        pil = Image.fromarray(image).convert("RGB")
        caption = captioner.describe(pil)
    except Exception as exc:
        logger.exception("Caption error: %s", exc)
        # Do NOT classify the fallback sentence: it contains no hazard words,
        # so it would be reported as a SAFE environment even though the scene
        # was never actually analysed.
        return (
            _info_html(
                "⚠️ Could not analyse this image — no safety verdict available.",
                "#f59e0b",
            ),
            "Unable to generate caption for this image.",
            _build_history_md(),
        )

    # ── Step 3: Classify ──────────────────────────────────────────────────────
    result = classifier.classify(caption)
    elapsed = round(time.time() - t0, 2)

    # ── Build banner HTML ─────────────────────────────────────────────────────
    if result.label == "DANGEROUS":
        hazard_str = " | ".join(result.hazards)
        # Show at most eight matched tokens to keep the banner compact.
        token_str = ", ".join(result.matches[:8])
        banner_html = f"""
⚠️
DANGER DETECTED
Categories: {hazard_str}
Matched tokens: {token_str}
⏱ Analysed in {elapsed}s | Backend: {captioner._backend}
"""
    else:
        banner_html = f"""
✅
SAFE ENVIRONMENT
No hazards detected by the 16-category regex engine.
⏱ Analysed in {elapsed}s | Backend: {captioner._backend}
"""

    # ── Log to history (newest first) ─────────────────────────────────────────
    history_log.insert(0, {
        "time": time.strftime("%H:%M:%S"),
        "label": result.label,
        "hazards": ", ".join(result.hazards) if result.hazards else "—",
        "caption": caption,
    })
    del history_log[max_history:]

    return banner_html, caption, _build_history_md()
def _info_html(msg: str, color: str) -> str:
    """Return a small HTML snippet showing *msg* in the given CSS *color*.

    Args:
        msg: Text (or trusted inline HTML) to display; inserted verbatim.
        color: Any CSS colour value, e.g. ``"#6366f1"``.

    Returns:
        A single ``<div>`` element as a string.
    """
    # The literal is kept on one line: a quoted string may not span a raw
    # newline, and the colour argument must actually reach the markup.
    return f'<div style="color:{color};">{msg}</div>'
def _build_history_md() -> str:
    """Render the ten newest ``history_log`` entries as a Markdown table.

    Captions longer than 70 characters are cut and suffixed with ``…``.
    Hazard and caption text is sanitised so that it cannot break the table.

    Returns:
        The Markdown table, or a placeholder sentence when the log is empty.
    """
    if not history_log:
        return "_No analyses yet — upload an image above._"

    def cell(text) -> str:
        # A raw "|" would open a new column and a line break would end the
        # row, so neutralise both before the text is placed in a table cell.
        return " ".join(str(text).splitlines()).replace("|", "\\|")

    rows = [
        "| Time | Result | Hazards | Caption |",
        "|------|--------|---------|---------|",
    ]
    for entry in history_log[:10]:
        caption = entry["caption"]
        # Truncate first so the 70-character budget counts visible text only.
        short = caption[:70] + "…" if len(caption) > 70 else caption
        icon = "⚠️" if entry["label"] == "DANGEROUS" else "✅"
        rows.append(
            f"| `{entry['time']}` | {icon} **{entry['label']}** "
            f"| {cell(entry['hazards'])} | {cell(short)} |"
        )
    return "\n".join(rows)
# ── Custom CSS ────────────────────────────────────────────────────────────────
# Stylesheet handed to gr.Blocks(css=CSS) in build_ui(). It pulls the DM Sans
# and JetBrains Mono fonts from Google Fonts via @import and applies a dark
# theme; the `.history-box` rules target the history Markdown component, which
# is created with elem_classes=["history-box"].
# NOTE(review): the `fadeIn` keyframes are not referenced by any rule in this
# stylesheet — presumably used from inline styles elsewhere; verify.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;600;800&family=JetBrains+Mono:wght@500;700&display=swap');
body, .gradio-container {
background: #0a0a10 !important;
color: #e2e8f0 !important;
font-family: 'DM Sans', sans-serif !important;
}
.gradio-container { max-width: 1100px !important; margin: 0 auto !important; }
gradio-app { background: #0a0a10 !important; }
/* Header */
.app-header {
text-align: center;
padding: 2rem 1rem 1.25rem;
border-bottom: 1px solid rgba(99,102,241,.2);
margin-bottom: 1.25rem;
background: linear-gradient(180deg,rgba(99,102,241,.07) 0%,transparent 100%);
}
.app-title {
font-size: 2.5rem; font-weight: 800; letter-spacing: -.03em; margin: 0;
background: linear-gradient(135deg,#a5b4fc,#e879f9);
-webkit-background-clip: text; -webkit-text-fill-color: transparent;
}
.app-sub { color: #64748b; font-size: .9rem; margin-top: .4rem; }
/* Pipeline bar */
.pipe-bar {
display: flex; align-items: center; justify-content: center;
flex-wrap: wrap; gap: .4rem;
padding: .75rem; margin-bottom: 1.25rem;
background: rgba(99,102,241,.04);
border: 1px solid rgba(99,102,241,.15); border-radius: 12px;
font-family: 'JetBrains Mono', monospace; font-size: .75rem;
}
.pipe-node {
background: rgba(99,102,241,.14); border: 1px solid rgba(99,102,241,.3);
color: #a5b4fc; padding: .25rem .75rem; border-radius: 7px; font-weight: 700;
}
.pipe-arrow { color: #334155; font-size: .9rem; }
/* Panels */
.gr-block, .gr-box, .panel {
background: #13131e !important;
border: 1px solid rgba(99,102,241,.2) !important;
border-radius: 14px !important;
}
/* Upload widget */
.gr-image { border-radius: 12px !important; }
/* Caption textbox */
.gr-textbox textarea {
background: rgba(255,255,255,.03) !important;
border: 1px solid rgba(99,102,241,.2) !important;
border-radius: 10px !important;
color: #e2e8f0 !important;
font-family: 'DM Sans', sans-serif !important;
font-size: .95rem !important;
line-height: 1.75 !important;
}
/* Buttons */
.gr-button-primary, button[variant=primary] {
background: linear-gradient(135deg,#6366f1,#8b5cf6) !important;
border: none !important; border-radius: 10px !important;
color: white !important; font-weight: 700 !important;
font-family: 'DM Sans', sans-serif !important;
font-size: .95rem !important;
transition: opacity .2s !important;
}
.gr-button-primary:hover { opacity: .85 !important; }
/* History table */
.history-box table { width: 100%; border-collapse: collapse; font-size: .8rem; }
.history-box th {
background: rgba(99,102,241,.1); color: #a5b4fc;
padding: .4rem .65rem; text-align: left;
border-bottom: 1px solid rgba(99,102,241,.2);
}
.history-box td {
padding: .4rem .65rem; color: #64748b;
border-bottom: 1px solid rgba(255,255,255,.04);
vertical-align: top;
}
/* Tabs */
.tab-nav button {
font-family: 'DM Sans', sans-serif !important;
font-weight: 600 !important; color: #64748b !important;
}
.tab-nav button.selected { color: #a5b4fc !important; }
@keyframes fadeIn { from {opacity:0;transform:translateY(-6px)} to {opacity:1;transform:translateY(0)} }
"""
# ── Build Gradio UI ───────────────────────────────────────────────────────────
def build_ui():
    """Assemble the ClearPath Gradio interface and return the Blocks app.

    Layout: a pipeline banner, three tabs (image upload, webcam snapshot,
    video sampling) and a collapsible analysis-history table.

    Each Analyse button is wired to ``analyse`` exactly once, with the history
    table as its third output. A second registration would run the captioner
    twice per click and add two history rows for a single analysis.

    Returns:
        The ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(css=CSS, title="ClearPath — Scene Description") as demo:
        # ── Header ────────────────────────────────────────────────────────────
        gr.HTML("""
📥 Image Input
→
🧠 ViT-GPT2 / BLIP Captioning
→
🔍 Regex Safety Classifier
→
🏷️ SAFE / DANGEROUS
""")
        with gr.Tabs():
            # ── Tab 1: Image Upload ───────────────────────────────────────────
            with gr.TabItem("📁 Upload Image"):
                with gr.Row():
                    with gr.Column(scale=1):
                        img_input = gr.Image(
                            label="Upload or drag an image",
                            type="numpy",
                            height=300,
                        )
                        analyse_btn = gr.Button(
                            "🔍 Analyse Scene",
                            variant="primary",
                            size="lg",
                        )
                    with gr.Column(scale=1):
                        result_banner = gr.HTML(
                            value="⬆️ Upload an image and click Analyse Scene\n"
                        )
                        caption_out = gr.Textbox(
                            label="🔊 Scene Description (generated caption)",
                            lines=5,
                            interactive=False,
                            placeholder="The AI-generated scene description will appear here…",
                        )

            # ── Tab 2: Webcam ─────────────────────────────────────────────────
            with gr.TabItem("📷 Webcam"):
                with gr.Row():
                    with gr.Column(scale=1):
                        cam_input = gr.Image(
                            label="Webcam — capture a snapshot",
                            sources=["webcam"],
                            type="numpy",
                            height=300,
                        )
                        cam_btn = gr.Button(
                            "📸 Capture & Analyse",
                            variant="primary",
                            size="lg",
                        )
                    with gr.Column(scale=1):
                        cam_banner = gr.HTML(
                            value="📷 Point your camera and click Capture & Analyse\n"
                        )
                        cam_caption = gr.Textbox(
                            label="🔊 Scene Description",
                            lines=5,
                            interactive=False,
                        )

            # ── Tab 3: Video ──────────────────────────────────────────────────
            with gr.TabItem("🎬 Video"):
                gr.Markdown("Upload a video — ClearPath samples one frame every N seconds.")
                with gr.Row():
                    vid_input = gr.Video(label="Upload Video")
                    interval = gr.Slider(1, 10, value=3, step=1, label="Interval (seconds)")
                vid_btn = gr.Button("▶ Analyse Video", variant="primary")
                vid_out = gr.Dataframe(
                    headers=["Frame", "Time (s)", "Label", "Hazards", "Caption"],
                    datatype=["number", "number", "str", "str", "str"],
                    visible=False,
                )

                def analyse_video(path, secs):
                    """Caption and classify one frame every *secs* seconds of the video."""
                    if path is None:
                        return gr.update(visible=False)
                    rows = []
                    cap = cv2.VideoCapture(path)
                    try:
                        # Fall back to 25 fps when the container reports none (0.0).
                        fps = cap.get(cv2.CAP_PROP_FPS) or 25
                        step = max(1, int(fps * secs))
                        idx = 0
                        while True:
                            ret, frame = cap.read()
                            if not ret:
                                break
                            if idx % step == 0:
                                try:
                                    pil = Image.fromarray(
                                        cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                                    )
                                    cap_text = captioner.describe(pil)
                                    res = classifier.classify(cap_text)
                                except Exception as e:
                                    # One bad frame must not abort the whole video;
                                    # surface the error text in that frame's row.
                                    logger.exception("Video frame %d failed: %s", idx, e)
                                    cap_text, res = str(e), ClassificationResult("ERROR", [], [])
                                rows.append([
                                    len(rows) + 1,
                                    round(idx / fps, 1),
                                    res.label,
                                    ", ".join(res.hazards) or "—",
                                    cap_text,
                                ])
                            idx += 1
                    finally:
                        # Always free the decoder, even if reading raises.
                        cap.release()
                    return gr.update(value=rows, visible=True)

                vid_btn.click(fn=analyse_video, inputs=[vid_input, interval], outputs=[vid_out])

        # ── History ───────────────────────────────────────────────────────────
        with gr.Accordion("📋 Analysis History", open=False):
            history_out = gr.Markdown(
                "_No analyses yet._",
                elem_classes=["history-box"],
            )

        # Wire the Analyse buttons once, here, because history_out must exist
        # first. analyse() already returns the refreshed history Markdown as
        # its third value, so no wrapper function is needed.
        analyse_btn.click(
            fn=analyse,
            inputs=[img_input],
            outputs=[result_banner, caption_out, history_out],
        )
        cam_btn.click(
            fn=analyse,
            inputs=[cam_input],
            outputs=[cam_banner, cam_caption, history_out],
        )
    return demo
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it on every network
    # interface at port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo = build_ui()
    demo.launch(**launch_options)