third-eye / app.py
mitvho09's picture
Deploy accessible futurist refresh
2d469d4 verified
Raw
History Blame Contribute Delete
21.2 kB
from __future__ import annotations
import math
import os
import struct
import tempfile
import warnings
import wave
from pathlib import Path
import gradio as gr
from utils import (
bytes_to_wav,
image_to_bytes,
prune_old_wavs,
read_audio_bytes,
safe_call,
)
# Filesystem anchors: bundled assets live in an "assets" folder beside this file.
ROOT = Path(__file__).parent
ASSETS = ROOT / "assets"
# Stylesheet text is read once at import time and handed to gr.Blocks.
CSS = (ASSETS / "custom.css").read_text(encoding="utf-8")

# Name of the Modal app to look functions up in; overridable via the environment.
MODAL_APP = os.getenv("THIRD_EYE_MODAL_APP", "third-eye-backend")

# Silence the Gradio deprecation notices for the two Blocks constructor
# arguments this app passes ("theme" is registered first, then "css").
for _blocks_param in ("theme", "css"):
    warnings.filterwarnings(
        "ignore",
        message=(
            rf"The '{_blocks_param}' parameter in the Blocks constructor "
            r"will be removed.*"
        ),
        category=DeprecationWarning,
    )

# Display name shown in the language picker -> language code used internally.
LANGUAGES = {"English": "en", "Chinese": "zh"}

# Language codes that resolve_question passes through to transcription;
# anything else falls back to "en".
STT_LANGUAGES = {"en", "zh"}

# Absolute paths of the bundled example images, in picker order.
SAMPLES = [
    str(ASSETS / file_name)
    for file_name in ("sample_menu.jpg", "sample_label.jpg", "sample_sign.jpg")
]

# Example label shown in the UI -> path of the image it loads.
SAMPLE_OPTIONS = dict(zip(("Cafe menu", "Medicine label", "Street sign"), SAMPLES))

# Canned questions offered in the "Quick question" dropdown.
QUICK_ASK_OPTIONS = [
    "What is directly in front of me?",
    "Read the important text exactly.",
    "What should I be careful about here?",
]
def iris_markup(state: str, label: str) -> str:
    """Return the HTML snippet for the iris status indicator.

    ``state`` is inserted as an extra CSS class on the ``iris`` element and
    ``label`` becomes the visible caption. Neither value is HTML-escaped, so
    callers should only pass trusted text.
    """
    markup_lines = (
        "",
        '<section class="iris-stage" aria-label="Third Eye status">',
        f'<div class="iris {state}" aria-hidden="true">',
        '<span class="iris-core"></span>',
        '<span class="scan-line"></span>',
        "</div>",
        f'<p class="state-label">{label}</p>',
        "</section>",
        "",
    )
    # The empty first/last entries reproduce the leading and trailing newline.
    return "\n".join(markup_lines)
def _backend_pref() -> str:
    """Return the ``THIRD_EYE_BACKEND`` setting, trimmed and lower-cased.

    Defaults to ``"auto"`` when the variable is not set.
    """
    raw_setting = os.getenv("THIRD_EYE_BACKEND", "auto")
    return raw_setting.strip().lower()
def zerogpu_available() -> bool:
    """Report whether the in-process ZeroGPU backend should serve inference.

    ``THIRD_EYE_BACKEND=zerogpu`` forces ``True``; ``modal`` or ``mock`` force
    ``False``. For any other setting the answer is whether the Hugging Face
    ``spaces`` package can be imported.
    """
    choice = _backend_pref()
    if choice in {"modal", "mock"}:
        return False
    if choice != "zerogpu":
        # Auto-detect: any failure while importing ``spaces`` means "no".
        try:
            import spaces  # noqa: F401
        except Exception:
            return False
    return True
# HF ZeroGPU scans app.py for @spaces.GPU-decorated functions at startup.
# Importing zerogpu_backend here ensures its @GPU functions are discoverable.
# The import is guarded so the module is only loaded when
# zerogpu_available() selects the ZeroGPU backend.
if zerogpu_available():
    import zerogpu_backend  # noqa: F401
def modal_available() -> bool:
    """Report whether the Modal backend should serve inference.

    ``THIRD_EYE_BACKEND=modal`` forces ``True``; ``zerogpu`` or ``mock`` force
    ``False``. Otherwise Modal counts as available only when both
    ``MODAL_TOKEN_ID`` and ``MODAL_TOKEN_SECRET`` are set to non-empty values.
    """
    forced = {"modal": True, "zerogpu": False, "mock": False}.get(_backend_pref())
    if forced is not None:
        return forced
    token_id = os.getenv("MODAL_TOKEN_ID")
    token_secret = os.getenv("MODAL_TOKEN_SECRET")
    return bool(token_id and token_secret)
def live_backend_available() -> bool:
    """Return ``True`` when either ZeroGPU or Modal can serve inference."""
    if zerogpu_available():
        return True
    return modal_available()
def mock_mode_enabled() -> bool:
    """Decide whether the app should run with canned (mock) responses.

    ``THIRD_EYE_MOCK`` accepts 1/true/yes/on to force mock mode and
    0/false/no/off to disable it (case-insensitive, surrounding whitespace
    ignored). Any other value, including the default ``auto``, enables mock
    mode exactly when no live backend is available.
    """
    flag = os.getenv("THIRD_EYE_MOCK", "auto").strip().lower()
    explicit = {
        "1": True,
        "true": True,
        "yes": True,
        "on": True,
        "0": False,
        "false": False,
        "no": False,
        "off": False,
    }
    if flag in explicit:
        return explicit[flag]
    return not live_backend_available()
def modal_call(function_name: str, *args):
    """Look up ``function_name`` in the ``MODAL_APP`` app and call it remotely.

    ``modal`` is imported lazily so the package is only needed when this
    backend is actually used. Returns whatever the remote call returns.
    """
    import modal

    return modal.Function.from_name(MODAL_APP, function_name).remote(*args)
def infer(function_name: str, *args):
    """Dispatch an inference call to ZeroGPU when available, else to Modal.

    Raises:
        RuntimeError: when neither backend is available.
    """
    if zerogpu_available():
        import zerogpu_backend

        handler = getattr(zerogpu_backend, function_name)
        return handler(*args)
    if modal_available():
        return modal_call(function_name, *args)
    raise RuntimeError(
        "No live inference backend is configured. Enable mock mode or set up "
        "ZeroGPU/Modal credentials."
    )
def backend_status_text() -> str:
    """Return a one-line description of where inference currently runs.

    Checks are made in priority order: mock mode, ZeroGPU, Modal, and finally
    the "nothing configured" message.
    """
    if mock_mode_enabled():
        message = (
            "Preview mode — the full interface runs without a GPU backend, "
            "so your image is never uploaded."
        )
    elif zerogpu_available():
        message = "Live on Hugging Face ZeroGPU. Models load on first use."
    elif modal_available():
        message = "Live inference backend connected."
    else:
        message = (
            "No live inference backend is configured. "
            "Enable mock mode or connect ZeroGPU/Modal."
        )
    return message
def mock_answer(mode: str, language: str) -> str:
    """Return the canned preview answer for ``mode``.

    ``language`` is echoed in a trailing "Output language" sentence.

    Raises:
        KeyError: if ``mode`` is not "Describe", "Ask" or "Read Text".
    """
    canned = {
        "Describe": (
            "Mock preview: I can see a clear, text-rich image ready for visual "
            "description. Deploy the Modal backend for a real model response."
        ),
        "Ask": (
            "Mock preview: your spoken or typed question was received. Deploy the "
            "Modal backend to answer it from the image."
        ),
        "Read Text": (
            "Mock preview: the image is ready for OCR. Deploy the Modal backend to "
            "read its exact text aloud."
        ),
    }
    preview = canned[mode]
    return preview + " Output language: " + f"{language}" + "."
def mock_tone() -> str:
    """Write a short two-note chime to a temporary WAV file and return its path.

    The clip is 0.55 s of 16-bit mono audio at 22 050 Hz: a 523.25 Hz sine plus
    a quieter 659.25 Hz sine, faded in over the first 500 samples and out over
    the last 800. The file is created with ``delete=False``, so the caller (or
    a separate cleanup step) is responsible for removing it.
    """
    sample_rate = 22_050
    duration = 0.55
    frames = bytearray()
    for index in range(int(sample_rate * duration)):
        # Linear fade-in / fade-out envelope, capped at full volume.
        envelope = min(1.0, index / 500, (sample_rate * duration - index) / 800)
        value = int(
            8_000
            * envelope
            * (
                math.sin(2 * math.pi * 523.25 * index / sample_rate)
                + 0.45 * math.sin(2 * math.pi * 659.25 * index / sample_rate)
            )
        )
        frames.extend(struct.pack("<h", value))

    # Reserve a unique path, then close the handle right away: keeping it open
    # leaks a file descriptor per call and blocks reopening by name on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as output:
        wav_path = output.name

    with wave.open(wav_path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(frames)
    return wav_path
def warmup_all(progress=gr.Progress()):
    """Call each inference function once so later requests start warm.

    Calls go through ``infer``, so they reach whichever live backend is active
    (ZeroGPU or Modal). Each step sends a tiny placeholder payload and records
    how long the call took.

    Args:
        progress: Gradio progress tracker, updated before each step.

    Returns:
        A newline-separated status report with one line per model, or a single
        explanatory line when no live backend is configured.
    """
    import time

    # Without a live backend every infer() call raises immediately, which the
    # loop below would misreport as "loaded (0s)". Say so plainly instead.
    if not live_backend_available():
        return "No live inference backend is configured; nothing to pre-load."

    steps = [
        ("Vision model", "describe_scene", b"\xff\xd8test", "Say hello.", "en", False),
        ("TTS model", "speak", "Warmup complete.", "en"),
        ("STT model", "transcribe_audio", b"\xff\xd8test", "en"),
    ]
    results = []
    for i, (label, func_name, *args) in enumerate(steps):
        progress(i / len(steps), desc=f"Warming up {label}...")
        # perf_counter is monotonic, so elapsed time is immune to clock changes.
        t0 = time.perf_counter()
        try:
            infer(func_name, *args)
            elapsed = time.perf_counter() - t0
            results.append(f"{label}: ready ({elapsed:.0f}s)")
        except Exception:
            # Best effort: the placeholder payloads are not real media, so a
            # failure here is reported as "loaded" rather than aborting warmup.
            # NOTE(review): this presumes the backend loads its model before
            # rejecting the input — confirm against the backend code.
            elapsed = time.perf_counter() - t0
            results.append(f"{label}: loaded ({elapsed:.0f}s)")
    progress(1.0, desc="All models warm!")
    return "\n".join(results)
def resolve_question(
    mode: str,
    audio_path: str | None,
    typed_question: str,
    language: str,
) -> str:
    """Work out the instruction text to send to the vision model.

    "Read Text" and "Describe" use fixed instructions. Otherwise a non-blank
    typed question wins; with no typed text and no recording a generic
    question is used; with a recording the audio is transcribed through the
    inference backend.

    Returns:
        The instruction or question, or ``""`` when transcription yields
        nothing.
    """
    fixed_instructions = {
        "Read Text": (
            "Read every word and number in this image exactly as written. "
            "Include all text, labels, prices, dates, directions, and signs. "
            "Do not interpret or explain — just read the text verbatim."
        ),
        "Describe": (
            "Describe this image for a blind person. Read any text you see word by word. "
            "Describe objects, people, colors, layout, and all visible details."
        ),
    }
    if mode in fixed_instructions:
        return fixed_instructions[mode]

    typed = typed_question.strip()
    if typed:
        return typed

    if not audio_path:
        return (
            "What is in this image? Read any text, labels, or writing visible. "
            "Describe objects, brands, colors, and details."
        )

    audio_bytes = read_audio_bytes(audio_path)
    if language in STT_LANGUAGES:
        stt_language = language
    else:
        # Unsupported for transcription: listen as English and tell the user.
        stt_language = "en"
        gr.Warning(
            "Speech recognition does not support this selected language. "
            "Listening as English; the answer can still use your selected language."
        )
    transcript = safe_call(
        infer,
        "transcribe_audio",
        audio_bytes,
        stt_language,
        fallback="",
        warn="I could not hear that. Type a question or record again.",
    )
    return transcript or ""
def run_pipeline(
    image,
    audio_path: str | None,
    typed_question: str,
    mode: str,
    language_name: str,
    progress=gr.Progress(),
):
    """Run one capture-to-answer request, yielding UI updates along the way.

    This generator is the handler for the submit button. Every yielded tuple
    is ``(audio, answer text, question text, iris HTML, status text)``, which
    is the order of the ``outputs`` list wired to ``submit.click``.

    Stages: resolve the question (transcribing audio in Ask mode when needed),
    run the vision model, then synthesize speech. In mock mode canned text and
    a generated tone stand in for the backend calls. A failure at any stage
    yields an explanatory state and stops early.

    Args:
        image: Captured or uploaded image, or ``None`` when nothing was given.
        audio_path: Path of the recorded question, if any.
        typed_question: Text typed by the user (may be blank).
        mode: "Describe", "Ask" or "Read Text".
        language_name: Display name of the answer language (key of LANGUAGES).
        progress: Gradio progress tracker.
    """
    # Bound temp-WAV disk usage without deleting a clip still being served.
    prune_old_wavs()
    # Guard: nothing to analyze without an image.
    if image is None:
        gr.Warning("No image captured. Point the camera or choose an example.")
        yield (
            None,
            "No image captured.",
            typed_question,
            iris_markup("idle", "Waiting for an image"),
            "Waiting for an image.",
        )
        return
    # Unknown display names fall back to English.
    language = LANGUAGES.get(language_name, "en")
    using_mock = mock_mode_enabled()
    # Show the "listening" state only when a recording will really be transcribed.
    if mode == "Ask" and audio_path and not typed_question.strip() and not using_mock:
        yield (
            None,
            "",
            "",
            iris_markup("listening", "Listening"),
            "Listening to your question.",
        )
        progress(0.15, desc="Transcribing your question")
    # Mock mode never calls the transcription backend.
    if using_mock and mode == "Ask":
        question = typed_question.strip() or "What is in front of me?"
    else:
        question = resolve_question(mode, audio_path, typed_question, language)
    # An empty question in Ask mode means transcription produced nothing.
    if mode == "Ask" and not question:
        yield (
            None,
            "I could not understand the question.",
            "",
            iris_markup("idle", "Ready to try again"),
            "Question not understood. Type it or record again.",
        )
        return
    yield (
        None,
        "",
        question,
        iris_markup("seeing", "Seeing"),
        "Analyzing the captured image.",
    )
    progress(0.35, desc="Loading vision model" if not using_mock else "Previewing")
    if using_mock:
        answer = mock_answer(mode, language_name)
    else:
        answer = safe_call(
            infer,
            "describe_scene",
            image_to_bytes(image),
            question,
            language,
            mode == "Read Text",  # tile: enlarge small text for verbatim OCR
            fallback="",
            warn="The vision model is unavailable. Please try once more.",
        ) or ""
    # An empty answer is treated as a failed analysis.
    if not answer:
        yield (
            None,
            "Could not analyze the image.",
            question,
            iris_markup("idle", "Ready to try again"),
            "Image analysis failed. Ready to try again.",
        )
        return
    # Show the text answer immediately while speech is still being prepared.
    yield (
        None,
        answer,
        question,
        iris_markup("thinking", "Preparing voice"),
        "The answer is ready. Preparing speech.",
    )
    progress(0.75, desc="Preparing spoken answer")
    if using_mock:
        audio_path_out = mock_tone()
    else:
        audio_bytes = safe_call(
            infer,
            "speak",
            answer,
            language,
            fallback=None,
            warn="Voice is unavailable. The large-text answer is still shown.",
        )
        # Speech is optional: without audio bytes the text answer stands alone.
        audio_path_out = bytes_to_wav(audio_bytes) if audio_bytes else None
    progress(1.0, desc="Ready")
    final_state = "speaking" if audio_path_out else "idle"
    final_label = "Speaking" if audio_path_out else "Answer shown"
    yield (
        audio_path_out,
        answer,
        question,
        iris_markup(final_state, final_label),
        f"{final_label}. The transcript is available below.",
    )
# Mode name -> caption shown on the primary action button.
MODE_LABELS = {
    "Describe": "Describe what I see",
    "Ask": "Ask Third Eye",
    "Read Text": "Read this text",
}
# Light is the default; the toggle pins dark by adding a single class.
# Client-side snippet run by the theme button (no Python callback involved).
THEME_TOGGLE_JS = """
() => { document.documentElement.classList.toggle('force-dark'); }
"""
# Header branding: decorative iris mark plus app title and tagline.
BRAND_HTML = """
<div class="brand">
<span class="brand-iris" aria-hidden="true"><span class="brand-iris-core"></span></span>
<div class="brand-text">
<h1>Third Eye</h1>
<p>Point your camera. Ask out loud. Listen to the answer.</p>
</div>
</div>
"""
# Introductory panel; the three-step strip is marked aria-hidden.
MISSION_PANEL_HTML = """
<section class="mission-panel" aria-label="Third Eye guidance">
<div class="mission-copy">
<p class="mission-kicker">BLIND-FIRST NAVIGATION</p>
<h2>One action at a time. Fast answers. Strong audio and text feedback.</h2>
<p class="mission-body">
Third Eye is designed to reduce hesitation in the real world: capture what is
ahead, ask what matters, and hear the result without hunting through a crowded interface.
</p>
</div>
<div class="mission-steps" aria-hidden="true">
<div><span>01</span><strong>Capture</strong><p>Camera, upload, or example scene.</p></div>
<div><span>02</span><strong>Ask</strong><p>Speak naturally or use a quick prompt.</p></div>
<div><span>03</span><strong>Listen</strong><p>Audio answer plus large transcript.</p></div>
</div>
</section>
"""
# Tip strip rendered in the capture card.
CAPTURE_ASSIST_HTML = """
<section class="assist-strip" aria-label="Capture tips">
<p><strong>Best results:</strong> hold the camera still, keep text centered, and move closer for labels or menus.</p>
</section>
"""
# Legend for the status labels, rendered in the answer card.
ANSWER_ASSIST_HTML = """
<section class="assist-strip answer-strip" aria-label="Answer tips">
<p><strong>Status guide:</strong> Listening means voice input, Seeing means image analysis, Thinking means answer generation, Speaking means audio playback.</p>
</section>
"""
def on_mode_change(mode: str):
    """Show the question inputs only in Ask mode and relabel the action button.

    Returns four Gradio updates: visibility for the three question inputs
    followed by the new button caption (unknown modes use the Describe
    caption).
    """
    show_question_inputs = mode == "Ask"
    button_caption = MODE_LABELS.get(mode, MODE_LABELS["Describe"])
    visibility_updates = tuple(
        gr.update(visible=show_question_inputs) for _ in range(3)
    )
    return (*visibility_updates, gr.update(value=button_caption))
def load_sample(sample_name: str):
    """Return the image path for a bundled example plus a status message.

    An unknown or missing ``sample_name`` falls back to the first bundled
    example. The status message always names the example that was actually
    loaded, so the announced text never disagrees with the image shown.

    Returns:
        ``(image_path, status_text)``.
    """
    if sample_name not in SAMPLE_OPTIONS:
        # Fall back to the first example and report that name, not the request.
        sample_name = next(iter(SAMPLE_OPTIONS))
    return SAMPLE_OPTIONS[sample_name], f"Loaded example: {sample_name}."
def apply_quick_prompt(prompt: str):
    """Return the chosen quick prompt, or an empty string when none is selected."""
    return prompt if prompt else ""
def build_demo() -> gr.Blocks:
    """Assemble the Gradio interface and wire its event handlers.

    Layout: a header row (branding, language picker, theme toggle), the
    mission panel, the mode selector, a two-column work area (capture inputs
    and answer outputs), a diagnostics accordion and a footer.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    theme = gr.themes.Base(
        primary_hue="indigo",
        secondary_hue="cyan",
        neutral_hue="slate",
    )
    with gr.Blocks(
        theme=theme,
        css=CSS,
        title="Third Eye",
        fill_width=True,
    ) as demo:
        # --- Header: branding, answer language, theme toggle ---
        with gr.Row(elem_classes="app-header"):
            gr.HTML(BRAND_HTML, padding=False)
            language = gr.Dropdown(
                choices=list(LANGUAGES),
                value="English",
                label="Answer language",
                show_label=False,
                elem_classes="language-picker",
                scale=0,
            )
            theme_btn = gr.Button(
                "◐ Theme",
                size="sm",
                elem_classes="theme-toggle",
                scale=0,
            )
        gr.HTML(MISSION_PANEL_HTML, padding=False)
        # Mode selector; "Describe" is the initial mode.
        mode = gr.Radio(
            choices=["Describe", "Ask", "Read Text"],
            value="Describe",
            show_label=False,
            container=False,
            elem_id="mode-cards",
            elem_classes="mode-cards",
        )
        with gr.Row(equal_height=True, elem_classes="work-area"):
            # --- Left column: image capture and question inputs ---
            with gr.Column(scale=1, elem_classes="glass-card capture-card"):
                gr.HTML(
                    '<div class="card-head"><span class="card-title">CAPTURE</span>'
                    '<span class="card-hint">Camera or upload</span></div>',
                    padding=False,
                )
                image = gr.Image(
                    label="Camera or image",
                    sources=["webcam", "upload"],
                    type="pil",
                    height=380,
                    show_label=False,
                    elem_classes="camera-frame",
                )
                gr.HTML(CAPTURE_ASSIST_HTML, padding=False)
                sample_choice = gr.Radio(
                    choices=list(SAMPLE_OPTIONS),
                    value="Cafe menu",
                    label="Bundled example scenes",
                    info="Use these when a camera is unavailable or for a quick demo.",
                    elem_classes="sample-picker",
                )
                load_example = gr.Button(
                    "Load selected example",
                    variant="secondary",
                    elem_classes="secondary-action",
                )
                # The three question inputs start hidden; on_mode_change
                # reveals them when the Ask mode is selected.
                audio_input = gr.Audio(
                    label="Speak your question",
                    sources=["microphone", "upload"],
                    type="filepath",
                    format="wav",
                    visible=False,
                    elem_classes="mic-input",
                )
                typed = gr.Textbox(
                    label="Type instead",
                    placeholder="Optional: type only if the microphone is unavailable.",
                    visible=False,
                    lines=2,
                )
                quick_prompt = gr.Dropdown(
                    choices=QUICK_ASK_OPTIONS,
                    label="Quick question",
                    info="Optional shortcut if speaking is difficult.",
                    visible=False,
                    elem_classes="quick-prompt",
                )
                submit = gr.Button(
                    MODE_LABELS["Describe"],
                    variant="primary",
                    size="lg",
                    elem_classes="primary-action",
                )
            # --- Right column: status, transcript and spoken answer ---
            with gr.Column(scale=1, elem_classes="glass-card answer-card"):
                gr.HTML(
                    '<div class="card-head"><span class="card-title">ANSWER</span></div>',
                    padding=False,
                )
                iris = gr.HTML(
                    iris_markup("idle", "Ready"),
                    elem_id="iris-shell",
                    padding=False,
                )
                status = gr.Textbox(
                    value="Ready. Capture an image or choose an example.",
                    label="Live status",
                    interactive=False,
                    elem_id="live-status",
                    elem_classes="sr-status",
                )
                question_output = gr.Textbox(
                    label="Question or instruction",
                    interactive=False,
                )
                answer = gr.Textbox(
                    label="Answer transcript",
                    interactive=False,
                    lines=7,
                    elem_classes="answer-output",
                )
                audio_output = gr.Audio(
                    label="Spoken answer",
                    autoplay=True,
                    interactive=False,
                )
                gr.HTML(ANSWER_ASSIST_HTML, padding=False)
        # --- Diagnostics: backend description and manual model warm-up ---
        with gr.Accordion(
            "Diagnostics", open=False, elem_classes="system-accordion"
        ):
            # The backend description is computed once, when the UI is built.
            gr.HTML(f"<p class='mode-note'>{backend_status_text()}</p>", padding=False)
            warmup_btn = gr.Button(
                "Pre-load models",
                variant="secondary",
                size="sm",
                elem_classes="diagnostics-btn",
            )
            warmup_output = gr.Textbox(
                label="Model status",
                interactive=False,
                lines=3,
            )
        gr.HTML(
            """
<footer class="footer-note">
Vision &amp; OCR by Qwen2.5-VL · Speech by Cohere Transcribe · Voice by VoxCPM2.
<span class="priv">Your image is processed only for your request and never stored.</span>
</footer>
""",
            padding=False,
        )
        # --- Event wiring ---
        # Changing mode toggles the question inputs and relabels the button.
        mode.change(
            fn=on_mode_change,
            inputs=mode,
            outputs=[audio_input, typed, quick_prompt, submit],
        )
        load_example.click(
            fn=load_sample,
            inputs=[sample_choice],
            outputs=[image, status],
            show_progress="hidden",
        )
        # Picking a quick question copies it into the typed-question box.
        quick_prompt.change(
            fn=apply_quick_prompt,
            inputs=[quick_prompt],
            outputs=[typed],
            show_progress="hidden",
        )
        # The outputs order here must match the tuples yielded by run_pipeline.
        submit.click(
            fn=run_pipeline,
            inputs=[image, audio_input, typed, mode, language],
            outputs=[audio_output, answer, question_output, iris, status],
            show_progress="full",
        )
        warmup_btn.click(
            fn=warmup_all,
            inputs=[],
            outputs=[warmup_output],
            show_progress="full",
        )
        # Theme toggle runs purely in the browser (fn=None, JS only).
        theme_btn.click(fn=None, inputs=None, outputs=None, js=THEME_TOGGLE_JS)
    return demo
# Module-level Blocks instance, built once at import time.
demo = build_demo()

if __name__ == "__main__":
    # THIRD_EYE_PORT wins; PORT is the fallback, then Gradio's usual 7860.
    launch_port = int(os.getenv("THIRD_EYE_PORT", os.getenv("PORT", "7860")))
    launch_host = os.getenv("THIRD_EYE_HOST", "0.0.0.0")
    # queue() configures and returns the same Blocks object, so configuring
    # and launching in two statements is equivalent to chaining them.
    demo.queue(default_concurrency_limit=2)
    demo.launch(
        server_name=launch_host,
        server_port=launch_port,
        show_error=False,
    )