from __future__ import annotations import math import os import struct import tempfile import warnings import wave from pathlib import Path import gradio as gr from utils import ( bytes_to_wav, image_to_bytes, prune_old_wavs, read_audio_bytes, safe_call, ) ROOT = Path(__file__).parent ASSETS = ROOT / "assets" CSS = (ASSETS / "custom.css").read_text(encoding="utf-8") MODAL_APP = os.getenv("THIRD_EYE_MODAL_APP", "third-eye-backend") warnings.filterwarnings( "ignore", message=r"The 'theme' parameter in the Blocks constructor will be removed.*", category=DeprecationWarning, ) warnings.filterwarnings( "ignore", message=r"The 'css' parameter in the Blocks constructor will be removed.*", category=DeprecationWarning, ) LANGUAGES = { "English": "en", "Chinese": "zh", } STT_LANGUAGES = {"en", "zh"} SAMPLES = [ str(ASSETS / "sample_menu.jpg"), str(ASSETS / "sample_label.jpg"), str(ASSETS / "sample_sign.jpg"), ] SAMPLE_OPTIONS = { "Cafe menu": SAMPLES[0], "Medicine label": SAMPLES[1], "Street sign": SAMPLES[2], } QUICK_ASK_OPTIONS = [ "What is directly in front of me?", "Read the important text exactly.", "What should I be careful about here?", ] def iris_markup(state: str, label: str) -> str: return f"""

{label}

""" def _backend_pref() -> str: return os.getenv("THIRD_EYE_BACKEND", "auto").strip().lower() def zerogpu_available() -> bool: """True when an in-process ZeroGPU backend should serve inference. Forced with THIRD_EYE_BACKEND=zerogpu; otherwise auto-detected by the presence of the Hugging Face ``spaces`` runtime. """ pref = _backend_pref() if pref == "zerogpu": return True if pref in {"modal", "mock"}: return False try: import spaces # noqa: F401 return True except Exception: return False # HF ZeroGPU scans app.py for @spaces.GPU-decorated functions at startup. # Importing zerogpu_backend here ensures its @GPU functions are discoverable. if zerogpu_available(): import zerogpu_backend # noqa: F401 def modal_available() -> bool: pref = _backend_pref() if pref == "modal": return True if pref in {"zerogpu", "mock"}: return False return bool(os.getenv("MODAL_TOKEN_ID") and os.getenv("MODAL_TOKEN_SECRET")) def live_backend_available() -> bool: return zerogpu_available() or modal_available() def mock_mode_enabled() -> bool: setting = os.getenv("THIRD_EYE_MOCK", "auto").strip().lower() if setting in {"1", "true", "yes", "on"}: return True if setting in {"0", "false", "no", "off"}: return False return not live_backend_available() def modal_call(function_name: str, *args): import modal function = modal.Function.from_name(MODAL_APP, function_name) return function.remote(*args) def infer(function_name: str, *args): """Route an inference call to the active backend (ZeroGPU or Modal).""" if zerogpu_available(): import zerogpu_backend return getattr(zerogpu_backend, function_name)(*args) if not modal_available(): raise RuntimeError( "No live inference backend is configured. Enable mock mode or set up " "ZeroGPU/Modal credentials." ) return modal_call(function_name, *args) def backend_status_text() -> str: """Honest, non-demo description of where inference runs right now.""" if mock_mode_enabled(): return ( "Preview mode — the full interface runs without a GPU backend, " "so your image is never uploaded." ) if zerogpu_available(): return "Live on Hugging Face ZeroGPU. Models load on first use." if modal_available(): return "Live inference backend connected." return "No live inference backend is configured. Enable mock mode or connect ZeroGPU/Modal." def mock_answer(mode: str, language: str) -> str: answers = { "Describe": ( "Mock preview: I can see a clear, text-rich image ready for visual " "description. Deploy the Modal backend for a real model response." ), "Ask": ( "Mock preview: your spoken or typed question was received. Deploy the " "Modal backend to answer it from the image." ), "Read Text": ( "Mock preview: the image is ready for OCR. Deploy the Modal backend to " "read its exact text aloud." ), } return f"{answers[mode]} Output language: {language}." def mock_tone() -> str: sample_rate = 22_050 duration = 0.55 frames = bytearray() for index in range(int(sample_rate * duration)): envelope = min(1.0, index / 500, (sample_rate * duration - index) / 800) value = int( 8_000 * envelope * ( math.sin(2 * math.pi * 523.25 * index / sample_rate) + 0.45 * math.sin(2 * math.pi * 659.25 * index / sample_rate) ) ) frames.extend(struct.pack(" str: if mode == "Read Text": return ( "Read every word and number in this image exactly as written. " "Include all text, labels, prices, dates, directions, and signs. " "Do not interpret or explain — just read the text verbatim." ) if mode == "Describe": return ( "Describe this image for a blind person. Read any text you see word by word. " "Describe objects, people, colors, layout, and all visible details." ) if typed_question.strip(): return typed_question.strip() if not audio_path: return ( "What is in this image? Read any text, labels, or writing visible. " "Describe objects, brands, colors, and details." ) audio_bytes = read_audio_bytes(audio_path) stt_language = language if language in STT_LANGUAGES else "en" if stt_language != language: gr.Warning( "Speech recognition does not support this selected language. " "Listening as English; the answer can still use your selected language." ) return safe_call( infer, "transcribe_audio", audio_bytes, stt_language, fallback="", warn="I could not hear that. Type a question or record again.", ) or "" def run_pipeline( image, audio_path: str | None, typed_question: str, mode: str, language_name: str, progress=gr.Progress(), ): # Bound temp-WAV disk usage without deleting a clip still being served. prune_old_wavs() if image is None: gr.Warning("No image captured. Point the camera or choose an example.") yield ( None, "No image captured.", typed_question, iris_markup("idle", "Waiting for an image"), "Waiting for an image.", ) return language = LANGUAGES.get(language_name, "en") using_mock = mock_mode_enabled() if mode == "Ask" and audio_path and not typed_question.strip() and not using_mock: yield ( None, "", "", iris_markup("listening", "Listening"), "Listening to your question.", ) progress(0.15, desc="Transcribing your question") if using_mock and mode == "Ask": question = typed_question.strip() or "What is in front of me?" else: question = resolve_question(mode, audio_path, typed_question, language) if mode == "Ask" and not question: yield ( None, "I could not understand the question.", "", iris_markup("idle", "Ready to try again"), "Question not understood. Type it or record again.", ) return yield ( None, "", question, iris_markup("seeing", "Seeing"), "Analyzing the captured image.", ) progress(0.35, desc="Loading vision model" if not using_mock else "Previewing") if using_mock: answer = mock_answer(mode, language_name) else: answer = safe_call( infer, "describe_scene", image_to_bytes(image), question, language, mode == "Read Text", # tile: enlarge small text for verbatim OCR fallback="", warn="The vision model is unavailable. Please try once more.", ) or "" if not answer: yield ( None, "Could not analyze the image.", question, iris_markup("idle", "Ready to try again"), "Image analysis failed. Ready to try again.", ) return yield ( None, answer, question, iris_markup("thinking", "Preparing voice"), "The answer is ready. Preparing speech.", ) progress(0.75, desc="Preparing spoken answer") if using_mock: audio_path_out = mock_tone() else: audio_bytes = safe_call( infer, "speak", answer, language, fallback=None, warn="Voice is unavailable. The large-text answer is still shown.", ) audio_path_out = bytes_to_wav(audio_bytes) if audio_bytes else None progress(1.0, desc="Ready") final_state = "speaking" if audio_path_out else "idle" final_label = "Speaking" if audio_path_out else "Answer shown" yield ( audio_path_out, answer, question, iris_markup(final_state, final_label), f"{final_label}. The transcript is available below.", ) MODE_LABELS = { "Describe": "Describe what I see", "Ask": "Ask Third Eye", "Read Text": "Read this text", } # Light is the default; the toggle pins dark by adding a single class. THEME_TOGGLE_JS = """ () => { document.documentElement.classList.toggle('force-dark'); } """ BRAND_HTML = """

Third Eye

Point your camera. Ask out loud. Listen to the answer.

""" MISSION_PANEL_HTML = """

BLIND-FIRST NAVIGATION

One action at a time. Fast answers. Strong audio and text feedback.

Third Eye is designed to reduce hesitation in the real world: capture what is ahead, ask what matters, and hear the result without hunting through a crowded interface.

""" CAPTURE_ASSIST_HTML = """

Best results: hold the camera still, keep text centered, and move closer for labels or menus.

""" ANSWER_ASSIST_HTML = """

Status guide: Listening means voice input, Seeing means image analysis, Thinking means answer generation, Speaking means audio playback.

""" def on_mode_change(mode: str): """Reveal voice controls only in Ask mode and relabel the action button.""" is_ask = mode == "Ask" return ( gr.update(visible=is_ask), gr.update(visible=is_ask), gr.update(visible=is_ask), gr.update(value=MODE_LABELS.get(mode, MODE_LABELS["Describe"])), ) def load_sample(sample_name: str): sample_path = SAMPLE_OPTIONS.get(sample_name) or next(iter(SAMPLE_OPTIONS.values())) return sample_path, f"Loaded example: {sample_name}." def apply_quick_prompt(prompt: str): return prompt or "" def build_demo() -> gr.Blocks: theme = gr.themes.Base( primary_hue="indigo", secondary_hue="cyan", neutral_hue="slate", ) with gr.Blocks( theme=theme, css=CSS, title="Third Eye", fill_width=True, ) as demo: with gr.Row(elem_classes="app-header"): gr.HTML(BRAND_HTML, padding=False) language = gr.Dropdown( choices=list(LANGUAGES), value="English", label="Answer language", show_label=False, elem_classes="language-picker", scale=0, ) theme_btn = gr.Button( "◐ Theme", size="sm", elem_classes="theme-toggle", scale=0, ) gr.HTML(MISSION_PANEL_HTML, padding=False) mode = gr.Radio( choices=["Describe", "Ask", "Read Text"], value="Describe", show_label=False, container=False, elem_id="mode-cards", elem_classes="mode-cards", ) with gr.Row(equal_height=True, elem_classes="work-area"): with gr.Column(scale=1, elem_classes="glass-card capture-card"): gr.HTML( '

CAPTURE' 'Camera or upload

', padding=False, ) image = gr.Image( label="Camera or image", sources=["webcam", "upload"], type="pil", height=380, show_label=False, elem_classes="camera-frame", ) gr.HTML(CAPTURE_ASSIST_HTML, padding=False) sample_choice = gr.Radio( choices=list(SAMPLE_OPTIONS), value="Cafe menu", label="Bundled example scenes", info="Use these when a camera is unavailable or for a quick demo.", elem_classes="sample-picker", ) load_example = gr.Button( "Load selected example", variant="secondary", elem_classes="secondary-action", ) audio_input = gr.Audio( label="Speak your question", sources=["microphone", "upload"], type="filepath", format="wav", visible=False, elem_classes="mic-input", ) typed = gr.Textbox( label="Type instead", placeholder="Optional: type only if the microphone is unavailable.", visible=False, lines=2, ) quick_prompt = gr.Dropdown( choices=QUICK_ASK_OPTIONS, label="Quick question", info="Optional shortcut if speaking is difficult.", visible=False, elem_classes="quick-prompt", ) submit = gr.Button( MODE_LABELS["Describe"], variant="primary", size="lg", elem_classes="primary-action", ) with gr.Column(scale=1, elem_classes="glass-card answer-card"): gr.HTML( '

ANSWER

', padding=False, ) iris = gr.HTML( iris_markup("idle", "Ready"), elem_id="iris-shell", padding=False, ) status = gr.Textbox( value="Ready. Capture an image or choose an example.", label="Live status", interactive=False, elem_id="live-status", elem_classes="sr-status", ) question_output = gr.Textbox( label="Question or instruction", interactive=False, ) answer = gr.Textbox( label="Answer transcript", interactive=False, lines=7, elem_classes="answer-output", ) audio_output = gr.Audio( label="Spoken answer", autoplay=True, interactive=False, ) gr.HTML(ANSWER_ASSIST_HTML, padding=False) with gr.Accordion( "Diagnostics", open=False, elem_classes="system-accordion" ): gr.HTML(f"

{backend_status_text()}

", padding=False) warmup_btn = gr.Button( "Pre-load models", variant="secondary", size="sm", elem_classes="diagnostics-btn", ) warmup_output = gr.Textbox( label="Model status", interactive=False, lines=3, ) gr.HTML( """ """, padding=False, ) mode.change( fn=on_mode_change, inputs=mode, outputs=[audio_input, typed, quick_prompt, submit], ) load_example.click( fn=load_sample, inputs=[sample_choice], outputs=[image, status], show_progress="hidden", ) quick_prompt.change( fn=apply_quick_prompt, inputs=[quick_prompt], outputs=[typed], show_progress="hidden", ) submit.click( fn=run_pipeline, inputs=[image, audio_input, typed, mode, language], outputs=[audio_output, answer, question_output, iris, status], show_progress="full", ) warmup_btn.click( fn=warmup_all, inputs=[], outputs=[warmup_output], show_progress="full", ) theme_btn.click(fn=None, inputs=None, outputs=None, js=THEME_TOGGLE_JS) return demo demo = build_demo() if __name__ == "__main__": launch_host = os.getenv("THIRD_EYE_HOST", "0.0.0.0") launch_port = int(os.getenv("THIRD_EYE_PORT", os.getenv("PORT", "7860"))) demo.queue(default_concurrency_limit=2).launch( server_name=launch_host, server_port=launch_port, show_error=False, )