# Source: Hugging Face Space "build-small-hackathon/third-eye"
# (Space status when this page was captured: Sleeping).
# File size: 21,220 bytes.

from __future__ import annotations

import math
import os
import struct
import tempfile
import warnings
import wave
from pathlib import Path

import gradio as gr

from utils import (
    bytes_to_wav,
    image_to_bytes,
    prune_old_wavs,
    read_audio_bytes,
    safe_call,
)

# Directory that contains this module; bundled assets are resolved from it.
ROOT = Path(__file__).parent
ASSETS = ROOT / "assets"
# The stylesheet is read once at import time, so a missing
# assets/custom.css raises here, before any UI is built.
CSS = (ASSETS / "custom.css").read_text(encoding="utf-8")
# Name of the Modal app that modal_call() looks functions up in; override it
# with the THIRD_EYE_MODAL_APP environment variable.
MODAL_APP = os.getenv("THIRD_EYE_MODAL_APP", "third-eye-backend")

# build_demo() passes both ``theme`` and ``css`` to gr.Blocks. Silence the
# DeprecationWarnings whose messages match those two constructor parameters.
warnings.filterwarnings(
    "ignore",
    message=r"The 'theme' parameter in the Blocks constructor will be removed.*",
    category=DeprecationWarning,
)
warnings.filterwarnings(
    "ignore",
    message=r"The 'css' parameter in the Blocks constructor will be removed.*",
    category=DeprecationWarning,
)

# Answer languages offered in the UI: display name -> language code.
LANGUAGES = dict(English="en", Chinese="zh")
# Language codes accepted for speech recognition; resolve_question() listens
# as English for any other code.
STT_LANGUAGES = {"zh", "en"}

# Bundled example images, stored as string paths under ASSETS.
SAMPLES = [
    str(ASSETS / filename)
    for filename in ("sample_menu.jpg", "sample_label.jpg", "sample_sign.jpg")
]
# Example picker label -> image path, in the same order as SAMPLES.
SAMPLE_OPTIONS = dict(
    zip(("Cafe menu", "Medicine label", "Street sign"), SAMPLES)
)
# Canned questions offered by the "Quick question" dropdown.
QUICK_ASK_OPTIONS = [
    "What is directly in front of me?",
    "Read the important text exactly.",
    "What should I be careful about here?",
]


def iris_markup(state: str, label: str) -> str:
    """Render the HTML fragment for the animated status iris.

    ``state`` is added to the iris element's CSS class list and ``label`` is
    the visible caption. Both values are inserted verbatim, without HTML
    escaping.
    """
    template = """

    <section class="iris-stage" aria-label="Third Eye status">

      <div class="iris {state}" aria-hidden="true">

        <span class="iris-core"></span>

        <span class="scan-line"></span>

      </div>

      <p class="state-label">{label}</p>

    </section>

    """
    return template.format(state=state, label=label)


def _backend_pref() -> str:
    """Return THIRD_EYE_BACKEND trimmed and lower-cased ("auto" when unset)."""
    raw_setting = os.environ.get("THIRD_EYE_BACKEND", "auto")
    trimmed = raw_setting.strip()
    return trimmed.lower()


def zerogpu_available() -> bool:
    """Report whether the in-process ZeroGPU backend should serve inference.

    THIRD_EYE_BACKEND=zerogpu forces True; THIRD_EYE_BACKEND=modal or mock
    forces False. Any other setting auto-detects: the result is True exactly
    when the Hugging Face ``spaces`` package can be imported.
    """
    choice = _backend_pref()
    if choice in ("modal", "mock"):
        return False
    if choice != "zerogpu":
        # Auto-detect: any failure while importing ``spaces`` means "no".
        try:
            import spaces  # noqa: F401
        except Exception:
            return False
    return True


# HF ZeroGPU scans app.py for @spaces.GPU-decorated functions at startup.
# Importing zerogpu_backend here ensures its @GPU functions are discoverable.
# The import runs at module import time and only when zerogpu_available() is
# True, so other deployments do not import zerogpu_backend at this point.
if zerogpu_available():
    import zerogpu_backend  # noqa: F401


def modal_available() -> bool:
    """Report whether the Modal backend should serve inference.

    THIRD_EYE_BACKEND=modal forces True; zerogpu or mock forces False. Any
    other setting requires both MODAL_TOKEN_ID and MODAL_TOKEN_SECRET to be
    set to non-empty values.
    """
    forced = {"modal": True, "zerogpu": False, "mock": False}.get(_backend_pref())
    if forced is not None:
        return forced
    token_id = os.getenv("MODAL_TOKEN_ID")
    token_secret = os.getenv("MODAL_TOKEN_SECRET")
    return bool(token_id) and bool(token_secret)


def live_backend_available() -> bool:
    """Return True when either ZeroGPU or Modal can serve inference."""
    if zerogpu_available():
        return True
    return modal_available()


def mock_mode_enabled() -> bool:
    """Decide whether the app should answer with canned preview output.

    THIRD_EYE_MOCK set to 1/true/yes/on forces mock mode and 0/false/no/off
    disables it (case-insensitive, surrounding whitespace ignored). Any other
    value, including unset, enables mock mode only when no live backend is
    available.
    """
    enabled_words = ("1", "true", "yes", "on")
    disabled_words = ("0", "false", "no", "off")
    flag = os.getenv("THIRD_EYE_MOCK", "auto").strip().lower()
    if flag in enabled_words or flag in disabled_words:
        return flag in enabled_words
    return not live_backend_available()


def modal_call(function_name: str, *args):
    """Call ``function_name`` from the MODAL_APP Modal app remotely.

    ``modal`` is imported lazily so the package is only needed when this
    backend is actually used. Positional ``args`` are forwarded unchanged and
    the remote call's result is returned.
    """
    import modal

    return modal.Function.from_name(MODAL_APP, function_name).remote(*args)


def infer(function_name: str, *args):
    """Dispatch an inference call to whichever live backend is active.

    ZeroGPU is preferred: the function named ``function_name`` is looked up on
    the ``zerogpu_backend`` module and called in-process. Otherwise the call
    is sent to Modal.

    Raises:
        RuntimeError: when neither ZeroGPU nor Modal is available.
    """
    if zerogpu_available():
        import zerogpu_backend as backend

        handler = getattr(backend, function_name)
        return handler(*args)
    if modal_available():
        return modal_call(function_name, *args)
    raise RuntimeError(
        "No live inference backend is configured. Enable mock mode or set up "
        "ZeroGPU/Modal credentials."
    )


def backend_status_text() -> str:
    """Describe, in one user-facing sentence, where inference currently runs.

    Mock mode is reported first, then ZeroGPU, then Modal; if none applies the
    text says that no backend is configured.
    """
    if mock_mode_enabled():
        message = (
            "Preview mode — the full interface runs without a GPU backend, "
            "so your image is never uploaded."
        )
    elif zerogpu_available():
        message = "Live on Hugging Face ZeroGPU. Models load on first use."
    elif modal_available():
        message = "Live inference backend connected."
    else:
        message = "No live inference backend is configured. Enable mock mode or connect ZeroGPU/Modal."
    return message


def mock_answer(mode: str, language: str) -> str:
    """Return the canned preview answer for ``mode`` with the language appended.

    Raises:
        KeyError: when ``mode`` is not "Describe", "Ask" or "Read Text".
    """
    describe_text = (
        "Mock preview: I can see a clear, text-rich image ready for visual "
        "description. Deploy the Modal backend for a real model response."
    )
    ask_text = (
        "Mock preview: your spoken or typed question was received. Deploy the "
        "Modal backend to answer it from the image."
    )
    read_text = (
        "Mock preview: the image is ready for OCR. Deploy the Modal backend to "
        "read its exact text aloud."
    )
    canned = {"Describe": describe_text, "Ask": ask_text, "Read Text": read_text}
    preview = canned[mode]
    return f"{preview} Output language: {language}."


def mock_tone() -> str:
    """Write a short two-tone chime to a temporary WAV file and return its path.

    The clip is 0.55 s of 16-bit mono PCM at 22,050 Hz that mixes 523.25 Hz
    and 659.25 Hz sine waves under a linear fade-in and fade-out envelope.

    The file is created with ``delete=False``, so it outlives this call and
    must be removed by the caller or a later cleanup pass.
    """
    sample_rate = 22_050
    duration = 0.55
    total = sample_rate * duration
    frames = bytearray()
    for index in range(int(total)):
        # Ramp up over the first 500 samples and down over the last 800.
        envelope = min(1.0, index / 500, (total - index) / 800)
        value = int(
            8_000
            * envelope
            * (
                math.sin(2 * math.pi * 523.25 * index / sample_rate)
                + 0.45 * math.sin(2 * math.pi * 659.25 * index / sample_rate)
            )
        )
        frames.extend(struct.pack("<h", value))

    # Write through the already-open handle instead of reopening the file by
    # name: reopening a still-open NamedTemporaryFile fails on Windows, and
    # the ``with`` guarantees the handle is closed rather than left to the GC.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as output:
        with wave.open(output, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(frames)
    return output.name


def warmup_all(progress=gr.Progress()):
    """Pre-load every model on the active backend so later calls are fast.

    One small placeholder request per model (vision, TTS, STT) is sent
    through ``infer``, which routes to ZeroGPU or Modal.

    Args:
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            kept as the original signature declared it.

    Returns:
        One status line per model joined by newlines, or a single explanatory
        line when no live backend is configured.
    """
    import time

    if not live_backend_available():
        # Without this guard every step fails immediately inside infer() and
        # would be reported as "loaded", which is untrue.
        return "No live inference backend is configured, so there is nothing to pre-load."

    steps = [
        ("Vision model", "describe_scene", b"\xff\xd8test", "Say hello.", "en", False),
        ("TTS model", "speak", "Warmup complete.", "en"),
        ("STT model", "transcribe_audio", b"\xff\xd8test", "en"),
    ]

    results = []
    for i, (label, func_name, *args) in enumerate(steps):
        progress(i / len(steps), desc=f"Warming up {label}...")
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the system clock is adjusted mid-call.
        t0 = time.perf_counter()
        try:
            infer(func_name, *args)
            outcome = "ready"
        except Exception:
            # The placeholder payloads are not real media, so a rejected
            # request is tolerated here. NOTE(review): this presumes the
            # backend loads the model before rejecting the input — confirm.
            outcome = "loaded"
        elapsed = time.perf_counter() - t0
        results.append(f"{label}: {outcome} ({elapsed:.0f}s)")

    progress(1.0, desc="All models warm!")
    return "\n".join(results)


def resolve_question(

    mode: str,

    audio_path: str | None,

    typed_question: str,

    language: str,

) -> str:
    if mode == "Read Text":
        return (
            "Read every word and number in this image exactly as written. "
            "Include all text, labels, prices, dates, directions, and signs. "
            "Do not interpret or explain — just read the text verbatim."
        )
    if mode == "Describe":
        return (
            "Describe this image for a blind person. Read any text you see word by word. "
            "Describe objects, people, colors, layout, and all visible details."
        )
    if typed_question.strip():
        return typed_question.strip()
    if not audio_path:
        return (
            "What is in this image? Read any text, labels, or writing visible. "
            "Describe objects, brands, colors, and details."
        )

    audio_bytes = read_audio_bytes(audio_path)
    stt_language = language if language in STT_LANGUAGES else "en"
    if stt_language != language:
        gr.Warning(
            "Speech recognition does not support this selected language. "
            "Listening as English; the answer can still use your selected language."
        )
    return safe_call(
        infer,
        "transcribe_audio",
        audio_bytes,
        stt_language,
        fallback="",
        warn="I could not hear that. Type a question or record again.",
    ) or ""


def run_pipeline(

    image,

    audio_path: str | None,

    typed_question: str,

    mode: str,

    language_name: str,

    progress=gr.Progress(),

):
    """Run one capture -> question -> answer -> speech request as a generator.

    Every ``yield`` is a 5-tuple in the order of the submit handler's outputs:
    (spoken-answer audio path or None, answer text, question text, iris HTML,
    status text). Intermediate yields move the iris through its listening,
    seeing and thinking states; the final yield carries the audio path when
    speech was produced.

    Args:
        image: Captured or uploaded image; ``None`` ends the run immediately.
        audio_path: Path to a recorded question, if any.
        typed_question: Text typed by the user (may be blank).
        mode: "Describe", "Ask" or "Read Text".
        language_name: Display name looked up in LANGUAGES.
        progress: Gradio progress tracker.
    """
    # Bound temp-WAV disk usage without deleting a clip still being served.
    prune_old_wavs()

    # Nothing to analyze: report it and stop, keeping the typed question.
    if image is None:
        gr.Warning("No image captured. Point the camera or choose an example.")
        yield (
            None,
            "No image captured.",
            typed_question,
            iris_markup("idle", "Waiting for an image"),
            "Waiting for an image.",
        )
        return

    # Unknown display names fall back to the English language code.
    language = LANGUAGES.get(language_name, "en")
    using_mock = mock_mode_enabled()

    # Show the "listening" state only when a recording is about to be
    # transcribed by a live backend.
    if mode == "Ask" and audio_path and not typed_question.strip() and not using_mock:
        yield (
            None,
            "",
            "",
            iris_markup("listening", "Listening"),
            "Listening to your question.",
        )
        progress(0.15, desc="Transcribing your question")

    # Mock mode never transcribes: use the typed text or a stock question.
    if using_mock and mode == "Ask":
        question = typed_question.strip() or "What is in front of me?"
    else:
        question = resolve_question(mode, audio_path, typed_question, language)

    # resolve_question() returns "" when no transcript came back.
    if mode == "Ask" and not question:
        yield (
            None,
            "I could not understand the question.",
            "",
            iris_markup("idle", "Ready to try again"),
            "Question not understood. Type it or record again.",
        )
        return

    yield (
        None,
        "",
        question,
        iris_markup("seeing", "Seeing"),
        "Analyzing the captured image.",
    )
    progress(0.35, desc="Loading vision model" if not using_mock else "Previewing")

    if using_mock:
        answer = mock_answer(mode, language_name)
    else:
        answer = safe_call(
            infer,
            "describe_scene",
            image_to_bytes(image),
            question,
            language,
            mode == "Read Text",  # tile: enlarge small text for verbatim OCR
            fallback="",
            warn="The vision model is unavailable. Please try once more.",
        ) or ""

    # An empty answer is treated as a failed analysis.
    if not answer:
        yield (
            None,
            "Could not analyze the image.",
            question,
            iris_markup("idle", "Ready to try again"),
            "Image analysis failed. Ready to try again.",
        )
        return

    # Show the text answer right away, before speech synthesis starts.
    yield (
        None,
        answer,
        question,
        iris_markup("thinking", "Preparing voice"),
        "The answer is ready. Preparing speech.",
    )
    progress(0.75, desc="Preparing spoken answer")

    if using_mock:
        audio_path_out = mock_tone()
    else:
        audio_bytes = safe_call(
            infer,
            "speak",
            answer,
            language,
            fallback=None,
            warn="Voice is unavailable. The large-text answer is still shown.",
        )
        # NOTE(review): bytes_to_wav presumably writes the bytes to a WAV file
        # and returns its path — confirm in utils.
        audio_path_out = bytes_to_wav(audio_bytes) if audio_bytes else None

    progress(1.0, desc="Ready")
    # Without audio the answer is still shown, just not spoken.
    final_state = "speaking" if audio_path_out else "idle"
    final_label = "Speaking" if audio_path_out else "Answer shown"
    yield (
        audio_path_out,
        answer,
        question,
        iris_markup(final_state, final_label),
        f"{final_label}. The transcript is available below.",
    )


# Mode name -> caption shown on the primary action button.
MODE_LABELS = {
    "Describe": "Describe what I see",
    "Ask": "Ask Third Eye",
    "Read Text": "Read this text",
}

# Light is the default; the toggle pins dark by adding a single class.
# Runs in the browser only (wired to the theme button with fn=None).
THEME_TOGGLE_JS = """

() => { document.documentElement.classList.toggle('force-dark'); }

"""

# Header branding: decorative iris mark plus the app name and tagline.
BRAND_HTML = """
<div class="brand">
  <span class="brand-iris" aria-hidden="true"><span class="brand-iris-core"></span></span>
  <div class="brand-text">
    <h1>Third Eye</h1>
    <p>Point your camera. Ask out loud. Listen to the answer.</p>
  </div>
</div>
"""

# Introductory panel; the numbered steps are aria-hidden, so only the
# mission copy is exposed to assistive technology.
MISSION_PANEL_HTML = """
<section class="mission-panel" aria-label="Third Eye guidance">
  <div class="mission-copy">
    <p class="mission-kicker">BLIND-FIRST NAVIGATION</p>
    <h2>One action at a time. Fast answers. Strong audio and text feedback.</h2>
    <p class="mission-body">
      Third Eye is designed to reduce hesitation in the real world: capture what is
      ahead, ask what matters, and hear the result without hunting through a crowded interface.
    </p>
  </div>
  <div class="mission-steps" aria-hidden="true">
    <div><span>01</span><strong>Capture</strong><p>Camera, upload, or example scene.</p></div>
    <div><span>02</span><strong>Ask</strong><p>Speak naturally or use a quick prompt.</p></div>
    <div><span>03</span><strong>Listen</strong><p>Audio answer plus large transcript.</p></div>
  </div>
</section>
"""

# Tip strip rendered under the camera/image input.
CAPTURE_ASSIST_HTML = """
<section class="assist-strip" aria-label="Capture tips">
  <p><strong>Best results:</strong> hold the camera still, keep text centered, and move closer for labels or menus.</p>
</section>
"""

# Legend rendered under the answer card.
# NOTE(review): run_pipeline labels its "thinking" state "Preparing voice",
# while this legend says "Thinking means answer generation" — confirm wording.
ANSWER_ASSIST_HTML = """
<section class="assist-strip answer-strip" aria-label="Answer tips">
  <p><strong>Status guide:</strong> Listening means voice input, Seeing means image analysis, Thinking means answer generation, Speaking means audio playback.</p>
</section>
"""


def on_mode_change(mode: str):
    """Show the question inputs only for "Ask" and relabel the action button.

    Returns four Gradio updates: three visibility updates (one per question
    input) followed by the new button caption, which falls back to the
    "Describe" caption for an unknown mode.
    """
    show_voice = mode == "Ask"
    button_label = MODE_LABELS.get(mode, MODE_LABELS["Describe"])
    # Build a separate update object for each of the three toggled inputs.
    visibility_updates = [gr.update(visible=show_voice) for _ in range(3)]
    return (*visibility_updates, gr.update(value=button_label))


def load_sample(sample_name: str):
    """Return the image path and status message for a bundled example scene.

    An unknown or missing ``sample_name`` falls back to the first entry of
    SAMPLE_OPTIONS. The status message always names the example that was
    actually loaded, not the name that was requested.
    """
    if sample_name in SAMPLE_OPTIONS:
        loaded_name = sample_name
    else:
        loaded_name = next(iter(SAMPLE_OPTIONS))
    return SAMPLE_OPTIONS[loaded_name], f"Loaded example: {loaded_name}."


def apply_quick_prompt(prompt: str):
    """Return the chosen quick prompt, or an empty string when nothing is chosen."""
    if not prompt:
        return ""
    return prompt


def build_demo() -> gr.Blocks:
    """Build the complete Third Eye interface and wire its event handlers.

    Layout, top to bottom: header (brand, answer language, theme toggle),
    mission panel, mode selector, a two-column work area (capture inputs on
    the left, answer outputs on the right), a collapsed diagnostics accordion
    and a footer. Components are created in display order inside the Blocks
    context.

    Returns:
        The assembled ``gr.Blocks`` app; it is not queued or launched here.
    """
    theme = gr.themes.Base(
        primary_hue="indigo",
        secondary_hue="cyan",
        neutral_hue="slate",
    )
    with gr.Blocks(
        theme=theme,
        css=CSS,
        title="Third Eye",
        fill_width=True,
    ) as demo:
        # Header: branding, answer-language picker and theme toggle.
        with gr.Row(elem_classes="app-header"):
            gr.HTML(BRAND_HTML, padding=False)
            language = gr.Dropdown(
                choices=list(LANGUAGES),
                value="English",
                label="Answer language",
                show_label=False,
                elem_classes="language-picker",
                scale=0,
            )
            theme_btn = gr.Button(
                "◐  Theme",
                size="sm",
                elem_classes="theme-toggle",
                scale=0,
            )

        gr.HTML(MISSION_PANEL_HTML, padding=False)

        # Mode selector; "Describe" is the initial mode.
        mode = gr.Radio(
            choices=["Describe", "Ask", "Read Text"],
            value="Describe",
            show_label=False,
            container=False,
            elem_id="mode-cards",
            elem_classes="mode-cards",
        )

        with gr.Row(equal_height=True, elem_classes="work-area"):
            # Left column: image capture, examples and question inputs.
            with gr.Column(scale=1, elem_classes="glass-card capture-card"):
                gr.HTML(
                    '<div class="card-head"><span class="card-title">CAPTURE</span>'
                    '<span class="card-hint">Camera or upload</span></div>',
                    padding=False,
                )
                image = gr.Image(
                    label="Camera or image",
                    sources=["webcam", "upload"],
                    type="pil",
                    height=380,
                    show_label=False,
                    elem_classes="camera-frame",
                )
                gr.HTML(CAPTURE_ASSIST_HTML, padding=False)
                sample_choice = gr.Radio(
                    choices=list(SAMPLE_OPTIONS),
                    value="Cafe menu",
                    label="Bundled example scenes",
                    info="Use these when a camera is unavailable or for a quick demo.",
                    elem_classes="sample-picker",
                )
                load_example = gr.Button(
                    "Load selected example",
                    variant="secondary",
                    elem_classes="secondary-action",
                )
                # The three question inputs start hidden; on_mode_change()
                # reveals them only in "Ask" mode.
                audio_input = gr.Audio(
                    label="Speak your question",
                    sources=["microphone", "upload"],
                    type="filepath",
                    format="wav",
                    visible=False,
                    elem_classes="mic-input",
                )
                typed = gr.Textbox(
                    label="Type instead",
                    placeholder="Optional: type only if the microphone is unavailable.",
                    visible=False,
                    lines=2,
                )
                quick_prompt = gr.Dropdown(
                    choices=QUICK_ASK_OPTIONS,
                    label="Quick question",
                    info="Optional shortcut if speaking is difficult.",
                    visible=False,
                    elem_classes="quick-prompt",
                )
                submit = gr.Button(
                    MODE_LABELS["Describe"],
                    variant="primary",
                    size="lg",
                    elem_classes="primary-action",
                )

            # Right column: status iris, live status, transcript and audio.
            with gr.Column(scale=1, elem_classes="glass-card answer-card"):
                gr.HTML(
                    '<div class="card-head"><span class="card-title">ANSWER</span></div>',
                    padding=False,
                )
                iris = gr.HTML(
                    iris_markup("idle", "Ready"),
                    elem_id="iris-shell",
                    padding=False,
                )
                status = gr.Textbox(
                    value="Ready. Capture an image or choose an example.",
                    label="Live status",
                    interactive=False,
                    elem_id="live-status",
                    elem_classes="sr-status",
                )
                question_output = gr.Textbox(
                    label="Question or instruction",
                    interactive=False,
                )
                answer = gr.Textbox(
                    label="Answer transcript",
                    interactive=False,
                    lines=7,
                    elem_classes="answer-output",
                )
                audio_output = gr.Audio(
                    label="Spoken answer",
                    autoplay=True,
                    interactive=False,
                )
                gr.HTML(ANSWER_ASSIST_HTML, padding=False)

        with gr.Accordion(
            "Diagnostics", open=False, elem_classes="system-accordion"
        ):
            # The backend status text is computed once, when the UI is built.
            gr.HTML(f"<p class='mode-note'>{backend_status_text()}</p>", padding=False)
            warmup_btn = gr.Button(
                "Pre-load models",
                variant="secondary",
                size="sm",
                elem_classes="diagnostics-btn",
            )
            warmup_output = gr.Textbox(
                label="Model status",
                interactive=False,
                lines=3,
            )

        gr.HTML(
            """
            <footer class="footer-note">
              Vision &amp; OCR by Qwen2.5-VL · Speech by Cohere Transcribe · Voice by VoxCPM2.
              <span class="priv">Your image is processed only for your request and never stored.</span>
            </footer>
            """,
            padding=False,
        )

        # Event wiring. Output order must match what each handler returns.
        mode.change(
            fn=on_mode_change,
            inputs=mode,
            outputs=[audio_input, typed, quick_prompt, submit],
        )
        load_example.click(
            fn=load_sample,
            inputs=[sample_choice],
            outputs=[image, status],
            show_progress="hidden",
        )
        # Choosing a quick prompt copies it into the typed-question box.
        quick_prompt.change(
            fn=apply_quick_prompt,
            inputs=[quick_prompt],
            outputs=[typed],
            show_progress="hidden",
        )
        # run_pipeline is a generator, so each yield updates these outputs.
        submit.click(
            fn=run_pipeline,
            inputs=[image, audio_input, typed, mode, language],
            outputs=[audio_output, answer, question_output, iris, status],
            show_progress="full",
        )
        warmup_btn.click(
            fn=warmup_all,
            inputs=[],
            outputs=[warmup_output],
            show_progress="full",
        )
        # Theme toggling is client-side only: no Python function is called.
        theme_btn.click(fn=None, inputs=None, outputs=None, js=THEME_TOGGLE_JS)
    return demo


# Built at import time so ``demo`` exists as a module attribute even when this
# file is imported rather than run directly.
# NOTE(review): presumably a hosting runtime imports ``demo`` — confirm.
demo = build_demo()

if __name__ == "__main__":
    # Bind address and port come from the environment. THIRD_EYE_PORT takes
    # precedence over PORT, and 7860 is the final fallback. A non-numeric
    # port value makes int() raise ValueError.
    launch_host = os.getenv("THIRD_EYE_HOST", "0.0.0.0")
    launch_port = int(os.getenv("THIRD_EYE_PORT", os.getenv("PORT", "7860")))
    demo.queue(default_concurrency_limit=2).launch(
        server_name=launch_host,
        server_port=launch_port,
        show_error=False,
    )