Spaces:

build-small-hackathon
/

case0

Running

File size: 22,957 Bytes

414dc55

"""The Gradio Blocks UI: a scene-based, animated pixel detective game.

Single-player: one GameController lives in gr.State per session. The interrogation
stage (animated suspect sprite in a room), the notebook, and the scenery are
deterministic CSS-animated graphics. Suspect dialogue streams into a visual-novel box;
hidden state never reaches the browser. SFX/music play client-side from data URIs.
"""

from __future__ import annotations

import json
import random
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import gradio as gr

from ..config import get_settings
from ..schemas.enums import MotiveCategory
from ..suspects.scrub import scrub_spoken
from .app_state import (
    _TTS_DIR,
    GameController,
    _load_audio,
    _obtain_shared_backend,
    _obtain_shared_tts,
    note_interaction,
    start_case_buffer,
)
from .formatters import (
    briefing_html,
    dialogue_html,
    evidence_html,
    how_to_play_html,
    notebook_html,
    stage_html,
    verdict_html,
)
from .theme import build_css

# Wrong-answer weapon options listed next to the case's real weapon in the accusation
# dropdown (a decoy is dropped when it has the same name as the real weapon).
_WEAPON_DECOYS = ("Poison", "Strangulation", "A fall")


_FLAVOR_LINES = (
    "Opening the precinct", "Dusting for fingerprints", "Rounding up the usual suspects",
    "Brewing the detective's coffee", "Reviewing the case files", "Polishing the interrogation lamp",
    "Chasing a lead down the alley", "Cataloguing the evidence", "Tuning the suspects' alibis",
)


def _audio_setup_js(sfx: dict[str, str], music: str) -> str:
    """Load-event JS: create the audio elements, define the audio/animation helpers,
    cycle the loading-screen flavor text, and try to start the music. Gradio strips
    <script> from gr.HTML, so this load-event JS is how audio/animation actually work."""
    makers = [f"mk('cz-bgm',{music!r},true);"]
    for event, uri in sfx.items():
        makers.append(f"mk('cz-sfx-{event}',{uri!r},false);")
    flavor = list(_FLAVOR_LINES)
    return (
        "() => {"
        "if(window.czAudioReady)return; window.czAudioReady=true;"
        "const mk=(id,src,loop)=>{const a=document.createElement('audio');a.id=id;a.src=src;"
        "a.preload='auto';if(loop)a.loop=true;document.body.appendChild(a);};"
        + "".join(makers)
        + "window.czSfx=function(n){var a=document.getElementById('cz-sfx-'+n);"
        "if(a){try{a.currentTime=0;a.volume=(n==='page')?0.22:0.45;a.play();}catch(e){}}};"
        "window.czMusicOn=function(){var m=document.getElementById('cz-bgm');"
        "if(m){m.volume=0.3;m.play().catch(function(){});}};"
        "window.czMusicOff=function(){var m=document.getElementById('cz-bgm');if(m)m.pause();};"
        "window.czTalk=function(){var s=document.getElementById('cz-sprite');"
        "if(s){s.classList.add('talking');setTimeout(function(){s.classList.remove('talking');},3000);}};"
        f"var fl={flavor!r}; var i=0;"
        "var tick=function(){var e=document.getElementById('cz-flavor');"
        "if(e){e.textContent=fl[i%fl.length]+'...'; i++;}};tick();setInterval(tick,2800);"
        "try{czMusicOn();var b=document.getElementById('czmusicbtn');"
        "if(b)b.classList.add('cz-on');}catch(e){}"
        "}"
    )


def _next_sentence(text: str, start: int) -> tuple[str, int] | None:
    """Return (sentence, new_cursor) for the next COMPLETE sentence after ``start`` - a
    terminator (.!?) followed by whitespace - or None while it is still being streamed.
    Lets us synth a suspect's reply sentence-by-sentence as the tokens arrive."""
    n = len(text)
    i = start
    while i < n and text[i].isspace():
        i += 1
    j = i
    while j < n:
        if text[j] in ".!?":
            k = j + 1
            while k < n and text[k] in ".!?":
                k += 1
            if k < n and text[k].isspace():
                return text[i:k].strip(), k
            j = k
        else:
            j += 1
    return None


def _stage(controller: GameController, sus_id: str) -> str:
    """Render the stage HTML showing suspect ``sus_id`` in the interrogation room."""
    # Every suspect is questioned in the same neutral interrogation room. The stage never
    # hints at where they "were", so searching a room cannot contradict their alibi.
    room_uri = controller.interrogation_uri()
    sheet_uri = controller.portrait_sheet_uri(sus_id)
    suspect_name = controller.case.suspect(sus_id).name
    return stage_html(room_uri, sheet_uri, suspect_name, "Interrogation Room")


def _evidence_items(controller: GameController) -> list[tuple[str, str, str]]:
    return [(c.name, c.reveal_text, controller.prop_uri(c)) for c in controller.session.evidence()]


def _full_state(controller: GameController):
    """Build the full set of UI values for a freshly started case.

    Returns a 14-tuple whose order matches the Gradio output lists in ``build_app``
    from the ``state`` component onwards (``new_outputs[1:]`` / ``warm_outputs[2:]``):
    controller, briefing HTML, roster update, stage HTML, opening dialogue HTML,
    location dropdown, evidence dropdown, evidence panel HTML, notebook HTML, accused
    dropdown, weapon dropdown, motive dropdown, cited-evidence checkbox group, and the
    (hidden) verdict panel.

    The first suspect in the case is shown on stage with a stock opening line.
    """
    case = controller.case
    first = case.suspects[0]
    motives = [m.value for m in MotiveCategory]
    # The real weapon must not sit at a fixed position in the dropdown: listing it first
    # every time would let the player answer by position alone, so shuffle the options.
    weapons = [case.weapon.name, *(d for d in _WEAPON_DECOYS if d != case.weapon.name)]
    random.shuffle(weapons)
    return (
        controller,
        briefing_html(controller.session.player_view),
        gr.update(value=controller.roster()),
        _stage(controller, first.sus_id),
        dialogue_html(first.name, "I suppose you have questions. Get on with it."),
        gr.update(choices=controller.location_choices(), value=None),
        gr.update(choices=[], value=None),  # no evidence to present until a room is searched
        evidence_html([]),
        notebook_html(controller.session.state, case),
        gr.update(choices=[s.name for s in case.suspects], value=None),
        gr.update(choices=weapons, value=None),
        gr.update(choices=motives, value=None),
        gr.update(choices=[], value=[]),
        gr.update(value="", visible=False),  # verdict: hidden until an accusation is made
    )


def _new_case(controller: GameController | None):
    """Start a new case, yielding UI updates for the "New Case" button.

    Each yielded tuple is the overlay update followed by the values for the remaining
    outputs (controller first). A controller is created when none is passed in.
    """
    controller = controller or GameController()
    # Slow path only: nothing is waiting in the background buffer yet, so raise the
    # overlay and generate a case live. (When a buffered case IS ready it is revealed
    # straight away with no overlay; the worker immediately starts on the next one.)
    if not controller.start_buffered(wait_secs=0.0):
        overlay_on = gr.update(value=_overlay_html("generate"), visible=True)
        untouched = [gr.update()] * 13  # leave every other output as it is for now
        yield (overlay_on, controller, *untouched)
        controller.start("generate", seed=random.randint(1, 999_999))
    yield (gr.update(visible=False), *_full_state(controller))


def _select_suspect(controller: GameController | None, evt: gr.SelectData):
    """Switch the stage to the suspect picked in the roster gallery.

    Returns ``(stage_html, dialogue_html)``. The dialogue box shows that suspect's most
    recent answer, or a stock prompt when they have not been questioned yet. Without an
    active session both outputs are left unchanged.
    """
    if controller is None or controller.session is None:
        return gr.update(), gr.update()

    controller.select_by_index(evt.index)
    sus_id = controller.current_sus
    history = controller.session.state.state_for(sus_id).transcript
    if history:
        line = history[-1].answer
    else:
        line = "Well? Ask your questions."
    return _stage(controller, sus_id), dialogue_html(controller.current_name(), line)


def _ask(controller: GameController | None, question: str, evidence_name: str | None):
    """Stream the current suspect's reply to ``question``, with optional voiced audio.

    Generator used as a Gradio event handler. Every yielded 5-tuple lines up with
    ``ask_outputs`` in ``build_app``: (dialogue HTML, evidence dropdown update, notebook
    HTML, TTS value, evidence panel update).

    Args:
        controller: The per-session game controller, or None before a case exists.
        question: The player's question; a blank (or missing) question gets a stock
            prompt instead of an interrogation turn.
        evidence_name: Name of the evidence being presented with the question, if any.
    """
    if controller is None or controller.session is None:
        yield dialogue_html("", ""), gr.update(), gr.update(), gr.update(), gr.update()
        return
    name = controller.current_name()
    # ``or ""`` keeps the blank check safe if the textbox ever delivers None.
    if not (question or "").strip():
        yield (dialogue_html(name, "Ask me something, Detective."),
               gr.update(), gr.update(), gr.update(), gr.update())
        return

    sus_id = controller.current_sus
    clue_id = controller.clue_id_for_name(evidence_name)
    breaking = controller.relevance_breaking(clue_id)
    note_interaction()  # tell the background case generator to yield CPU to this reply
    # Voice is synthesized sentence-by-sentence in a background worker WHILE the LLM keeps
    # streaming (both release the GIL), so the first words are spoken almost immediately
    # instead of after the whole reply. Chunks play in order via a JS queue.
    voice_on = bool(getattr(controller.tts, "available", False)) and bool(sus_id)
    pool = ThreadPoolExecutor(max_workers=1) if voice_on else None
    futures: list = []
    fptr = 0     # index of the next synthesized chunk to hand to the browser
    raw = ""     # full raw stream (kept only for the cursor; never shown directly)
    shown = ""   # scrubbed, sentence-complete text the player actually sees + hears
    cursor = 0
    final = None
    try:
        for event in controller.session.interrogate(sus_id, question, clue_id):
            if event.spoken_delta:
                raw += event.spoken_delta
                note_interaction()  # keep the generator backed off for the whole reply
                # Reveal (and voice) only COMPLETE, scrubbed sentences. A confession can
                # never flash on screen or be spoken, and the text appears in step with
                # the voice rather than racing ahead of it.
                while (nxt := _next_sentence(raw, cursor)) is not None:
                    sentence, cursor = nxt
                    clean = scrub_spoken(sentence, breaking=breaking)
                    if not clean:
                        continue
                    shown = f"{shown} {clean}".strip()
                    if pool is not None:
                        futures.append(pool.submit(controller.speak, clean))
                uri = gr.update()  # emit at most one ready audio chunk per streamed tick
                if fptr < len(futures) and futures[fptr].done():
                    got = futures[fptr].result()
                    fptr += 1
                    if got:
                        uri = got
                yield (dialogue_html(name, shown, streaming=True),
                       gr.update(), gr.update(), uri, gr.update())
            if event.final is not None:
                final = event.final

        # Flush the trailing partial sentence (the last line rarely ends with whitespace).
        tail = scrub_spoken(raw[cursor:].strip(), breaking=breaking)
        if tail:
            shown = f"{shown} {tail}".strip()
            if pool is not None:
                futures.append(pool.submit(controller.speak, tail))

        # The engine's final spoken line is the authoritative (already-scrubbed) text.
        line = final.turn.spoken if final else (shown or "...")
        note = notebook_html(controller.session.state, controller.case)
        # Clear the evidence selection so it is presented ONLY on the turn it was chosen
        # (otherwise it would silently re-present every subsequent question).
        clear_ev = gr.update(value=None)
        # Drain any remaining audio chunks in order, one yield each so every chunk plays.
        while fptr < len(futures):
            got = futures[fptr].result()
            fptr += 1
            yield (dialogue_html(name, line), clear_ev, note, (got or gr.update()), gr.update())
        yield (dialogue_html(name, line), clear_ev, note, gr.update(), gr.update())
    finally:
        if pool is not None:
            # If the reply was cut short (the generator was closed or an error escaped),
            # drop the sentences still queued for synthesis so they do not keep the TTS
            # worker busy after nobody is listening. On the normal path everything has
            # already been drained and this loop is a no-op.
            for pending in futures[fptr:]:
                pending.cancel()
            pool.shutdown(wait=False)


def _search(controller: GameController | None, loc_name: str | None):
    """Search the room ``loc_name`` and refresh everything that lists evidence.

    Returns updates for (evidence panel, "present evidence" dropdown, notebook,
    "cite evidence" checkbox group). With no session or no room chosen, all four are
    left unchanged.
    """
    # Searching only reveals a room's evidence. It does NOT move the suspect - the stage
    # stays in the interrogation room - so it can never imply a false whereabouts.
    if not (controller is not None and controller.session is not None and loc_name):
        return tuple(gr.update() for _ in range(4))

    controller.search(loc_name)
    evidence_panel = evidence_html(_evidence_items(controller))
    present_choices = gr.update(choices=controller.evidence_choices())
    notebook_panel = notebook_html(controller.session.state, controller.case)
    cite_choices = gr.update(choices=controller.evidence_choices())
    return evidence_panel, present_choices, notebook_panel, cite_choices


def _add_note(controller: GameController | None, text: str):
    """Append the player's own note to the notebook.

    Returns ``(notebook update, "")``; the empty string clears the note input box. A
    blank note, or a missing session, leaves the notebook unchanged.
    """
    has_session = controller is not None and controller.session is not None
    if has_session and text.strip():
        controller.add_note(text)
        return notebook_html(controller.session.state, controller.case), ""
    return gr.update(), ""


def _accuse(controller: GameController | None, accused: str | None, weapon: str | None,
            motive: str | None, cited: list[str] | None):
    """Resolve the player's accusation and reveal the verdict panel.

    The chosen weapon and motive are compared against the case's true values here; the
    controller receives only the two booleans plus the accused name and cited evidence.
    When nobody has been named yet, the panel shows a prompt instead of a verdict.
    """
    if controller is None or controller.session is None or not accused:
        prompt = "<div class='cz-verdict'>Name who you are accusing first.</div>"
        return gr.update(value=prompt, visible=True)

    truth = controller.case
    weapon_ok = weapon == truth.weapon.name
    motive_ok = motive == truth.culprit.true_motive.category.value
    evidence_cited = cited or []
    verdict = controller.accuse(accused, weapon_ok, motive_ok, evidence_cited)
    # Up to this point the verdict panel is hidden, so there is no empty box on screen
    # before the player accuses.
    panel = verdict_html(controller.case, verdict)
    return gr.update(value=panel, visible=True)


# Music toggle is driven entirely by the actual <audio> element state (no race with a
# Python state flip). A 'cz-on' class lights the button when the track is playing.
# Shared JS snippet that looks up the music button; both handlers below splice it in.
_MUSIC_BTN_FIND = "var b=document.getElementById('czmusicbtn');"
# Click handler for the music button: play the 'cz-bgm' track if it is paused, pause it
# otherwise, then set the button's 'cz-on' class from the element's paused flag.
_MUSIC_TOGGLE_JS = (
    "() => { var m=document.getElementById('cz-bgm'); if(!m)return;"
    "if(m.paused){m.volume=0.3;m.play().catch(function(){});}else{m.pause();}"
    + _MUSIC_BTN_FIND + "if(b)b.classList.toggle('cz-on', !m.paused); }"
)
# Handler that starts the music (via window.czMusicOn, when defined) and lights the button.
_MUSIC_ON_JS = (
    "() => { if(window.czMusicOn) czMusicOn();" + _MUSIC_BTN_FIND + "if(b)b.classList.add('cz-on'); }"
)


def _overlay_html(mode: str) -> str:
    """Full-screen loading overlay, shared by model warmup and case generation."""
    if mode == "generate":
        head = "A NEW CASE LANDS ON YOUR DESK"
        sub = ("Give it a minute or two, detective - the case file is being assembled: the "
               "victim, the suspects, their alibis, and the evidence.")
        extra = ""
    else:
        head = ""  # no fixed header on warmup; "Opening the precinct" cycles as a flavor line
        sub = ("Hang tight for a minute or two, detective - first boot warms up the case room, "
               "the suspects, and the interrogation lamp.")
        extra = how_to_play_html()
    head_html = f"<div class='cz-overlay-head'>{head}</div>" if head else ""
    return (
        "<div class='cz-overlay'><div class='cz-overlay-inner'>"
        "<div class='cz-bigtitle'>CASE&nbsp;ZERO</div>"
        f"{head_html}"
        "<div id='cz-flavor' class='cz-flavor'>Opening the precinct...</div>"
        "<div class='cz-loadbar'><span></span></div>"
        f"<div class='cz-muted' style='text-align:center;margin-bottom:12px'>{sub}</div>"
        f"{extra}</div></div>"
    )


def _warmup(controller: GameController | None):
    """First-load handler, run while the full-screen overlay is showing.

    Makes sure background case generation is running, kicks off a best-effort model
    warmup on a daemon thread, then waits for the first buffered case (generating one
    live if the wait fails). Yields once: hide the overlay, show the game column, and
    the full UI state for the case.
    """
    import threading

    settings = get_settings()
    start_case_buffer(settings)  # make sure the background generator is running
    tts = _obtain_shared_tts(settings)

    def _warm_backend() -> None:
        # One-token generation so the first real question is responsive. Best-effort.
        try:
            from ..llm.backend import GenParams

            backend = _obtain_shared_backend(settings)
            backend.generate("ok", GenParams(max_tokens=1, temperature=0.0))
        except Exception:
            pass

    def _warm_voice() -> None:
        # Synthesize a throwaway clip so the first voiced line is responsive, then
        # delete the file it produced. Best-effort.
        try:
            from ..schemas.suspect import VoiceAssignment

            voice = VoiceAssignment(engine="supertonic", speaker_id=0)
            scratch = tts.synth_to_file("Ready.", voice, _TTS_DIR / "warm.wav")
            if scratch:
                Path(scratch).unlink(missing_ok=True)
        except Exception:
            pass

    def _warm() -> None:
        _warm_backend()
        _warm_voice()

    # Warm in the background so the briefing is revealed as soon as a case is ready.
    threading.Thread(target=_warm, daemon=True).start()

    controller = controller or GameController()
    # Prefer the first background-generated case; generate live if that does not pan out.
    served_from_buffer = controller.start_buffered(wait_secs=600.0)
    if not served_from_buffer:
        controller.start("generate", seed=random.randint(1, 999_999))
    yield (gr.update(visible=False), gr.update(visible=True), *_full_state(controller))


# Sentence-chunked playback: each synthesized chunk is queued and played in order, and
# the suspect's mouth moves while the queue is draining (stops when it empties).
# The handler receives the new chunk value `u`, pushes it onto window.czQ, and - unless
# window.czPlaying says a chain is already running - starts playing. Each clip advances
# to the next one on 'ended', on 'error', or when play() is rejected, so one bad chunk
# does not stall the queue. The clip currently playing is kept in window.czCur.
_TTS_PLAY_JS = (
    "(u)=>{ if(!u) return; try{"
    "window.czQ = window.czQ || [];"
    "var s=document.getElementById('cz-sprite');"
    "window.czQ.push(u);"
    "if(window.czPlaying) return;"
    "var play=function(){"
    "  if(!window.czQ.length){ window.czPlaying=false; if(s)s.classList.remove('talking'); return; }"
    "  window.czPlaying=true; if(s)s.classList.add('talking');"
    "  var a=new Audio(window.czQ.shift()); a.volume=0.95; window.czCur=a;"
    "  var nx=function(){ play(); };"
    "  a.addEventListener('ended',nx); a.addEventListener('error',nx);"
    "  a.play().catch(nx);"
    "}; play();"
    "}catch(e){} }"
)


def build_app() -> gr.Blocks:
    """Assemble the Case Zero UI and wire up all of its events.

    Lays out the music button, the loading overlay and the (initially hidden) game
    column, then connects each control to its handler. Most controls get a Python
    handler plus a separate JS-only listener for sound effects or audio control.

    Returns:
        The configured ``gr.Blocks`` app. Nothing is launched here.
    """
    sfx, music = _load_audio()

    with gr.Blocks(css=build_css(), title="Case Zero", theme=gr.themes.Base(),
                   analytics_enabled=False) as demo:
        # Per-session GameController; starts as None and is filled in by _warmup.
        state = gr.State(None)

        # JS-only load event: creates the <audio> elements and the window.cz* helpers.
        demo.load(None, None, None, js=_audio_setup_js(sfx, music))

        # Music toggle floats above everything (reachable on the loading overlay too).
        # A compact note icon - small and elegant on phones (lights amber when playing).
        music_btn = gr.Button("♪", elem_id="czmusicbtn", elem_classes=["cz-music"])

        # Full-screen overlay (warmup + live generation). Its content is position:fixed, so
        # its Gradio wrapper is collapsed (cz-overlay-host) to avoid an empty box in flow.
        overlay = gr.HTML(value=_overlay_html("warmup"), visible=True,
                          elem_classes=["cz-overlay-host"])

        # Title + credit live INSIDE the game column so the loading screen shows only the
        # overlay (no stray header/footer boxes during warmup).
        with gr.Column(visible=False) as game:
            gr.HTML("<div id='cz-title'>CASE&nbsp;ZERO</div>"
                    "<div id='cz-subtitle'>interrogate &middot; investigate &middot; accuse</div>")
            with gr.Row():
                new_btn = gr.Button("New Case")
            briefing = gr.HTML(value=how_to_play_html())
            with gr.Row(equal_height=False):
                # Left column: suspect roster and room search.
                with gr.Column(scale=3):
                    gr.HTML("<h3>Suspects</h3>")
                    roster = gr.Gallery(columns=2, height=330, show_label=False, allow_preview=False,
                                        object_fit="contain", elem_classes="cz-panel")
                    gr.HTML("<h3>Investigate</h3>")
                    loc_dd = gr.Dropdown(label="Search a room", choices=[])
                    search_btn = gr.Button("Search Room")
                # Centre column: the stage, the dialogue box and the question controls.
                with gr.Column(scale=6):
                    stage = gr.HTML()
                    dialogue = gr.HTML()
                    evidence_dd = gr.Dropdown(label="Present evidence (search rooms first)",
                                              choices=[])
                    question = gr.Textbox(label="Ask the suspect",
                                          placeholder="Where were you that night?", lines=1)
                    ask_btn = gr.Button("Ask")
                    # Hidden textbox that carries each voiced chunk to the browser; its
                    # change event (wired below) triggers playback.
                    tts_audio = gr.Textbox(visible=False, elem_id="cz-tts")
                # Right column: collected evidence and the notebook.
                with gr.Column(scale=3):
                    gr.HTML("<h3>Evidence</h3>")
                    evidence_box = gr.HTML()
                    gr.HTML("<h3>Notebook</h3>")
                    notebook = gr.HTML()
                    note_in = gr.Textbox(placeholder="Jot a note to yourself...", lines=1,
                                         show_label=False, elem_classes="cz-note-in")
                    note_btn = gr.Button("Add Note")

            gr.HTML("<h3>Make Your Accusation</h3>")
            with gr.Row():
                accused_dd = gr.Dropdown(label="The killer is", choices=[])
                weapon_dd = gr.Dropdown(label="With the", choices=[])
                motive_dd = gr.Dropdown(label="Because of", choices=[])
            cited_cg = gr.CheckboxGroup(label="Citing this evidence", choices=[])
            accuse_btn = gr.Button("Accuse", elem_classes="cz-accuse")
            verdict = gr.HTML(visible=False)  # appears only after an accusation is made

            gr.HTML(
                "<div class='cz-credit'>Case Zero &middot; Story &amp; suspects: Qwen2.5-1.5B "
                "(local, via llama.cpp) &middot; Voices: Supertonic (local on-device TTS) "
                "&middot; Music: \"Backbay Lounge\" by Kevin MacLeod (incompetech.com), licensed "
                "under Creative Commons Attribution 4.0 (CC BY 4.0 - free to use with credit)</div>"
            )

        # Warm the models AND serve the first case from the buffer behind the overlay.
        # Order matters: overlay, game, then the 14 values returned by _full_state.
        warm_outputs = [overlay, game, state, briefing, roster, stage, dialogue, loc_dd,
                        evidence_dd, evidence_box, notebook, accused_dd, weapon_dd, motive_dd,
                        cited_cg, verdict]
        demo.load(_warmup, [state], warm_outputs)

        # Same list without the game column: overlay, then the 14 _full_state values.
        new_outputs = [overlay, state, briefing, roster, stage, dialogue, loc_dd, evidence_dd,
                       evidence_box, notebook, accused_dd, weapon_dd, motive_dd, cited_cg, verdict]
        new_btn.click(_new_case, [state], new_outputs)
        new_btn.click(None, None, None, js=_MUSIC_ON_JS)
        music_btn.click(None, None, None, js=_MUSIC_TOGGLE_JS)

        roster.select(_select_suspect, [state], [stage, dialogue])
        roster.select(None, None, None, js="() => { if(window.czSfx) czSfx('select'); }")

        # Matches the 5-tuples yielded by _ask.
        ask_outputs = [dialogue, evidence_dd, notebook, tts_audio, evidence_box]
        # Stop any leftover voice and clear the queue before a new reply starts streaming.
        ask_js = ("() => { try{ if(window.czCur){window.czCur.pause();} window.czQ=[];"
                  " window.czPlaying=false; var s=document.getElementById('cz-sprite');"
                  " if(s)s.classList.remove('talking'); }catch(e){}"
                  " if(window.czSfx) czSfx('select'); }")
        # The Ask button and pressing Enter in the question box are wired identically.
        ask_btn.click(_ask, [state, question, evidence_dd], ask_outputs)
        ask_btn.click(None, None, None, js=ask_js)
        ask_btn.click(lambda: "", None, question)  # clear the question box after asking
        question.submit(_ask, [state, question, evidence_dd], ask_outputs)
        question.submit(None, None, None, js=ask_js)
        question.submit(lambda: "", None, question)
        # Speak the line and sync the mouth to the actual audio playback.
        tts_audio.change(None, [tts_audio], None, js=_TTS_PLAY_JS)

        # Matches the 4-tuple returned by _search.
        search_outputs = [evidence_box, evidence_dd, notebook, cited_cg]
        search_btn.click(_search, [state, loc_dd], search_outputs)
        search_btn.click(None, None, None, js="() => { if(window.czSfx) czSfx('page'); }")

        # Let the player add their own notebook entries.
        note_btn.click(_add_note, [state, note_in], [notebook, note_in])
        note_in.submit(_add_note, [state, note_in], [notebook, note_in])

        accuse_btn.click(_accuse, [state, accused_dd, weapon_dd, motive_dd, cited_cg], [verdict])
        accuse_btn.click(None, None, None, js="() => { if(window.czSfx) czSfx('accuse'); }")

    return demo