#!/usr/bin/env python3 """Dictation Trainer — Gradio Space (specs/gradio.md). A thin, mobile-first wrapper over three Modal endpoints. No models run here: every inference is an HTTP call out to Modal. Generate: word list + level --LLM--> German dictation --TTS--> audio Check: photo of handwriting --OCR(blind)--> text --grade--> diff + score Run locally from this directory (the Space root): MODAL_LLM_URL=... MODAL_TTS_URL=... MODAL_OCR_URL=... uv run python app.py """ import base64 import json import os import tempfile import time import gradio as gr from loguru import logger # Flat imports: this file is the Space entrypoint, run with space/ as the root. from diff_html import render_report_html from ocr.grading import grade from ocr.transcribe import transcribe_image from openai_client import make_client from prompts import ( DICTATION_SYSTEM_PROMPT, build_user_prompt, clean_dictation, parse_word_list, ) from wizard import nav LLM_MODEL = "LiquidAI/LFM2.5-8B-A1B-GGUF" # Higgs ignores the model field today, but the OpenAI SDK requires one; keep it # descriptive in case the server starts validating it. TTS_MODEL = "bosonai/higgs-audio-v3-tts-4b" TTS_VOICE = "alba" # Sampling fixed per spec §6 — deterministic, repeatable dictations. top_k and # repeat_penalty aren't OpenAI-standard params, so they ride in extra_body. LLM_SAMPLING = { "temperature": 0.1, "top_p": 0.1, "top_k": 50, "repeat_penalty": 1.05, } # LFM2.5 is a reasoning model: it emits a chain-of-thought into `reasoning_content` # BEFORE the answer goes into `content`. The token budget must cover BOTH, or the # reasoning consumes it all and `content` comes back empty (finish_reason=length). # The verbose system prompt lengthens the reasoning, so keep this well above the # ~900 tokens of CoT we observed. See space/debug_llm.py. LLM_MAX_TOKENS = 2048 LANG = "de" COLD_START_HINT = "First call after idle can take ~30-60s while backend warms up." 
# Calm, modern theme: indigo/violet accents on slate neutrals, a soft gradient
# page, white cards with gentle shadows, roomy radius + spacing. Most of the look
# lives here (theme variables are version-stable); CSS below only does the things
# themes can't (phone framing, gradient title, status pill).

# Step 1: the stock Soft theme with our hues, font stack and sizing presets.
_SOFT_BASE = gr.themes.Soft(
    primary_hue=gr.themes.colors.indigo,
    secondary_hue=gr.themes.colors.sky,
    neutral_hue=gr.themes.colors.slate,
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    radius_size=gr.themes.sizes.radius_lg,
    spacing_size=gr.themes.sizes.spacing_lg,
    text_size=gr.themes.sizes.text_md,
)

# Step 2: per-variable overrides, grouped by the part of the page they style.
_THEME_OVERRIDES = {
    # Page background and body text.
    "body_background_fill": "linear-gradient(160deg, #eef2fb 0%, #e9ecf7 45%, #ece9f7 100%)",
    "body_background_fill_dark": "linear-gradient(160deg, #0f1422 0%, #141b2d 100%)",
    "body_text_color": "#1e293b",
    "body_text_color_subdued": "#64748b",
    # Cards (blocks): borderless, rounded, softly shadowed.
    "block_background_fill": "#ffffff",
    "block_background_fill_dark": "#1a2234",
    "block_border_width": "0px",
    "block_radius": "18px",
    "block_shadow": "0 6px 24px rgba(30, 41, 59, 0.08)",
    "block_shadow_dark": "0 6px 24px rgba(0, 0, 0, 0.40)",
    "block_padding": "20px",
    "layout_gap": "14px",
    # Inputs.
    "input_background_fill": "#f8fafc",
    "input_background_fill_dark": "#0f1626",
    "input_radius": "12px",
    # Buttons: indigo -> violet gradient on the primary call to action.
    "button_large_radius": "12px",
    "button_large_padding": "11px 18px",
    "button_primary_background_fill": "linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%)",
    "button_primary_background_fill_hover": "linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
    "button_primary_background_fill_dark": "linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%)",
    "button_primary_text_color": "#ffffff",
    "button_primary_shadow": "0 4px 14px rgba(99, 102, 241, 0.35)",
    "button_primary_shadow_hover": "0 6px 18px rgba(99, 102, 241, 0.45)",
}

THEME = _SOFT_BASE.set(**_THEME_OVERRIDES)

# Portrait-phone framing + the bits the theme can't express: a narrow centered
# column, a gradient app title, and the centered status (spinner) pill.
MOBILE_CSS = """
.gradio-container {
    max-width: 480px !important;
    margin: 0 auto !important;
    padding: 12px 14px 28px !important;
}
.app-header {
    display: flex;
    align-items: center;
    justify-content: center;
    gap: 12px;
    margin: 12px 0 4px;
}
.app-logo { height: 128px; width: auto; flex: 0 0 auto; }
.app-title {
    font-size: 1.6rem;
    font-weight: 700;
    background: linear-gradient(135deg, #6366f1, #8b5cf6);
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
}
.intro {
    text-align: center;
    color: #64748b;
    font-size: 0.95rem;
    line-height: 1.45;
    margin: 0 6px 6px;
}
/* Step title: transparent (no clipped grey strip), larger, and padded so the
   card's rounded corner never crops the leading "1 ·". */
.panel-title {
    background: transparent !important;
    box-shadow: none !important;
    border: none !important;
    padding: 6px 18px 2px !important;
}
.panel-title h3 {
    font-size: 1.35rem !important;
    font-weight: 700 !important;
    color: #4f46e5 !important;
    margin: 0 !important;
    line-height: 1.3;
}
.status { text-align: center; font-weight: 600; opacity: 0.9; }
.status .fa-spinner { margin-right: 6px; }
"""

# FontAwesome (CDN) for the animated status spinner (fa-spinner + fa-spin). Loaded
# into <head> at launch; the icon itself is rendered as raw HTML in the status
# fields (see _busy), which is why those Markdowns set sanitize_html=False.
#
# NOTE(review): the markup inside FA_HEAD, SPINNER, _LOGO_TAG and HEADER_HTML had
# been stripped to empty strings (the spinner could never render and the
# .app-header / .app-logo / .app-title rules above had nothing to match). It is
# rebuilt here from those CSS selectors and this comment. The exact CDN URL and
# FontAwesome version are an assumption — confirm before shipping.
FA_HEAD = (
    '<link rel="stylesheet" '
    'href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/css/all.min.css">'
)
SPINNER = '<i class="fa-solid fa-spinner fa-spin"></i>'

# Shared by the in-app header (embedded base64) and the PWA / favicon icon.
LOGO_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "assets", "ankira_blue.png"
)


def _logo_data_uri() -> str:
    """Embed the scaled-down Ankira logo as a base64 PNG data URI, so the header
    needs no Gradio file-serving allowlist.

    Returns:
        The ``data:image/png;base64,...`` URI, or an empty string if the file is
        missing or unreadable (the header then falls back to the ✍️ emoji).
    """
    try:
        with open(LOGO_PATH, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("ascii")
    except OSError:
        return ""
    return f"data:image/png;base64,{b64}"


_LOGO_URI = _logo_data_uri()
_LOGO_TAG = (
    f'<img class="app-logo" src="{_LOGO_URI}" alt="Ankira logo">'
    if _LOGO_URI
    else "✍️"
)
HEADER_HTML = (
    f'<div class="app-header">{_LOGO_TAG}'
    '<span class="app-title">Ankira: German Dictation Trainer</span></div>'
)


def _require_env(name: str) -> str:
    """Return the non-empty environment variable *name*.

    Raises:
        gr.Error: if the variable is unset or empty, so the problem is reported
            to the user instead of failing deep inside an HTTP client.
    """
    val = os.environ.get(name)
    if not val:
        raise gr.Error(f"{name} is not configured (set it in the Space secrets).")
    return val


def _tts_base_url() -> str:
    """MODAL_TTS_URL may be the server root or the full speech path; reduce it to
    the server root so the client appends /v1/audio/speech itself."""
    url = _require_env("MODAL_TTS_URL").rstrip("/")
    for suffix in ("/v1/audio/speech", "/audio/speech"):
        if url.endswith(suffix):
            return url.removesuffix(suffix)
    return url


def call_llm(words: list[str], level: str) -> str:
    """Word list + CEFR level -> one German dictation paragraph (cleaned).

    Returns whatever ``clean_dictation`` yields for the first choice's content;
    the caller is responsible for treating an empty result as a failure.
    """
    client = make_client(_require_env("MODAL_LLM_URL"))
    completion = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[
            {"role": "system", "content": DICTATION_SYSTEM_PROMPT},
            {"role": "user", "content": build_user_prompt(words, level)},
        ],
        max_tokens=LLM_MAX_TOKENS,
        extra_body=LLM_SAMPLING,
    )
    data = completion.model_dump()
    # `choices` can be missing, null or an empty list on a malformed response;
    # `.get("choices", [{}])[0]` would raise on the latter two. Fall through to
    # the "empty dictation" path below instead, which logs the full payload.
    choices = data.get("choices") or [{}]
    choice = choices[0] or {}
    content = (choice.get("message") or {}).get("content")
    text = clean_dictation(content)
    if not text:
        # Evidence at the LLM boundary: see exactly what came back when empty.
        logger.error(
            "Empty dictation from LLM. finish_reason={} raw_content={!r}\nfull response: {}",
            choice.get("finish_reason"),
            content,
            json.dumps(data, ensure_ascii=False)[:2000],
        )
    return text


def call_tts(text: str) -> str:
    """Synthesize the dictation; return a temp audio file path.

    Suffix follows the response Content-Type so gr.Audio plays it without
    transcoding. The file is created with ``delete=False`` and is not removed
    here, since the returned path is handed to the audio component.
    """
    client = make_client(_tts_base_url())
    response = client.audio.speech.create(model=TTS_MODEL, voice=TTS_VOICE, input=text)
    audio = response.read()
    # Media types are case-insensitive, and some servers label MP3 as the
    # non-standard "audio/mp3" rather than "audio/mpeg".
    content_type = response.response.headers.get("content-type", "").lower()
    is_mp3 = "mpeg" in content_type or "mp3" in content_type
    suffix = ".mp3" if is_mp3 else ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
        f.write(audio)
    return f.name


def _empty_state() -> dict:
    """Fresh 'no dictation yet' browser state (a new dict on every call)."""
    return {"diktat": "", "created_at": 0}


# ---- Step handlers ---------------------------------------------------------
#
# CTA handlers do their work, then append nav(...) so the wizard advances only on
# success; raising gr.Error leaves every output (including the views) untouched,
# so the user stays on the current step to fix the problem.


def start(words_raw: str, level: str, state: dict):
    """Input → Listen: generate the dictation text + audio.

    The incoming ``state`` is replaced wholesale by a new dict holding the
    generated text and a creation timestamp.

    Raises:
        gr.Error: if no words were entered or the model returned nothing.
    """
    words = parse_word_list(words_raw)
    if not words:
        raise gr.Error("Enter at least one word to practice.")

    text = call_llm(words, level)
    if not text:
        logger.error("LLM returned an empty dictation ({} words, {})", len(words), level)
        raise gr.Error("The model returned an empty dictation. Please try again.")
    logger.info("Generated dictation ({} words, {}):\n{}", len(words), level, text)

    state = {"diktat": text, "created_at": time.time()}
    try:
        audio_path = call_tts(text)
    except Exception as e:  # never lose the text because synthesis failed (spec §4)
        # Deliberate best-effort, but keep the traceback in the server log: the
        # toast below is the only other trace of what went wrong.
        logger.exception("TTS synthesis failed; continuing with text only")
        gr.Warning(f"Audio synthesis failed ({e}). Text saved — open 'Show text'.")
        audio_path = None

    # trailing update hides the first-step intro (see the outputs list in build_ui).
    return (audio_path, text, state, *nav("listen"), gr.update(visible=False))


def check(image_path: str, state: dict):
    """Upload → Results: transcribe the photo (blind) and grade it.

    Raises:
        gr.Error: if there is no stored dictation or no uploaded image.
    """
    if not state or not state.get("diktat"):
        raise gr.Error("Generate a dictation first.")
    if not image_path:
        raise gr.Error("Upload a photo of your handwriting first.")

    ocr_client = make_client(_require_env("MODAL_OCR_URL"))
    transcription = transcribe_image(image_path, ocr_client)
    logger.info("OCR transcription:\n{}", transcription)
    report = grade(state["diktat"], transcription, LANG)
    return (transcription, render_report_html(report), *nav("results"))


def restart():
    """Results → Input: clear everything and start a fresh dictation."""
    # order matches the `outputs` list on the Start-over button in build_ui;
    # trailing update re-shows the first-step intro.
    return (
        "",  # words_in
        None,  # audio_out
        "",  # text_out
        None,  # image_in
        "",  # recognized_out
        "",  # diff_out
        _empty_state(),  # state
        *nav("input"),
        gr.update(visible=True),  # intro
    )


# Inline progress: the CTA's real outputs all live on the next (hidden) view, so
# Gradio's spinner would paint where the user can't see it. Instead each CTA runs
# as show-status -> work -> hide-status. The hide step is wired twice: on .then
# for the success path (_end) and on .failure for the error path (_recover),
# because a raised gr.Error aborts the chained .then — without both, a failure
# would leave the spinner stuck on screen.
def _busy(message: str):
    """Update that shows the status line with the spinner icon and *message*."""
    return gr.update(value=f"{SPINNER} {message}", visible=True)


def _idle():
    """Update that hides the status line."""
    return gr.update(visible=False)


def _begin(message: str):
    """Show the spinner and disable the CTA so it can't be re-fired mid-call."""
    return _busy(message), gr.update(interactive=False)


def _end():
    """Hide the spinner and re-enable the CTA on success (.then path)."""
    return _idle(), gr.update(interactive=True)


def _recover(*_):
    """Same cleanup for the failure path.

    A raised gr.Error aborts the chained .then, so re-enabling has to be wired
    via .failure. Any positional arguments are accepted and ignored. Without
    this, a validation error like 'no photo uploaded' would leave the button
    stuck disabled.
    """
    return _end()


def goto(target: str):
    """Switch to a view and clear transient state (both status spinners hidden,
    both CTAs re-enabled).

    Navigating away during a wait shouldn't leave a stale spinner or a disabled
    button behind on the view you left. Return order matches the nav_outputs
    list wired in build_ui.
    """
    return (
        *nav(target),
        gr.update(visible=False),  # input_status
        gr.update(visible=False),  # upload_status
        gr.update(interactive=True),  # start_btn
        gr.update(interactive=True),  # check_btn
        gr.update(visible=target == "input"),  # intro (first step only)
    )


# ---- UI --------------------------------------------------------------------


def build_ui() -> gr.Blocks:
    """Assemble the four-step wizard (input → listen → upload → results) and wire
    its events; returns the Blocks app without launching it."""
    with gr.Blocks(title="Dictation Trainer") as demo:
        # localStorage-backed: survives a tab reload while the learner writes (spec §3).
        state = gr.BrowserState(_empty_state())

        gr.HTML(HEADER_HTML)
        # Shown on the first step only (toggled by start/goto/restart below).
        intro = gr.Markdown(
            "Practice German spelling by ear. Enter a few words, get a short "
            "dictation read aloud, and write it down by hand. Finally photograph "
            "your page for instant word-by-word feedback.",
            elem_classes="intro",
        )

        # Four stacked views; nav() keeps exactly one visible. Order must match
        # wizard.VIEWS.
        with gr.Group(visible=True) as view_input:
            gr.Markdown("### Let's start!", elem_classes="panel-title")
            words_in = gr.Textbox(
                label="Words to practice",
                placeholder="Comma- or newline-separated, e.g.\nangeblich, ablehnen, Apfel",
                lines=4,
            )
            level_in = gr.Dropdown(["A1", "A2", "B1", "B2"], value="A2", label="Level")
            start_btn = gr.Button("Generate", variant="primary")
            input_status = gr.Markdown(
                visible=False, sanitize_html=False, elem_classes="status"
            )

        with gr.Group(visible=False) as view_listen:
            gr.Markdown("### Listen", elem_classes="panel-title")
            audio_out = gr.Audio(
                type="filepath", interactive=False, label="Dictation audio"
            )
            gr.Markdown("🎧 Listen and write it down on paper, then click **Finished**.")
            with gr.Accordion("Show text (debug)", open=False):
                text_out = gr.Textbox(label="Dictation text", interactive=False, lines=4)
            with gr.Row():
                listen_back_btn = gr.Button("Back")
                finished_btn = gr.Button("Finished", variant="primary")

        with gr.Group(visible=False) as view_upload:
            gr.Markdown("### Upload", elem_classes="panel-title")
            image_in = gr.Image(
                type="filepath",
                sources=["upload", "webcam", "clipboard"],
                label="Photo of your handwriting",
            )
            with gr.Row():
                upload_back_btn = gr.Button("Back")
                check_btn = gr.Button("Check", variant="primary")
            upload_status = gr.Markdown(
                visible=False, sanitize_html=False, elem_classes="status"
            )

        with gr.Group(visible=False) as view_results:
            gr.Markdown("### Results", elem_classes="panel-title")
            recognized_out = gr.Textbox(
                label="Recognized text (OCR)", interactive=False, lines=4
            )
            diff_out = gr.HTML(label="Feedback")
            with gr.Row():
                results_back_btn = gr.Button("Back")
                restart_btn = gr.Button("Start over", variant="primary")

        views = [view_input, view_listen, view_upload, view_results]

        # CTA handlers: disable the button + show status, do the work (advance, or
        # raise gr.Error and stay). On success .then clears + re-enables; on error
        # .failure does the same (the raise aborts .then), so the button is never
        # left stuck disabled.
        start_work = start_btn.click(
            lambda: _begin(f"Generating dictation… {COLD_START_HINT}"),
            outputs=[input_status, start_btn],
            show_progress="hidden",
        ).then(
            start,
            inputs=[words_in, level_in, state],
            outputs=[audio_out, text_out, state, *views, intro],
            # Our _begin spinner is the indicator; suppress Gradio's own overlay,
            # which would otherwise cover the visible card (and our spinner).
            show_progress="hidden",
        )
        start_work.then(_end, outputs=[input_status, start_btn], show_progress="hidden")
        start_work.failure(
            _recover, outputs=[input_status, start_btn], show_progress="hidden"
        )

        check_work = check_btn.click(
            lambda: _begin(f"Reading your handwriting… {COLD_START_HINT}"),
            outputs=[upload_status, check_btn],
            show_progress="hidden",
        ).then(
            check,
            inputs=[image_in, state],
            outputs=[recognized_out, diff_out, *views],
            # Keep our _begin spinner visible; suppress Gradio's overlay (it would
            # cover the upload card and hide the spinner).
            show_progress="hidden",
        )
        check_work.then(_end, outputs=[upload_status, check_btn], show_progress="hidden")
        check_work.failure(
            _recover, outputs=[upload_status, check_btn], show_progress="hidden"
        )

        restart_btn.click(
            restart,
            outputs=[
                words_in,
                audio_out,
                text_out,
                image_in,
                recognized_out,
                diff_out,
                state,
                *views,
                intro,
            ],
        )

        # Navigation (Finished + Back buttons): switch view AND clear any leftover
        # spinner / disabled CTA from an action on the view being left. `cancels`
        # aborts an in-flight Start/Check so a call the user navigated away from
        # can't complete and yank them forward (its outputs are discarded).
        nav_outputs = [*views, input_status, upload_status, start_btn, check_btn, intro]
        in_flight = [start_work, check_work]
        finished_btn.click(lambda: goto("upload"), outputs=nav_outputs, cancels=in_flight)
        listen_back_btn.click(lambda: goto("input"), outputs=nav_outputs, cancels=in_flight)
        upload_back_btn.click(lambda: goto("listen"), outputs=nav_outputs, cancels=in_flight)
        results_back_btn.click(lambda: goto("upload"), outputs=nav_outputs, cancels=in_flight)

    return demo


if __name__ == "__main__":
    # On HF Spaces (SPACE_ID set) the runtime serves the app — don't request a
    # share tunnel there; locally, share=True gives a link usable from a phone.
    # theme, css and head all live on launch() in Gradio 6 (moved off Blocks).
    # pwa=True makes the app installable (Gradio generates the manifest + service
    # worker); the favicon doubles as the home-screen icon.
    build_ui().launch(
        theme=THEME,
        css=MOBILE_CSS,
        head=FA_HEAD,
        pwa=True,
        favicon_path=LOGO_PATH if os.path.exists(LOGO_PATH) else None,
        share="SPACE_ID" not in os.environ,
    )