Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """Dictation Trainer — Gradio Space (specs/gradio.md). | |
| A thin, mobile-first wrapper over three Modal endpoints. No models run here: | |
| every inference is an HTTP call out to Modal. | |
| Generate: word list + level --LLM--> German dictation --TTS--> audio | |
| Check: photo of handwriting --OCR(blind)--> text --grade--> diff + score | |
| Run locally from this directory (the Space root): | |
| MODAL_LLM_URL=... MODAL_TTS_URL=... MODAL_OCR_URL=... uv run python app.py | |
| """ | |
| import base64 | |
| import json | |
| import os | |
| import tempfile | |
| import time | |
| import gradio as gr | |
| from loguru import logger | |
| # Flat imports: this file is the Space entrypoint, run with space/ as the root. | |
| from diff_html import render_report_html | |
| from ocr.grading import grade | |
| from ocr.transcribe import transcribe_image | |
| from openai_client import make_client | |
| from prompts import ( | |
| DICTATION_SYSTEM_PROMPT, | |
| build_user_prompt, | |
| clean_dictation, | |
| parse_word_list, | |
| ) | |
| from wizard import nav | |
# Model id sent as `model` on every chat-completion request (see call_llm).
LLM_MODEL = "LiquidAI/LFM2.5-8B-A1B-GGUF"
# Higgs ignores the model field today, but the OpenAI SDK requires one; keep it
# descriptive in case the server starts validating it.
TTS_MODEL = "bosonai/higgs-audio-v3-tts-4b"
TTS_VOICE = "alba"
# Sampling fixed per spec §6 — deterministic, repeatable dictations. top_k and
# repeat_penalty aren't OpenAI-standard params, so they ride in extra_body.
# NOTE: call_llm passes this whole dict as extra_body, so temperature and top_p
# travel the same way rather than as named SDK arguments.
LLM_SAMPLING = {
    "temperature": 0.1,
    "top_p": 0.1,
    "top_k": 50,
    "repeat_penalty": 1.05,
}
# LFM2.5 is a reasoning model: it emits a chain-of-thought into `reasoning_content`
# BEFORE the answer goes into `content`. The token budget must cover BOTH, or the
# reasoning consumes it all and `content` comes back empty (finish_reason=length).
# The verbose system prompt lengthens the reasoning, so keep this well above the
# ~900 tokens of CoT we observed. See space/debug_llm.py.
LLM_MAX_TOKENS = 2048
# Language code handed to grade() when scoring a transcription (see check).
LANG = "de"
# Appended to both CTA status messages (see the _begin lambdas in build_ui).
COLD_START_HINT = "First call after idle can take ~30-60s while backend warms up."
# Calm, modern theme: indigo/violet accents on slate neutrals, a soft gradient
# page, white cards with gentle shadows, roomy radius + spacing. Most of the look
# lives here (theme variables are version-stable); CSS below only does the things
# themes can't (phone framing, gradient title, status pill).
# Handed to launch(theme=...) in the __main__ block rather than to gr.Blocks.
THEME = gr.themes.Soft(
    primary_hue=gr.themes.colors.indigo,
    secondary_hue=gr.themes.colors.sky,
    neutral_hue=gr.themes.colors.slate,
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    radius_size=gr.themes.sizes.radius_lg,
    spacing_size=gr.themes.sizes.spacing_lg,
    text_size=gr.themes.sizes.text_md,
).set(
    # Page background and base text colors.
    body_background_fill="linear-gradient(160deg, #eef2fb 0%, #e9ecf7 45%, #ece9f7 100%)",
    body_background_fill_dark="linear-gradient(160deg, #0f1422 0%, #141b2d 100%)",
    body_text_color="#1e293b",
    body_text_color_subdued="#64748b",
    # Cards (blocks): borderless, rounded, softly shadowed.
    block_background_fill="#ffffff",
    block_background_fill_dark="#1a2234",
    block_border_width="0px",
    block_radius="18px",
    block_shadow="0 6px 24px rgba(30, 41, 59, 0.08)",
    block_shadow_dark="0 6px 24px rgba(0, 0, 0, 0.40)",
    block_padding="20px",
    layout_gap="14px",
    # Inputs.
    input_background_fill="#f8fafc",
    input_background_fill_dark="#0f1626",
    input_radius="12px",
    # Buttons: the primary CTA uses the same indigo→violet gradient as the title.
    button_large_radius="12px",
    button_large_padding="11px 18px",
    button_primary_background_fill="linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%)",
    button_primary_background_fill_hover="linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
    button_primary_background_fill_dark="linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%)",
    button_primary_text_color="#ffffff",
    button_primary_shadow="0 4px 14px rgba(99, 102, 241, 0.35)",
    button_primary_shadow_hover="0 6px 18px rgba(99, 102, 241, 0.45)",
)
# Portrait-phone framing + the bits the theme can't express: a narrow centered
# column, a gradient app title, and the centered status (spinner) pill.
# Handed to launch(css=...) in the __main__ block. The class names used here are
# attached in this file via HEADER_HTML (app-header, app-logo, app-title) and via
# elem_classes in build_ui (intro, panel-title, status).
MOBILE_CSS = """
.gradio-container {
max-width: 480px !important;
margin: 0 auto !important;
padding: 12px 14px 28px !important;
}
.app-header {
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
margin: 12px 0 4px;
}
.app-logo {
height: 128px;
width: auto;
flex: 0 0 auto;
}
.app-title {
font-size: 1.6rem;
font-weight: 700;
background: linear-gradient(135deg, #6366f1, #8b5cf6);
-webkit-background-clip: text;
background-clip: text;
-webkit-text-fill-color: transparent;
}
.intro {
text-align: center;
color: #64748b;
font-size: 0.95rem;
line-height: 1.45;
margin: 0 6px 6px;
}
/* Step title: transparent (no clipped grey strip), larger, and padded so the
card's rounded corner never crops the leading "1 ·". */
.panel-title {
background: transparent !important;
box-shadow: none !important;
border: none !important;
padding: 6px 18px 2px !important;
}
.panel-title h3 {
font-size: 1.35rem !important;
font-weight: 700 !important;
color: #4f46e5 !important;
margin: 0 !important;
line-height: 1.3;
}
.status {
text-align: center;
font-weight: 600;
opacity: 0.9;
}
.status .fa-spinner {
margin-right: 6px;
}
"""
# FontAwesome (CDN) for the animated status spinner (fa-spinner + fa-spin). Loaded
# into <head> at launch; the icon itself is rendered as raw HTML in the status
# fields (see _busy), which is why those Markdowns set sanitize_html=False.
FA_HEAD = (
    '<link rel="stylesheet" '
    'href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/css/all.min.css">'
)
# Icon markup that _busy prepends to every status message.
SPINNER = '<i class="fa-solid fa-spinner fa-spin"></i>'
# Shared by the in-app header (embedded base64) and the PWA / favicon icon.
# Resolved relative to this file, so it does not depend on the working directory.
LOGO_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "assets", "ankira_blue.png"
)
def _logo_data_uri() -> str:
    """Return the Ankira logo as a base64 ``data:image/png`` URI.

    Inlining the image means the header does not depend on Gradio's
    file-serving allowlist. If the file at LOGO_PATH cannot be read, an empty
    string is returned and the header falls back to the ✍️ emoji.
    """
    try:
        with open(LOGO_PATH, "rb") as logo_file:
            raw = logo_file.read()
    except OSError:
        # Missing or unreadable logo is not fatal; the caller handles "".
        return ""
    encoded = base64.b64encode(raw).decode("ascii")
    return f"data:image/png;base64,{encoded}"
# Computed once at import time; "" means the logo file could not be read.
_LOGO_URI = _logo_data_uri()
# Header image, or the ✍️ emoji when no logo data URI is available.
_LOGO_TAG = (
    f'<img class="app-logo" alt="Ankira" src="{_LOGO_URI}">' if _LOGO_URI else "✍️"
)
# Top-of-page header markup, rendered with gr.HTML in build_ui and styled by the
# .app-header / .app-logo / .app-title rules in MOBILE_CSS.
HEADER_HTML = (
    f'<div class="app-header">{_LOGO_TAG}'
    f'<span class="app-title">Ankira: German Dictation Trainer</span></div>'
)
| def _require_env(name: str) -> str: | |
| val = os.environ.get(name) | |
| if not val: | |
| raise gr.Error(f"{name} is not configured (set it in the Space secrets).") | |
| return val | |
def _tts_base_url() -> str:
    """Return the TTS server root derived from MODAL_TTS_URL.

    The variable may hold either the server root or the full speech path.
    Trailing slashes are dropped, then a trailing ``/v1/audio/speech`` or
    ``/audio/speech`` is removed so the client can append the speech path
    itself.

    Raises:
        gr.Error: if MODAL_TTS_URL is not configured.
    """
    base = _require_env("MODAL_TTS_URL").rstrip("/")
    # Longest suffix first, so the "/v1" part is removed along with the rest.
    for tail in ("/v1/audio/speech", "/audio/speech"):
        trimmed = base.removesuffix(tail)
        if trimmed != base:
            return trimmed
    return base
def call_llm(words: list[str], level: str) -> str:
    """Word list + CEFR level -> one German dictation paragraph (cleaned).

    Sends the system prompt plus the user prompt built from ``words`` and
    ``level`` to the chat-completions endpoint at MODAL_LLM_URL, then passes the
    returned message content through ``clean_dictation``.

    Returns:
        The cleaned dictation text. It may be empty; in that case the raw
        response is logged here and the caller decides how to report it.

    Raises:
        gr.Error: if MODAL_LLM_URL is not configured.
    """
    client = make_client(_require_env("MODAL_LLM_URL"))
    completion = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[
            {"role": "system", "content": DICTATION_SYSTEM_PROMPT},
            {"role": "user", "content": build_user_prompt(words, level)},
        ],
        max_tokens=LLM_MAX_TOKENS,
        # The whole sampling dict (including temperature/top_p) goes in the body.
        extra_body=LLM_SAMPLING,
    )
    data = completion.model_dump()
    # A .get() default only applies when the key is absent, so it would not
    # cover an empty or null `choices` list. Fall back explicitly so such a
    # response reaches the empty-dictation path below instead of raising
    # IndexError/TypeError here.
    choices = data.get("choices") or [{}]
    choice = choices[0] or {}
    content = (choice.get("message") or {}).get("content")
    text = clean_dictation(content)
    if not text:
        # Evidence at the LLM boundary: see exactly what came back when empty.
        logger.error(
            "Empty dictation from LLM. finish_reason={} raw_content={!r}\nfull response: {}",
            choice.get("finish_reason"),
            content,
            json.dumps(data, ensure_ascii=False)[:2000],
        )
    return text
def call_tts(text: str) -> str:
    """Synthesize ``text`` and return the path of a temporary audio file.

    The file suffix is chosen from the response Content-Type (``.mp3`` when it
    mentions "mpeg", otherwise ``.wav``) so gr.Audio plays it without
    transcoding. The file is created with ``delete=False`` and is not removed
    here.
    """
    tts_client = make_client(_tts_base_url())
    speech = tts_client.audio.speech.create(
        input=text,
        model=TTS_MODEL,
        voice=TTS_VOICE,
    )
    payload = speech.read()
    mime = speech.response.headers.get("content-type", "")
    if "mpeg" in mime:
        extension = ".mp3"
    else:
        extension = ".wav"
    with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as out:
        out.write(payload)
    return out.name
| # ---- Step handlers --------------------------------------------------------- | |
| # | |
| # CTA handlers do their work, then append nav(...) so the wizard advances only on | |
| # success; raising gr.Error leaves every output (including the views) untouched, | |
| # so the user stays on the current step to fix the problem. | |
def start(words_raw: str, level: str, state: dict):
    """Input → Listen: generate the dictation text + audio.

    Args:
        words_raw: Raw textbox content; split into words by ``parse_word_list``.
        level: Level string forwarded to the LLM prompt.
        state: Previous browser state. Not read — a fresh state is built and
            returned in its place.

    Returns:
        ``(audio_path, text, state, *nav("listen"), intro_update)``, in the
        order of the ``outputs`` list wired in build_ui. ``audio_path`` is None
        when synthesis failed.

    Raises:
        gr.Error: if no words were entered or the model returned no text.
    """
    words = parse_word_list(words_raw)
    if not words:
        raise gr.Error("Enter at least one word to practice.")
    text = call_llm(words, level)
    if not text:
        logger.error("LLM returned an empty dictation ({} words, {})", len(words), level)
        raise gr.Error("The model returned an empty dictation. Please try again.")
    logger.info("Generated dictation ({} words, {}):\n{}", len(words), level, text)
    state = {"diktat": text, "created_at": time.time()}
    try:
        audio_path = call_tts(text)
    except Exception as e:  # never lose the text because synthesis failed (spec §4)
        # Keep the traceback in the server log; the UI warning alone only
        # carries the exception message.
        logger.exception("TTS synthesis failed; continuing with text only")
        gr.Warning(f"Audio synthesis failed ({e}). Text saved — open 'Show text'.")
        audio_path = None
    # trailing update hides the first-step intro (see the outputs list in build_ui).
    return audio_path, text, state, *nav("listen"), gr.update(visible=False)
def check(image_path: str, state: dict):
    """Upload → Results: transcribe the photo (blind) and grade it.

    Returns ``(transcription, report_html, *nav("results"))``.

    Raises:
        gr.Error: if there is no stored dictation, no photo was provided, or
            MODAL_OCR_URL is not configured.
    """
    reference = state.get("diktat") if state else None
    if not reference:
        raise gr.Error("Generate a dictation first.")
    if not image_path:
        raise gr.Error("Upload a photo of your handwriting first.")

    client = make_client(_require_env("MODAL_OCR_URL"))
    recognized = transcribe_image(image_path, client)
    logger.info("OCR transcription:\n{}", recognized)

    report_html = render_report_html(grade(reference, recognized, LANG))
    return recognized, report_html, *nav("results")
def restart():
    """Results → Input: clear everything and start a fresh dictation.

    The returned tuple follows the ``outputs`` list on the Start-over button in
    build_ui: six cleared fields, the reset state, the view updates for the
    input step, and a final update that re-shows the first-step intro.
    """
    cleared_state = {"diktat": "", "created_at": 0}
    # words_in, audio_out, text_out, image_in, recognized_out, diff_out
    cleared_fields = ("", None, "", None, "", "")
    return (
        *cleared_fields,
        cleared_state,
        *nav("input"),
        gr.update(visible=True),
    )
# Inline progress: the CTA's real outputs all live on the next (hidden) view, so
# Gradio's spinner would paint where the user can't see it. Instead each CTA runs
# as show-status -> work -> hide-status; the hide step is wired on both .then
# (_end) and .failure (_recover), so it runs whether the work succeeds or raises
# (no spinner left stuck on screen).
def _busy(message: str):
    """Return an update that shows a status field with the spinner icon and ``message``."""
    label = f"{SPINNER} {message}"
    return gr.update(visible=True, value=label)
def _idle():
    """Return an update that hides a status field."""
    hidden = gr.update(visible=False)
    return hidden
def _begin(message: str):
    """Show the spinner with ``message`` and disable the CTA so it can't be
    re-fired mid-call. Returns ``(status_update, button_update)``."""
    status_update = _busy(message)
    button_update = gr.update(interactive=False)
    return status_update, button_update
def _end():
    """Hide the spinner and re-enable the CTA; chained with .then after the
    work step in build_ui. Returns ``(status_update, button_update)``."""
    status_update = _idle()
    button_update = gr.update(interactive=True)
    return status_update, button_update
def _recover(*_):
    """Failure-path cleanup, registered with .failure in build_ui.

    Produces the same updates as _end (spinner hidden, CTA re-enabled), so a
    validation error such as 'no photo uploaded' cannot leave the button stuck
    disabled. Any positional arguments passed by the caller are accepted and
    ignored.
    """
    status_update = _idle()
    button_update = gr.update(interactive=True)
    return status_update, button_update
def goto(target: str):
    """Switch to view ``target`` and reset transient UI state.

    Both status spinners are hidden and both CTAs re-enabled, so navigating
    away during a wait leaves no stale spinner or disabled button on the view
    being left. The intro is shown only on the "input" view. The returned tuple
    follows the nav_outputs list wired in build_ui: views, input_status,
    upload_status, start_btn, check_btn, intro.
    """
    view_updates = nav(target)
    # input_status, upload_status
    status_updates = [gr.update(visible=False) for _ in range(2)]
    # start_btn, check_btn
    cta_updates = [gr.update(interactive=True) for _ in range(2)]
    # intro (first step only)
    intro_update = gr.update(visible=(target == "input"))
    return (*view_updates, *status_updates, *cta_updates, intro_update)
| # ---- UI -------------------------------------------------------------------- | |
def build_ui() -> gr.Blocks:
    """Assemble the four-view wizard and wire its events.

    Creates the header, the intro text and the input / listen / upload /
    results groups, then attaches the Generate, Check, Start-over and
    navigation handlers. The app is returned without being launched; theme,
    CSS and head markup are supplied by the caller at launch().
    """
    with gr.Blocks(title="Dictation Trainer") as demo:
        # localStorage-backed: survives a tab reload while the learner writes (spec §3).
        state = gr.BrowserState({"diktat": "", "created_at": 0})
        gr.HTML(HEADER_HTML)
        # Shown on the first step only (toggled by start/goto/restart below).
        intro = gr.Markdown(
            "Practice German spelling by ear. Enter a few words, get a short "
            "dictation read aloud, and write it down by hand. Finally photograph "
            "your page for instant word-by-word feedback.",
            elem_classes="intro",
        )
        # Four stacked views; nav() keeps exactly one visible. Order must match
        # wizard.VIEWS.
        with gr.Group(visible=True) as view_input:
            gr.Markdown("### Let's start!", elem_classes="panel-title")
            words_in = gr.Textbox(
                label="Words to practice",
                placeholder="Comma- or newline-separated, e.g. angeblich, ablehnen, Apfel",
                lines=4,
            )
            level_in = gr.Dropdown(["A1", "A2", "B1", "B2"], value="A2", label="Level")
            start_btn = gr.Button("Generate", variant="primary")
            # Hidden until _begin shows it; raw HTML is allowed for the spinner icon.
            input_status = gr.Markdown(visible=False, sanitize_html=False, elem_classes="status")
        with gr.Group(visible=False) as view_listen:
            gr.Markdown("### Listen", elem_classes="panel-title")
            audio_out = gr.Audio(
                type="filepath", interactive=False, label="Dictation audio"
            )
            gr.Markdown("🎧 Listen and write it down on paper, then click **Finished**.")
            with gr.Accordion("Show text (debug)", open=False):
                text_out = gr.Textbox(label="Dictation text", interactive=False, lines=4)
            with gr.Row():
                listen_back_btn = gr.Button("Back")
                finished_btn = gr.Button("Finished", variant="primary")
        with gr.Group(visible=False) as view_upload:
            gr.Markdown("### Upload", elem_classes="panel-title")
            image_in = gr.Image(
                type="filepath",
                sources=["upload", "webcam", "clipboard"],
                label="Photo of your handwriting",
            )
            with gr.Row():
                upload_back_btn = gr.Button("Back")
                check_btn = gr.Button("Check", variant="primary")
            # Hidden until _begin shows it; raw HTML is allowed for the spinner icon.
            upload_status = gr.Markdown(visible=False, sanitize_html=False, elem_classes="status")
        with gr.Group(visible=False) as view_results:
            gr.Markdown("### Results", elem_classes="panel-title")
            recognized_out = gr.Textbox(
                label="Recognized text (OCR)", interactive=False, lines=4
            )
            diff_out = gr.HTML(label="Feedback")
            with gr.Row():
                results_back_btn = gr.Button("Back")
                restart_btn = gr.Button("Start over", variant="primary")
        views = [view_input, view_listen, view_upload, view_results]
        # CTA handlers: disable the button + show status, do the work (advance, or
        # raise gr.Error and stay). _end is chained with .then and _recover with
        # .failure, so however the work step finishes the status is hidden and
        # the button re-enabled.
        # NOTE(review): Gradio's docs describe .then as running regardless of
        # errors, which would make the .failure hook redundant — confirm against
        # the installed version before removing either one.
        start_work = start_btn.click(
            lambda: _begin(f"Generating dictation… {COLD_START_HINT}"),
            outputs=[input_status, start_btn],
            show_progress="hidden",
        ).then(
            start,
            inputs=[words_in, level_in, state],
            outputs=[audio_out, text_out, state, *views, intro],
            # Our _begin spinner is the indicator; suppress Gradio's own overlay,
            # which would otherwise cover the visible card (and our spinner).
            show_progress="hidden",
        )
        start_work.then(_end, outputs=[input_status, start_btn], show_progress="hidden")
        start_work.failure(_recover, outputs=[input_status, start_btn], show_progress="hidden")
        check_work = check_btn.click(
            lambda: _begin(f"Reading your handwriting… {COLD_START_HINT}"),
            outputs=[upload_status, check_btn],
            show_progress="hidden",
        ).then(
            check,
            inputs=[image_in, state],
            outputs=[recognized_out, diff_out, *views],
            # Keep our _begin spinner visible; suppress Gradio's overlay (it would
            # cover the upload card and hide the spinner).
            show_progress="hidden",
        )
        check_work.then(_end, outputs=[upload_status, check_btn], show_progress="hidden")
        check_work.failure(_recover, outputs=[upload_status, check_btn], show_progress="hidden")
        # Output order here must match the tuple returned by restart().
        restart_btn.click(
            restart,
            outputs=[words_in, audio_out, text_out, image_in, recognized_out, diff_out, state, *views, intro],
        )
        # Navigation (Finished + Back buttons): switch view AND clear any leftover
        # spinner / disabled CTA from an action on the view being left. `cancels`
        # aborts an in-flight Start/Check so a call the user navigated away from
        # can't complete and yank them forward (its outputs are discarded).
        # Output order here must match the tuple returned by goto().
        nav_outputs = [*views, input_status, upload_status, start_btn, check_btn, intro]
        in_flight = [start_work, check_work]
        finished_btn.click(lambda: goto("upload"), outputs=nav_outputs, cancels=in_flight)
        listen_back_btn.click(lambda: goto("input"), outputs=nav_outputs, cancels=in_flight)
        upload_back_btn.click(lambda: goto("listen"), outputs=nav_outputs, cancels=in_flight)
        results_back_btn.click(lambda: goto("upload"), outputs=nav_outputs, cancels=in_flight)
    return demo
if __name__ == "__main__":
    app = build_ui()
    # On HF Spaces (SPACE_ID set) the runtime serves the app, so no share tunnel
    # is requested there; locally a share link makes the app reachable from a phone.
    on_spaces = "SPACE_ID" in os.environ
    # The favicon doubles as the home-screen icon; skip it if the file is absent.
    icon_path = LOGO_PATH if os.path.exists(LOGO_PATH) else None
    # theme, css and head all live on launch() in Gradio 6 (moved off Blocks).
    # pwa=True makes the app installable (Gradio generates the manifest + service
    # worker).
    app.launch(
        theme=THEME,
        css=MOBILE_CSS,
        head=FA_HEAD,
        pwa=True,
        favicon_path=icon_path,
        share=not on_spaces,
    )