#!/usr/bin/env python3
"""Dictation Trainer — Gradio Space (specs/gradio.md).
A thin, mobile-first wrapper over three Modal endpoints. No models run here:
every inference is an HTTP call out to Modal.
Generate: word list + level --LLM--> German dictation --TTS--> audio
Check: photo of handwriting --OCR(blind)--> text --grade--> diff + score
Run locally from this directory (the Space root):
MODAL_LLM_URL=... MODAL_TTS_URL=... MODAL_OCR_URL=... uv run python app.py
"""
import base64
import json
import os
import tempfile
import time
import gradio as gr
from loguru import logger
# Flat imports: this file is the Space entrypoint, run with space/ as the root.
from diff_html import render_report_html
from ocr.grading import grade
from ocr.transcribe import transcribe_image
from openai_client import make_client
from prompts import (
DICTATION_SYSTEM_PROMPT,
build_user_prompt,
clean_dictation,
parse_word_list,
)
from wizard import nav
# Model id sent with every chat-completion request (see call_llm).
LLM_MODEL = "LiquidAI/LFM2.5-8B-A1B-GGUF"

# The OpenAI SDK insists on a model name even though Higgs currently ignores
# it; a descriptive value is kept in case the server starts validating it.
TTS_MODEL = "bosonai/higgs-audio-v3-tts-4b"
TTS_VOICE = "alba"

# Sampling is pinned per spec §6 so dictations are repeatable. top_k and
# repeat_penalty are not OpenAI-standard parameters, so the whole mapping is
# forwarded through extra_body (see call_llm).
LLM_SAMPLING = dict(
    temperature=0.1,
    top_p=0.1,
    top_k=50,
    repeat_penalty=1.05,
)

# LFM2.5 is a reasoning model: the chain-of-thought lands in `reasoning_content`
# BEFORE the answer is written to `content`, and both draw on one token budget.
# If the budget is too small the reasoning eats all of it and `content` comes
# back empty (finish_reason=length). The verbose system prompt lengthens the
# reasoning, so stay well above the ~900 CoT tokens observed. See
# space/debug_llm.py.
LLM_MAX_TOKENS = 2048

# Language code handed to the grader (see check).
LANG = "de"

# Appended to the in-progress status messages shown while a CTA is running.
COLD_START_HINT = "First call after idle can take ~30-60s while backend warms up."
# Visual identity of the app. The base is Gradio's Soft theme with indigo/sky
# accents on slate neutrals, Inter as the primary font, and large radius/spacing.
# Nearly all of the look is expressed through theme variables (they are
# version-stable); the CSS string further down only covers what a theme cannot
# (phone framing, gradient title, status pill).
THEME = gr.themes.Soft(
    primary_hue=gr.themes.colors.indigo,
    secondary_hue=gr.themes.colors.sky,
    neutral_hue=gr.themes.colors.slate,
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    radius_size=gr.themes.sizes.radius_lg,
    spacing_size=gr.themes.sizes.spacing_lg,
    text_size=gr.themes.sizes.text_md,
)
# Token overrides layered on the base theme: soft gradient page, white cards
# with gentle shadows, rounded inputs, and a gradient primary button. The result
# of set() is what gets handed to launch(theme=...).
THEME = THEME.set(
    # Page
    body_background_fill="linear-gradient(160deg, #eef2fb 0%, #e9ecf7 45%, #ece9f7 100%)",
    body_background_fill_dark="linear-gradient(160deg, #0f1422 0%, #141b2d 100%)",
    body_text_color="#1e293b",
    body_text_color_subdued="#64748b",
    # Cards
    block_background_fill="#ffffff",
    block_background_fill_dark="#1a2234",
    block_border_width="0px",
    block_radius="18px",
    block_shadow="0 6px 24px rgba(30, 41, 59, 0.08)",
    block_shadow_dark="0 6px 24px rgba(0, 0, 0, 0.40)",
    block_padding="20px",
    layout_gap="14px",
    # Inputs
    input_background_fill="#f8fafc",
    input_background_fill_dark="#0f1626",
    input_radius="12px",
    # Buttons
    button_large_radius="12px",
    button_large_padding="11px 18px",
    button_primary_background_fill="linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%)",
    button_primary_background_fill_hover="linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
    button_primary_background_fill_dark="linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%)",
    button_primary_text_color="#ffffff",
    button_primary_shadow="0 4px 14px rgba(99, 102, 241, 0.35)",
    button_primary_shadow_hover="0 6px 18px rgba(99, 102, 241, 0.45)",
)
# Stylesheet passed to launch(css=...). It covers only what the theme cannot
# express: a narrow centered column (portrait-phone framing), the header row
# with logo and gradient app title, the centered intro paragraph, the per-step
# panel titles, and the centered status line that hosts the spinner icon.
# The selectors below correspond to the elem_classes / HTML classes used in this
# file ("intro", "panel-title", "status", "app-header", "app-logo", "app-title").
MOBILE_CSS = """
.gradio-container {
max-width: 480px !important;
margin: 0 auto !important;
padding: 12px 14px 28px !important;
}
.app-header {
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
margin: 12px 0 4px;
}
.app-logo {
height: 128px;
width: auto;
flex: 0 0 auto;
}
.app-title {
font-size: 1.6rem;
font-weight: 700;
background: linear-gradient(135deg, #6366f1, #8b5cf6);
-webkit-background-clip: text;
background-clip: text;
-webkit-text-fill-color: transparent;
}
.intro {
text-align: center;
color: #64748b;
font-size: 0.95rem;
line-height: 1.45;
margin: 0 6px 6px;
}
/* Step title: transparent (no clipped grey strip), larger, and padded so the
card's rounded corner never crops the leading "1 ·". */
.panel-title {
background: transparent !important;
box-shadow: none !important;
border: none !important;
padding: 6px 18px 2px !important;
}
.panel-title h3 {
font-size: 1.35rem !important;
font-weight: 700 !important;
color: #4f46e5 !important;
margin: 0 !important;
line-height: 1.3;
}
.status {
text-align: center;
font-weight: 600;
opacity: 0.9;
}
.status .fa-spinner {
margin-right: 6px;
}
"""
# FontAwesome (CDN) for the animated status spinner (fa-spinner + fa-spin). Loaded
# into <head> at launch (launch(head=FA_HEAD)); the icon itself is rendered as raw
# HTML in the status fields (see _busy), which is why those Markdowns set
# sanitize_html=False.
# NOTE(review): the markup inside both literals was missing from the reviewed
# copy of this file (empty strings, plus a comment line that had lost its `#`).
# It is reconstructed here from the comment above and the `.status .fa-spinner`
# CSS rule — confirm the CDN URL/version against the deployed Space.
FA_HEAD = (
    '<link rel="stylesheet" '
    'href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css">'
)
# Inline icon prepended to every in-progress status message.
SPINNER = '<i class="fa-solid fa-spinner fa-spin"></i>'
# Absolute path of the bundled logo, resolved relative to this file rather than
# the working directory. Used for the in-app header (embedded as base64) and as
# the PWA / favicon icon.
LOGO_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "assets",
    "ankira_blue.png",
)
def _logo_data_uri() -> str:
    """Return the logo at LOGO_PATH as a ``data:image/png;base64,...`` URI.

    Embedding the bytes means the header needs no Gradio file-serving
    allowlist. If the file cannot be opened or read (OSError) the result is
    an empty string, and the header falls back to the ✍️ emoji.
    """
    try:
        with open(LOGO_PATH, "rb") as logo_file:
            raw = logo_file.read()
    except OSError:
        return ""
    else:
        encoded = base64.b64encode(raw).decode("ascii")
        return f"data:image/png;base64,{encoded}"
# Header markup, built once at import time. The logo is inlined as a data URI;
# when the file is missing (_logo_data_uri returned "") an emoji stands in.
# NOTE(review): the HTML tags inside these f-strings were missing from the
# reviewed copy of this file (the literals were split across lines and left
# unterminated). They are reconstructed from the .app-header / .app-logo /
# .app-title rules in MOBILE_CSS — confirm element names and the alt text.
_LOGO_URI = _logo_data_uri()
_LOGO_TAG = (
    f'<img class="app-logo" src="{_LOGO_URI}" alt="Ankira logo">' if _LOGO_URI else "✍️"
)
HEADER_HTML = (
    f'<div class="app-header">{_LOGO_TAG}'
    f'<span class="app-title">Ankira: German Dictation Trainer</span></div>'
)
def _require_env(name: str) -> str:
val = os.environ.get(name)
if not val:
raise gr.Error(f"{name} is not configured (set it in the Space secrets).")
return val
def _tts_base_url() -> str:
    """Return the TTS server root derived from MODAL_TTS_URL.

    The variable may hold either the server root or the full speech path.
    Trailing slashes are dropped, then the first matching suffix out of
    ``/v1/audio/speech`` and ``/audio/speech`` is cut off, so the client can
    append /v1/audio/speech itself. Raises gr.Error (via _require_env) when the
    variable is not set.
    """
    root = _require_env("MODAL_TTS_URL").rstrip("/")
    # Longest suffix first: the full path also ends with the shorter one.
    known_tails = ("/v1/audio/speech", "/audio/speech")
    tail = next((t for t in known_tails if root.endswith(t)), "")
    return root[: len(root) - len(tail)]
def call_llm(words: list[str], level: str) -> str:
    """Word list + CEFR level -> one German dictation paragraph (cleaned).

    Sends the system prompt plus a user prompt built from *words* and *level*
    to the endpoint at MODAL_LLM_URL, with the fixed sampling settings passed
    through extra_body. The first choice's message content is run through
    clean_dictation and returned.

    The result is falsy when the model produced no usable content; in that case
    the finish reason, the raw content and a truncated dump of the response are
    logged so the caller's "empty dictation" error can be diagnosed.

    Raises gr.Error (via _require_env) when MODAL_LLM_URL is not configured.
    """
    client = make_client(_require_env("MODAL_LLM_URL"))
    completion = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[
            {"role": "system", "content": DICTATION_SYSTEM_PROMPT},
            {"role": "user", "content": build_user_prompt(words, level)},
        ],
        max_tokens=LLM_MAX_TOKENS,
        extra_body=LLM_SAMPLING,
    )
    data = completion.model_dump()
    # `choices` may be absent, null, or an empty list (e.g. an error payload
    # returned with HTTP 200). Treat all three as "no content" so the failure
    # surfaces through the logged empty-dictation path below instead of an
    # IndexError/TypeError here.
    choices = data.get("choices") or [{}]
    choice = choices[0] or {}
    content = (choice.get("message") or {}).get("content")
    text = clean_dictation(content)
    if not text:
        # Evidence at the LLM boundary: see exactly what came back when empty.
        logger.error(
            "Empty dictation from LLM. finish_reason={} raw_content={!r}\nfull response: {}",
            choice.get("finish_reason"),
            content,
            json.dumps(data, ensure_ascii=False)[:2000],
        )
    return text
def call_tts(text: str) -> str:
    """Synthesize *text* and return the path of a temp file holding the audio.

    The file suffix follows the response Content-Type (``.mp3`` when it
    mentions "mpeg", ``.wav`` otherwise) so gr.Audio plays it without
    transcoding. The file is created with delete=False and is not removed here.
    """
    speech = make_client(_tts_base_url()).audio.speech.create(
        model=TTS_MODEL,
        voice=TTS_VOICE,
        input=text,
    )
    payload = speech.read()
    mime = speech.response.headers.get("content-type", "")
    if "mpeg" in mime:
        extension = ".mp3"
    else:
        extension = ".wav"
    out = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
    with out:
        out.write(payload)
    return out.name
# ---- Step handlers ---------------------------------------------------------
#
# CTA handlers do their work, then append nav(...) so the wizard advances only on
# success; raising gr.Error leaves every output (including the views) untouched,
# so the user stays on the current step to fix the problem.
def start(words_raw: str, level: str, state: dict):
    """Input → Listen: generate the dictation text + audio.

    Parses the raw word list, asks the LLM for a dictation, stores it in a
    fresh state dict (the incoming *state* is replaced, not merged), then tries
    to synthesize audio.

    Returns, in the order of the `outputs` list wired in build_ui: the audio
    file path (None if synthesis failed), the dictation text, the new state,
    the view updates from nav("listen"), and an update hiding the intro.

    Raises gr.Error when no words were entered or the model returned an empty
    dictation; the wizard then stays on the current step.
    """
    words = parse_word_list(words_raw)
    if not words:
        raise gr.Error("Enter at least one word to practice.")
    text = call_llm(words, level)
    if not text:
        logger.error("LLM returned an empty dictation ({} words, {})", len(words), level)
        raise gr.Error("The model returned an empty dictation. Please try again.")
    logger.info("Generated dictation ({} words, {}):\n{}", len(words), level, text)
    state = {"diktat": text, "created_at": time.time()}
    try:
        audio_path = call_tts(text)
    except Exception as e:  # never lose the text because synthesis failed (spec §4)
        # Keep a server-side traceback: the toast below is the only other trace
        # of the failure and it disappears with the browser session.
        logger.exception("Audio synthesis failed; continuing with text only")
        gr.Warning(f"Audio synthesis failed ({e}). Text saved — open 'Show text'.")
        audio_path = None
    # trailing update hides the first-step intro (see the outputs list in build_ui).
    return audio_path, text, state, *nav("listen"), gr.update(visible=False)
def check(image_path: str, state: dict):
    """Upload → Results: transcribe the photo (blind) and grade it.

    The reference dictation comes from ``state["diktat"]``; the photo is
    transcribed through the OCR endpoint at MODAL_OCR_URL and the transcription
    is graded against the reference.

    Returns the transcription, the rendered HTML report, and the view updates
    from nav("results"). Raises gr.Error when there is no dictation in *state*
    or no image was supplied.
    """
    reference = state.get("diktat") if state else None
    if not reference:
        raise gr.Error("Generate a dictation first.")
    if not image_path:
        raise gr.Error("Upload a photo of your handwriting first.")

    client = make_client(_require_env("MODAL_OCR_URL"))
    recognized = transcribe_image(image_path, client)
    logger.info("OCR transcription:\n{}", recognized)

    feedback_html = render_report_html(grade(reference, recognized, LANG))
    return recognized, feedback_html, *nav("results")
def restart():
    """Results → Input: clear everything and start a fresh dictation.

    The returned tuple lines up with the `outputs` list on the Start-over
    button: six cleared component values, a blank state dict, the view updates
    from nav("input"), and a final update that re-shows the first-step intro.
    """
    blank_state = {"diktat": "", "created_at": 0}
    # words_in, audio_out, text_out, image_in, recognized_out, diff_out
    cleared = ("", None, "", None, "", "")
    return (*cleared, blank_state, *nav("input"), gr.update(visible=True))
# Inline progress: the CTA's real outputs all live on the next (hidden) view, so
# Gradio's spinner would paint where the user can't see it. Instead each CTA runs
# as show-status -> work -> hide-status; the hide step uses .then so it fires even
# when the work raises (no spinner left stuck on screen).
def _busy(message: str):
    """Update that reveals a status field showing the spinner icon + *message*."""
    label = f"{SPINNER} {message}"
    return gr.update(value=label, visible=True)
def _idle():
    """Update that hides a status field again."""
    hidden = gr.update(visible=False)
    return hidden
def _begin(message: str):
    """First link of a CTA chain: returns (status update, button update).

    The status shows the spinner with *message*; the button is made
    non-interactive so the CTA can't be fired again while the call runs.
    """
    status = _busy(message)
    locked_button = gr.update(interactive=False)
    return status, locked_button
def _end():
    """Last link of a CTA chain (wired via .then): returns (status update,
    button update) that hide the spinner and make the button interactive again."""
    status = _idle()
    unlocked_button = gr.update(interactive=True)
    return status, unlocked_button
def _recover(*_):
    """Failure-path cleanup (wired via .failure): hide the spinner and make the
    button interactive again. Any positional arguments are accepted and ignored.

    The original rationale: a raised gr.Error aborts the chained .then, so
    without this hook a validation error such as 'no photo uploaded' would
    leave the button stuck disabled.
    NOTE(review): Gradio documents .then() as running whether or not the
    previous event failed — confirm whether this hook is still needed or is
    simply a harmless duplicate of the .then cleanup.
    """
    status = _idle()
    unlocked_button = gr.update(interactive=True)
    return status, unlocked_button
def goto(target: str):
    """Show the *target* view and reset transient UI state.

    Besides the view updates from nav(target), both status fields are hidden
    and both CTA buttons are made interactive again, so leaving a view during a
    wait never strands a spinner or a disabled button there. The intro is
    visible only when the target is the first step ("input"). The tuple order
    follows the nav_outputs list wired in build_ui.
    """
    view_updates = nav(target)
    on_first_step = target == "input"
    transient_reset = (
        gr.update(visible=False),  # input_status
        gr.update(visible=False),  # upload_status
        gr.update(interactive=True),  # start_btn
        gr.update(interactive=True),  # check_btn
        gr.update(visible=on_first_step),  # intro (first step only)
    )
    return (*view_updates, *transient_reset)
# ---- UI --------------------------------------------------------------------
def build_ui() -> gr.Blocks:
    """Assemble the four-step wizard and return the (not yet launched) Blocks app.

    Layout, top to bottom: header, intro paragraph, then four stacked groups
    (input, listen, upload, results) of which only the first starts visible.
    After the layout come the event chains: the two CTA buttons (Generate,
    Check), the Start-over button, and the Finished/Back navigation buttons.
    Theme, CSS and head markup are not applied here; they are passed to
    launch() by the caller.
    """
    with gr.Blocks(title="Dictation Trainer") as demo:
        # localStorage-backed: survives a tab reload while the learner writes (spec §3).
        state = gr.BrowserState({"diktat": "", "created_at": 0})
        gr.HTML(HEADER_HTML)
        # Shown on the first step only (toggled by start/goto/restart below).
        intro = gr.Markdown(
            "Practice German spelling by ear. Enter a few words, get a short "
            "dictation read aloud, and write it down by hand. Finally photograph "
            "your page for instant word-by-word feedback.",
            elem_classes="intro",
        )
        # Four stacked views; nav() keeps exactly one visible. Order must match
        # wizard.VIEWS.
        # Step 1 — word list + level, with the Generate CTA and its status line.
        with gr.Group(visible=True) as view_input:
            gr.Markdown("### Let's start!", elem_classes="panel-title")
            words_in = gr.Textbox(
                label="Words to practice",
                placeholder="Comma- or newline-separated, e.g. angeblich, ablehnen, Apfel",
                lines=4,
            )
            level_in = gr.Dropdown(["A1", "A2", "B1", "B2"], value="A2", label="Level")
            start_btn = gr.Button("Generate", variant="primary")
            # sanitize_html=False so the raw spinner markup from _busy is rendered.
            input_status = gr.Markdown(visible=False, sanitize_html=False, elem_classes="status")
        # Step 2 — audio playback; the dictation text sits in a collapsed accordion.
        with gr.Group(visible=False) as view_listen:
            gr.Markdown("### Listen", elem_classes="panel-title")
            audio_out = gr.Audio(
                type="filepath", interactive=False, label="Dictation audio"
            )
            gr.Markdown("🎧 Listen and write it down on paper, then click **Finished**.")
            with gr.Accordion("Show text (debug)", open=False):
                text_out = gr.Textbox(label="Dictation text", interactive=False, lines=4)
            with gr.Row():
                listen_back_btn = gr.Button("Back")
                finished_btn = gr.Button("Finished", variant="primary")
        # Step 3 — photo of the handwriting, with the Check CTA and its status line.
        with gr.Group(visible=False) as view_upload:
            gr.Markdown("### Upload", elem_classes="panel-title")
            image_in = gr.Image(
                type="filepath",
                sources=["upload", "webcam", "clipboard"],
                label="Photo of your handwriting",
            )
            with gr.Row():
                upload_back_btn = gr.Button("Back")
                check_btn = gr.Button("Check", variant="primary")
            # sanitize_html=False so the raw spinner markup from _busy is rendered.
            upload_status = gr.Markdown(visible=False, sanitize_html=False, elem_classes="status")
        # Step 4 — OCR transcription and the rendered feedback report.
        with gr.Group(visible=False) as view_results:
            gr.Markdown("### Results", elem_classes="panel-title")
            recognized_out = gr.Textbox(
                label="Recognized text (OCR)", interactive=False, lines=4
            )
            diff_out = gr.HTML(label="Feedback")
            with gr.Row():
                results_back_btn = gr.Button("Back")
                restart_btn = gr.Button("Start over", variant="primary")
        # Unpacked into every outputs list that receives the updates from nav().
        views = [view_input, view_listen, view_upload, view_results]
        # CTA handlers: disable the button + show status, do the work (advance, or
        # raise gr.Error and stay). On success .then clears + re-enables; on error
        # .failure does the same (the raise aborts .then), so the button is never
        # left stuck disabled.
        # NOTE(review): Gradio documents .then() as running regardless of whether
        # the previous event failed, which would make the .failure hook a harmless
        # duplicate rather than a requirement — confirm against the pinned version.
        start_work = start_btn.click(
            lambda: _begin(f"Generating dictation… {COLD_START_HINT}"),
            outputs=[input_status, start_btn],
            show_progress="hidden",
        ).then(
            start,
            inputs=[words_in, level_in, state],
            outputs=[audio_out, text_out, state, *views, intro],
            # Our _begin spinner is the indicator; suppress Gradio's own overlay,
            # which would otherwise cover the visible card (and our spinner).
            show_progress="hidden",
        )
        start_work.then(_end, outputs=[input_status, start_btn], show_progress="hidden")
        start_work.failure(_recover, outputs=[input_status, start_btn], show_progress="hidden")
        check_work = check_btn.click(
            lambda: _begin(f"Reading your handwriting… {COLD_START_HINT}"),
            outputs=[upload_status, check_btn],
            show_progress="hidden",
        ).then(
            check,
            inputs=[image_in, state],
            outputs=[recognized_out, diff_out, *views],
            # Keep our _begin spinner visible; suppress Gradio's overlay (it would
            # cover the upload card and hide the spinner).
            show_progress="hidden",
        )
        check_work.then(_end, outputs=[upload_status, check_btn], show_progress="hidden")
        check_work.failure(_recover, outputs=[upload_status, check_btn], show_progress="hidden")
        # Start over: the outputs order here must match the tuple returned by restart().
        restart_btn.click(
            restart,
            outputs=[words_in, audio_out, text_out, image_in, recognized_out, diff_out, state, *views, intro],
        )
        # Navigation (Finished + Back buttons): switch view AND clear any leftover
        # spinner / disabled CTA from an action on the view being left. `cancels`
        # aborts an in-flight Start/Check so a call the user navigated away from
        # can't complete and yank them forward (its outputs are discarded).
        # The outputs order here must match the tuple returned by goto().
        nav_outputs = [*views, input_status, upload_status, start_btn, check_btn, intro]
        in_flight = [start_work, check_work]
        finished_btn.click(lambda: goto("upload"), outputs=nav_outputs, cancels=in_flight)
        listen_back_btn.click(lambda: goto("input"), outputs=nav_outputs, cancels=in_flight)
        upload_back_btn.click(lambda: goto("listen"), outputs=nav_outputs, cancels=in_flight)
        results_back_btn.click(lambda: goto("upload"), outputs=nav_outputs, cancels=in_flight)
    return demo
if __name__ == "__main__":
    # theme, css and head all live on launch() in Gradio 6 (moved off Blocks).
    app = build_ui()
    # The favicon doubles as the home-screen icon; skip it when the logo file
    # is not present.
    icon_path = LOGO_PATH if os.path.exists(LOGO_PATH) else None
    # On HF Spaces (SPACE_ID set) the runtime serves the app, so no share tunnel
    # is requested there; locally the share link makes the app reachable from a
    # phone.
    running_on_spaces = "SPACE_ID" in os.environ
    app.launch(
        theme=THEME,
        css=MOBILE_CSS,
        head=FA_HEAD,
        # Installable app: Gradio generates the manifest + service worker.
        pwa=True,
        favicon_path=icon_path,
        share=not running_on_spaces,
    )