ankira

Running

App Files Files Community

ankira / app.py

nofater

Add video link

2c84660 verified 19 days ago

Raw

History Blame Contribute Delete

19.3 kB

	#!/usr/bin/env python3
	"""Dictation Trainer — Gradio Space (specs/gradio.md).

	A thin, mobile-first wrapper over three Modal endpoints. No models run here:
	every inference is an HTTP call out to Modal.

	Generate: word list + level --LLM--> German dictation --TTS--> audio
	Check: photo of handwriting --OCR(blind)--> text --grade--> diff + score

	Run locally from this directory (the Space root):
	MODAL_LLM_URL=... MODAL_TTS_URL=... MODAL_OCR_URL=... uv run python app.py
	"""

	import base64
	import json
	import os
	import tempfile
	import time

	import gradio as gr
	from loguru import logger

	# Flat imports: this file is the Space entrypoint, run with space/ as the root.
	from diff_html import render_report_html
	from ocr.grading import grade
	from ocr.transcribe import transcribe_image
	from openai_client import make_client
	from prompts import (
	DICTATION_SYSTEM_PROMPT,
	build_user_prompt,
	clean_dictation,
	parse_word_list,
	)
	from wizard import nav

	# Model identifier sent as `model` in the chat-completions request (call_llm).
	LLM_MODEL = "LiquidAI/LFM2.5-8B-A1B-GGUF"
	# Higgs ignores the model field today, but the OpenAI SDK requires one; keep it
	# descriptive in case the server starts validating it.
	TTS_MODEL = "bosonai/higgs-audio-v3-tts-4b"
	# Voice name passed as `voice` in the speech request (call_tts).
	TTS_VOICE = "alba"
	# Sampling fixed per spec §6 — deterministic, repeatable dictations. top_k and
	# repeat_penalty aren't OpenAI-standard params, so they ride in extra_body.
	# Note: call_llm forwards this whole dict (temperature and top_p included) as
	# extra_body, not as named keyword arguments.
	LLM_SAMPLING = {
	    "temperature": 0.1,
	    "top_p": 0.1,
	    "top_k": 50,
	    "repeat_penalty": 1.05,
	}
	# LFM2.5 is a reasoning model: it emits a chain-of-thought into `reasoning_content`
	# BEFORE the answer goes into `content`. The token budget must cover BOTH, or the
	# reasoning consumes it all and `content` comes back empty (finish_reason=length).
	# The verbose system prompt lengthens the reasoning, so keep this well above the
	# ~900 tokens of CoT we observed. See space/debug_llm.py.
	LLM_MAX_TOKENS = 2048
	# Language code handed to grade() when scoring the OCR transcription (check).
	LANG = "de"

	# Appended to the spinner message shown while Generate / Check are running.
	COLD_START_HINT = "First call after idle can take ~30-60s while backend warms up."

	# Calm, modern theme: indigo/violet accents on slate neutrals, a soft gradient
	# page, white cards with gentle shadows, roomy radius + spacing. Most of the look
	# lives here (theme variables are version-stable); CSS below only does the things
	# themes can't (phone framing, gradient title, status pill).
	# Passed to launch(theme=...) in the __main__ block.
	THEME = gr.themes.Soft(
	    primary_hue=gr.themes.colors.indigo,
	    secondary_hue=gr.themes.colors.sky,
	    neutral_hue=gr.themes.colors.slate,
	    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
	    radius_size=gr.themes.sizes.radius_lg,
	    spacing_size=gr.themes.sizes.spacing_lg,
	    text_size=gr.themes.sizes.text_md,
	).set(
	    # Page background and body text.
	    body_background_fill="linear-gradient(160deg, #eef2fb 0%, #e9ecf7 45%, #ece9f7 100%)",
	    body_background_fill_dark="linear-gradient(160deg, #0f1422 0%, #141b2d 100%)",
	    body_text_color="#1e293b",
	    body_text_color_subdued="#64748b",
	    # Blocks (the "cards"): fill, border, radius, shadow, padding.
	    block_background_fill="#ffffff",
	    block_background_fill_dark="#1a2234",
	    block_border_width="0px",
	    block_radius="18px",
	    block_shadow="0 6px 24px rgba(30, 41, 59, 0.08)",
	    block_shadow_dark="0 6px 24px rgba(0, 0, 0, 0.40)",
	    block_padding="20px",
	    layout_gap="14px",
	    # Inputs.
	    input_background_fill="#f8fafc",
	    input_background_fill_dark="#0f1626",
	    input_radius="12px",
	    # Buttons; the primary button carries the indigo -> violet gradient.
	    button_large_radius="12px",
	    button_large_padding="11px 18px",
	    button_primary_background_fill="linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%)",
	    button_primary_background_fill_hover="linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
	    button_primary_background_fill_dark="linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%)",
	    button_primary_text_color="#ffffff",
	    button_primary_shadow="0 4px 14px rgba(99, 102, 241, 0.35)",
	    button_primary_shadow_hover="0 6px 18px rgba(99, 102, 241, 0.45)",
	)

	# Portrait-phone framing + the bits the theme can't express: a narrow centered
	# column, a gradient app title, and the centered status (spinner) pill.
	# Where the classes are used: .app-header / .app-logo / .app-title in
	# HEADER_HTML; .intro, .panel-title and .status via elem_classes in build_ui.
	# Passed to launch(css=...) in the __main__ block.
	MOBILE_CSS = """
	.gradio-container {
	max-width: 480px !important;
	margin: 0 auto !important;
	padding: 12px 14px 28px !important;
	}
	.app-header {
	display: flex;
	align-items: center;
	justify-content: center;
	gap: 12px;
	margin: 12px 0 4px;
	}
	.app-logo {
	height: 128px;
	width: auto;
	flex: 0 0 auto;
	}
	.app-title {
	font-size: 1.6rem;
	font-weight: 700;
	background: linear-gradient(135deg, #6366f1, #8b5cf6);
	-webkit-background-clip: text;
	background-clip: text;
	-webkit-text-fill-color: transparent;
	}
	.intro {
	text-align: center;
	color: #64748b;
	font-size: 0.95rem;
	line-height: 1.45;
	margin: 0 6px 6px;
	}
	/* Step title: transparent (no clipped grey strip), larger, and padded so the
	card's rounded corner never crops the leading "1 ·". */
	.panel-title {
	background: transparent !important;
	box-shadow: none !important;
	border: none !important;
	padding: 6px 18px 2px !important;
	}
	.panel-title h3 {
	font-size: 1.35rem !important;
	font-weight: 700 !important;
	color: #4f46e5 !important;
	margin: 0 !important;
	line-height: 1.3;
	}
	.status {
	text-align: center;
	font-weight: 600;
	opacity: 0.9;
	}
	.status .fa-spinner {
	margin-right: 6px;
	}
	"""

	# FontAwesome (CDN) for the animated status spinner (fa-spinner + fa-spin). Loaded
	# into <head> at launch; the icon itself is rendered as raw HTML in the status
	# fields (see _busy), which is why those Markdowns set sanitize_html=False.
	# FA_HEAD is passed to launch(head=...) in the __main__ block.
	FA_HEAD = (
	    '<link rel="stylesheet" '
	    'href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/css/all.min.css">'
	)
	# Icon markup that _busy() prepends to every status message.
	SPINNER = '<i class="fa-solid fa-spinner fa-spin"></i>'


	# Shared by the in-app header (embedded base64) and the PWA / favicon icon.
	# Absolute path to assets/ankira_blue.png, resolved relative to this file rather
	# than the working directory. Read by _logo_data_uri() and, when the file
	# exists, passed as favicon_path at launch.
	LOGO_PATH = os.path.join(
	    os.path.dirname(os.path.abspath(__file__)), "assets", "ankira_blue.png"
	)


	def _logo_data_uri() -> str:
	    """Return the logo at LOGO_PATH as a ``data:image/png;base64,...`` URI.

	    Inlining the image means the header does not depend on Gradio's
	    file-serving allowlist. If the file cannot be opened or read, return an
	    empty string; the header then falls back to the ✍️ emoji.
	    """
	    try:
	        with open(LOGO_PATH, "rb") as logo_file:
	            raw = logo_file.read()
	    except OSError:
	        return ""
	    encoded = base64.b64encode(raw).decode("ascii")
	    return "data:image/png;base64," + encoded


	# Computed once at import time; empty string when the logo file is unreadable.
	_LOGO_URI = _logo_data_uri()
	# <img> with the inlined logo when available, otherwise the ✍️ emoji fallback.
	_LOGO_TAG = (
	    f'<img class="app-logo" alt="Ankira" src="{_LOGO_URI}">' if _LOGO_URI else "✍️"
	)
	# Header markup (logo + gradient title) rendered via gr.HTML in build_ui; the
	# .app-header / .app-logo / .app-title classes are styled in MOBILE_CSS.
	HEADER_HTML = (
	    f'<div class="app-header">{_LOGO_TAG}'
	    f'<span class="app-title">Ankira: German Dictation Trainer</span></div>'
	)


	def _require_env(name: str) -> str:
	    """Return the value of environment variable *name*.

	    Raises gr.Error (surfaced to the user by Gradio) when the variable is unset
	    or empty, pointing at the Space secrets as the place to configure it.
	    """
	    value = os.environ.get(name)
	    if value:
	        return value
	    raise gr.Error(f"{name} is not configured (set it in the Space secrets).")


	def _tts_base_url() -> str:
	    """Return the TTS server root derived from MODAL_TTS_URL.

	    The variable may hold either the server root or the full speech path.
	    Trailing slashes are dropped, then a trailing ``/v1/audio/speech`` or
	    ``/audio/speech`` is removed, so the client can append the speech path
	    itself. Raises gr.Error (via _require_env) when the variable is missing.
	    """
	    root = _require_env("MODAL_TTS_URL").rstrip("/")
	    # Longest suffix first, so a full ".../v1/audio/speech" loses its "/v1" too.
	    speech_paths = ("/v1/audio/speech", "/audio/speech")
	    matched = next((path for path in speech_paths if root.endswith(path)), None)
	    if matched is None:
	        return root
	    return root[: -len(matched)]


	def call_llm(words: list[str], level: str) -> str:
	    """Word list + CEFR level -> one German dictation paragraph (cleaned).

	    Sends DICTATION_SYSTEM_PROMPT plus build_user_prompt(words, level) to the
	    endpoint named by MODAL_LLM_URL, using the fixed LLM_SAMPLING and
	    LLM_MAX_TOKENS settings, and runs the first choice's message content
	    through clean_dictation.

	    Returns the cleaned text. A falsy result means the model produced no usable
	    answer; in that case the finish reason, raw content and a truncated dump of
	    the response are logged here, and the caller decides how to report it.
	    Raises gr.Error (via _require_env) when MODAL_LLM_URL is not configured.
	    """
	    client = make_client(_require_env("MODAL_LLM_URL"))
	    completion = client.chat.completions.create(
	        model=LLM_MODEL,
	        messages=[
	            {"role": "system", "content": DICTATION_SYSTEM_PROMPT},
	            {"role": "user", "content": build_user_prompt(words, level)},
	        ],
	        max_tokens=LLM_MAX_TOKENS,
	        extra_body=LLM_SAMPLING,
	    )
	    data = completion.model_dump()
	    # `choices` can be missing, None or an empty list. A plain
	    # `.get("choices", [{}])[0]` only covers the missing key and raises on the
	    # other two, which would skip the diagnostic logging below.
	    choices = data.get("choices") or [{}]
	    choice = choices[0] or {}
	    content = (choice.get("message") or {}).get("content")
	    # NOTE(review): content may be None here, as it already could be before this
	    # change — presumably clean_dictation tolerates that; confirm.
	    text = clean_dictation(content)
	    if not text:
	        # Evidence at the LLM boundary: see exactly what came back when empty.
	        logger.error(
	            "Empty dictation from LLM. finish_reason={} raw_content={!r}\nfull response: {}",
	            choice.get("finish_reason"),
	            content,
	            json.dumps(data, ensure_ascii=False)[:2000],
	        )
	    return text


	def call_tts(text: str) -> str:
	    """Synthesize *text* to speech and return the path of a temp audio file.

	    The file suffix is chosen from the response Content-Type (".mp3" when it
	    mentions "mpeg", otherwise ".wav") so gr.Audio can play it without
	    transcoding.
	    """
	    speech = make_client(_tts_base_url()).audio.speech.create(
	        model=TTS_MODEL, voice=TTS_VOICE, input=text
	    )
	    payload = speech.read()
	    mime = speech.response.headers.get("content-type", "")
	    if "mpeg" in mime:
	        extension = ".mp3"
	    else:
	        extension = ".wav"
	    # delete=False: the path is handed back to the caller, so the file must
	    # outlive this handle.
	    out = tempfile.NamedTemporaryFile(suffix=extension, delete=False)
	    with out:
	        out.write(payload)
	    return out.name


	# ---- Step handlers ---------------------------------------------------------
	#
	# CTA handlers do their work, then append nav(...) so the wizard advances only on
	# success; raising gr.Error leaves every output (including the views) untouched,
	# so the user stays on the current step to fix the problem.


	def start(words_raw: str, level: str, state: dict):
	    """Input → Listen: generate the dictation text + audio.

	    Parses the word list, asks the LLM for a dictation, then synthesizes it.
	    The incoming *state* is not read: a fresh state dict replaces it.

	    Returns, in the order of the outputs wired in build_ui: audio path (None
	    if synthesis failed), dictation text, new state, the view updates from
	    nav("listen"), and an update hiding the first-step intro.

	    Raises gr.Error when no words were entered or the model returned an empty
	    dictation. A TTS failure is NOT raised: the text is kept, the user gets a
	    warning, and the failure is logged.
	    """
	    words = parse_word_list(words_raw)
	    if not words:
	        raise gr.Error("Enter at least one word to practice.")

	    text = call_llm(words, level)
	    if not text:
	        logger.error("LLM returned an empty dictation ({} words, {})", len(words), level)
	        raise gr.Error("The model returned an empty dictation. Please try again.")
	    logger.info("Generated dictation ({} words, {}):\n{}", len(words), level, text)
	    state = {"diktat": text, "created_at": time.time()}

	    try:
	        audio_path = call_tts(text)
	    except Exception as e:  # never lose the text because synthesis failed (spec §4)
	        # The toast below is transient and client-side only; log the traceback
	        # so the failure is also visible in the server logs.
	        logger.exception("Audio synthesis failed; continuing with text only")
	        gr.Warning(f"Audio synthesis failed ({e}). Text saved — open 'Show text'.")
	        audio_path = None

	    # trailing update hides the first-step intro (see the outputs list in build_ui).
	    return audio_path, text, state, *nav("listen"), gr.update(visible=False)


	def check(image_path: str, state: dict):
	    """Upload → Results: transcribe the photo (blind) and grade it.

	    Returns the OCR transcription, the rendered feedback HTML, and the view
	    updates from nav("results"). Raises gr.Error when no dictation has been
	    generated yet, no photo was supplied, or MODAL_OCR_URL is not configured.
	    """
	    reference = state.get("diktat") if state else None
	    if not reference:
	        raise gr.Error("Generate a dictation first.")
	    if not image_path:
	        raise gr.Error("Upload a photo of your handwriting first.")

	    endpoint = _require_env("MODAL_OCR_URL")
	    recognized = transcribe_image(image_path, make_client(endpoint))
	    logger.info("OCR transcription:\n{}", recognized)

	    graded = grade(reference, recognized, LANG)
	    feedback_html = render_report_html(graded)
	    return (recognized, feedback_html, *nav("results"))


	def restart():
	    """Results → Input: wipe every field and go back to the first step.

	    The returned tuple follows the `outputs` list on the Start-over button in
	    build_ui: six cleared components, a fresh state, the view updates from
	    nav("input"), and an update that re-shows the first-step intro.
	    """
	    # words_in, audio_out, text_out, image_in, recognized_out, diff_out
	    cleared_fields = ("", None, "", None, "", "")
	    blank_state = {"diktat": "", "created_at": 0}
	    return (*cleared_fields, blank_state, *nav("input"), gr.update(visible=True))


	# Inline progress: the CTA's real outputs all live on the next (hidden) view, so
	# Gradio's spinner would paint where the user can't see it. Instead each CTA runs
	# as show-status -> work -> hide-status; the hide step is wired on both .then
	# (_end) and .failure (_recover) of the work event, so the spinner is cleared and
	# the button re-enabled whether the work succeeds or raises.


	def _busy(message: str):
	    """Update that shows a status field with the spinner icon before *message*."""
	    label = f"{SPINNER} {message}"
	    return gr.update(value=label, visible=True)


	def _idle():
	    """Update that hides a status field."""
	    hidden = gr.update(visible=False)
	    return hidden


	def _begin(message: str):
	    """First step of a CTA chain: returns (status update, button update).

	    The status shows the spinner with *message*; the button is made
	    non-interactive so it can't be fired again while the call is running.
	    """
	    status_update = _busy(message)
	    button_update = gr.update(interactive=False)
	    return status_update, button_update


	def _end():
	    """Last step of a CTA chain (.then path): returns (status update, button update).

	    Hides the spinner and makes the button interactive again.
	    """
	    status_update = _idle()
	    button_update = gr.update(interactive=True)
	    return status_update, button_update


	def _recover(*_):
	    """Failure-path twin of _end, wired via .failure in build_ui.

	    Any positional arguments are accepted and ignored (.failure passes the
	    exception along). Returns the same pair as _end — spinner hidden, button
	    re-enabled — so a validation error such as 'no photo uploaded' doesn't
	    leave the button stuck disabled.
	    """
	    status_update = _idle()
	    button_update = gr.update(interactive=True)
	    return status_update, button_update


	def goto(target: str):
	    """Show the *target* view and reset transient UI state.

	    Besides the view updates from nav(target), both status spinners are hidden
	    and both CTAs re-enabled, so leaving a view mid-wait never strands a stale
	    spinner or a disabled button. The intro is visible only on the "input"
	    step. The tuple order matches the nav_outputs list wired in build_ui.
	    """
	    view_updates = nav(target)
	    input_status_update = gr.update(visible=False)
	    upload_status_update = gr.update(visible=False)
	    start_btn_update = gr.update(interactive=True)
	    check_btn_update = gr.update(interactive=True)
	    intro_update = gr.update(visible=target == "input")  # first step only
	    return (
	        *view_updates,
	        input_status_update,
	        upload_status_update,
	        start_btn_update,
	        check_btn_update,
	        intro_update,
	    )


	# ---- UI --------------------------------------------------------------------


	def build_ui() -> gr.Blocks:
	    """Assemble the wizard UI, wire its events, and return the (unlaunched) Blocks.

	    Layout: the header HTML, an intro paragraph, and four gr.Group views
	    (input, listen, upload, results); only the input view starts visible.

	    Events:
	    - Generate and Check each run as a chain: _begin (spinner + disable the
	      button) -> the handler (start / check) -> _end on .then and _recover on
	      .failure.
	    - Start over calls restart, which resets fields, state, views and intro.
	    - Finished and the three Back buttons call goto(...) and pass
	      cancels=[start_work, check_work].
	    """
	    with gr.Blocks(title="Dictation Trainer") as demo:
	        # localStorage-backed: survives a tab reload while the learner writes (spec §3).
	        state = gr.BrowserState({"diktat": "", "created_at": 0})

	        gr.HTML(HEADER_HTML)

	        # Shown on the first step only (toggled by start/goto/restart below).
	        intro = gr.Markdown(
	            "Practice German spelling by ear. Enter a few words, get a short "
	            "dictation read aloud, and write it down by hand. Finally photograph "
	            "your page for instant word-by-word feedback.",
	            elem_classes="intro",
	        )

	        # Four stacked views; nav() keeps exactly one visible. Order must match
	        # wizard.VIEWS.
	        with gr.Group(visible=True) as view_input:
	            gr.Markdown("### Let's start!", elem_classes="panel-title")

	            words_in = gr.Textbox(
	                label="Words to practice",
	                placeholder="Comma- or newline-separated, e.g. angeblich, ablehnen, Apfel",
	                lines=4,
	            )
	            level_in = gr.Dropdown(["A1", "A2", "B1", "B2"], value="A2", label="Level")
	            start_btn = gr.Button("Generate", variant="primary")
	            # Hidden until _begin shows it; sanitize_html=False so the spinner
	            # <i> tag in the message is rendered as HTML.
	            input_status = gr.Markdown(visible=False, sanitize_html=False, elem_classes="status")

	        with gr.Group(visible=False) as view_listen:
	            gr.Markdown("### Listen", elem_classes="panel-title")
	            audio_out = gr.Audio(
	                type="filepath", interactive=False, label="Dictation audio"
	            )
	            gr.Markdown("🎧 Listen and write it down on paper, then click Finished.")
	            # Collapsed by default; also where the text stays reachable when
	            # audio synthesis fails (see the warning raised in start).
	            with gr.Accordion("Show text (debug)", open=False):
	                text_out = gr.Textbox(label="Dictation text", interactive=False, lines=4)
	            with gr.Row():
	                listen_back_btn = gr.Button("Back")
	                finished_btn = gr.Button("Finished", variant="primary")

	        with gr.Group(visible=False) as view_upload:
	            gr.Markdown("### Upload", elem_classes="panel-title")
	            image_in = gr.Image(
	                type="filepath",
	                sources=["upload", "webcam", "clipboard"],
	                label="Photo of your handwriting",
	            )
	            with gr.Row():
	                upload_back_btn = gr.Button("Back")
	                check_btn = gr.Button("Check", variant="primary")
	            # Same role as input_status, for the Check action.
	            upload_status = gr.Markdown(visible=False, sanitize_html=False, elem_classes="status")

	        with gr.Group(visible=False) as view_results:
	            gr.Markdown("### Results", elem_classes="panel-title")
	            recognized_out = gr.Textbox(
	                label="Recognized text (OCR)", interactive=False, lines=4
	            )
	            diff_out = gr.HTML(label="Feedback")
	            with gr.Row():
	                results_back_btn = gr.Button("Back")
	                restart_btn = gr.Button("Start over", variant="primary")

	        # Unpacked into every outputs list that receives nav(...) updates.
	        views = [view_input, view_listen, view_upload, view_results]

	        # CTA handlers: disable the button + show status, do the work (advance, or
	        # raise gr.Error and stay). On success .then clears + re-enables; on error
	        # .failure does the same (the raise aborts .then), so the button is never
	        # left stuck disabled.
	        start_work = start_btn.click(
	            lambda: _begin(f"Generating dictation… {COLD_START_HINT}"),
	            outputs=[input_status, start_btn],
	            show_progress="hidden",
	        ).then(
	            start,
	            inputs=[words_in, level_in, state],
	            outputs=[audio_out, text_out, state, *views, intro],
	            # Our _begin spinner is the indicator; suppress Gradio's own overlay,
	            # which would otherwise cover the visible card (and our spinner).
	            show_progress="hidden",
	        )
	        start_work.then(_end, outputs=[input_status, start_btn], show_progress="hidden")
	        start_work.failure(_recover, outputs=[input_status, start_btn], show_progress="hidden")

	        check_work = check_btn.click(
	            lambda: _begin(f"Reading your handwriting… {COLD_START_HINT}"),
	            outputs=[upload_status, check_btn],
	            show_progress="hidden",
	        ).then(
	            check,
	            inputs=[image_in, state],
	            outputs=[recognized_out, diff_out, *views],
	            # Keep our _begin spinner visible; suppress Gradio's overlay (it would
	            # cover the upload card and hide the spinner).
	            show_progress="hidden",
	        )
	        check_work.then(_end, outputs=[upload_status, check_btn], show_progress="hidden")
	        check_work.failure(_recover, outputs=[upload_status, check_btn], show_progress="hidden")
	        # Output order here must match the tuple returned by restart().
	        restart_btn.click(
	            restart,
	            outputs=[words_in, audio_out, text_out, image_in, recognized_out, diff_out, state, *views, intro],
	        )

	        # Navigation (Finished + Back buttons): switch view AND clear any leftover
	        # spinner / disabled CTA from an action on the view being left. `cancels`
	        # aborts an in-flight Start/Check so a call the user navigated away from
	        # can't complete and yank them forward (its outputs are discarded).
	        # Output order here must match the tuple returned by goto().
	        nav_outputs = [*views, input_status, upload_status, start_btn, check_btn, intro]
	        in_flight = [start_work, check_work]
	        finished_btn.click(lambda: goto("upload"), outputs=nav_outputs, cancels=in_flight)
	        listen_back_btn.click(lambda: goto("input"), outputs=nav_outputs, cancels=in_flight)
	        upload_back_btn.click(lambda: goto("listen"), outputs=nav_outputs, cancels=in_flight)
	        results_back_btn.click(lambda: goto("upload"), outputs=nav_outputs, cancels=in_flight)

	    return demo


	if __name__ == "__main__":
	    # theme, css and head are passed to launch() rather than Blocks (Gradio 6
	    # moved them there).
	    app = build_ui()
	    # The favicon doubles as the home-screen icon for the installable PWA
	    # (pwa=True has Gradio generate the manifest + service worker). Skip it
	    # when the logo file is absent.
	    favicon = LOGO_PATH if os.path.exists(LOGO_PATH) else None
	    # HF Spaces sets SPACE_ID and serves the app itself, so no share tunnel is
	    # requested there; locally the share link makes the app reachable from a phone.
	    running_on_spaces = "SPACE_ID" in os.environ
	    app.launch(
	        theme=THEME,
	        css=MOBILE_CSS,
	        head=FA_HEAD,
	        pwa=True,
	        favicon_path=favicon,
	        share=not running_on_spaces,
	    )