Spaces:
Sleeping
Sleeping
Commit ·
21fafd4
1
Parent(s): 849e756
feat: answer-language toggle (UK/EN) for judges
Browse files
app.py
CHANGED
|
@@ -4,7 +4,10 @@ Ukrainian official-letter decoder. Photo or pasted text in; plain-language
|
|
| 4 |
summary, required actions, deadlines, and scam flags out. Fully local:
|
| 5 |
Tesseract OCR + Qwen3-4B-Instruct-2507 (Q4_K_M) via llama.cpp. No cloud calls.
|
| 6 |
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
import html
|
|
@@ -42,6 +45,59 @@ SYSTEM_PROMPT = """Ти — «Розшифровувач паперів», по
|
|
| 42 |
|
| 43 |
ПРАВИЛА ДАТ І СУМ: використовуй лише дати та суми, які є в листі, і копіюй їх точно. Відносні строки («протягом 10 календарних днів») залишай відносними — не перетворюй їх на конкретні дати. Дати у "summary" мають дослівно збігатися з датами у "deadlines". Нічого не вигадуй."""
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
# ---------------------------------------------------------------- OCR path
|
| 46 |
|
| 47 |
|
|
@@ -62,19 +118,20 @@ def ocr_image(filepath: str | None) -> str:
|
|
| 62 |
# ------------------------------------------------------------- model call
|
| 63 |
|
| 64 |
|
| 65 |
-
def stream_completion(letter_text: str):
|
| 66 |
"""Yield (accumulated_text, completion_tokens, elapsed_s) while streaming."""
|
|
|
|
| 67 |
payload = {
|
| 68 |
"model": "paper-decoder",
|
| 69 |
"temperature": 0.2,
|
| 70 |
"seed": 42,
|
| 71 |
"max_tokens": 900,
|
| 72 |
-
"cache_prompt": True,
|
| 73 |
"response_format": {"type": "json_object"},
|
| 74 |
"stream": True,
|
| 75 |
"stream_options": {"include_usage": True},
|
| 76 |
"messages": [
|
| 77 |
-
{"role": "system", "content":
|
| 78 |
{"role": "user", "content": letter_text},
|
| 79 |
],
|
| 80 |
}
|
|
@@ -112,13 +169,12 @@ def parse_model_json(text: str) -> dict | None:
|
|
| 112 |
if candidate.startswith("json"):
|
| 113 |
candidate = candidate[4:]
|
| 114 |
candidate = candidate.strip()
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
pass
|
| 122 |
start, end = candidate.find("{"), candidate.rfind("}")
|
| 123 |
if start != -1 and end > start:
|
| 124 |
try:
|
|
@@ -149,21 +205,21 @@ def render_error(message: str) -> str:
|
|
| 149 |
return f'<div class="pd-error">{esc(message)}</div>'
|
| 150 |
|
| 151 |
|
| 152 |
-
def render_result(data: dict, tok_s: float | None) -> str:
|
| 153 |
flags = data.get("scam_flags") or []
|
| 154 |
is_scam = len(flags) > 0
|
| 155 |
|
| 156 |
if is_scam:
|
| 157 |
-
stamp = '<div class="pd-stamp pd-stamp-danger">
|
| 158 |
else:
|
| 159 |
-
stamp = '<div class="pd-stamp pd-stamp-ok">
|
| 160 |
|
| 161 |
parts = [f'<div class="pd-stamp-row">{stamp}</div>']
|
| 162 |
|
| 163 |
summary = data.get("summary")
|
| 164 |
if summary:
|
| 165 |
parts.append(
|
| 166 |
-
'<section class="pd-card"><h3>
|
| 167 |
f'<p class="pd-summary">{esc(summary)}</p></section>'
|
| 168 |
)
|
| 169 |
|
|
@@ -175,7 +231,7 @@ def render_result(data: dict, tok_s: float | None) -> str:
|
|
| 175 |
if isinstance(f, dict)
|
| 176 |
)
|
| 177 |
parts.append(
|
| 178 |
-
'<section class="pd-card pd-card-danger"><h3>
|
| 179 |
f'<ul class="pd-flags">{rows}</ul></section>'
|
| 180 |
)
|
| 181 |
|
|
@@ -183,7 +239,7 @@ def render_result(data: dict, tok_s: float | None) -> str:
|
|
| 183 |
if actions:
|
| 184 |
rows = "".join(f"<li>{esc(a)}</li>" for a in actions)
|
| 185 |
parts.append(
|
| 186 |
-
'<section class="pd-card"><h3>
|
| 187 |
f'<ol class="pd-actions">{rows}</ol></section>'
|
| 188 |
)
|
| 189 |
|
|
@@ -200,16 +256,15 @@ def render_result(data: dict, tok_s: float | None) -> str:
|
|
| 200 |
f'<td>{esc(d.get("what", ""))}</td>{amount_html}</tr>'
|
| 201 |
)
|
| 202 |
parts.append(
|
| 203 |
-
'<section class="pd-card"><h3>
|
| 204 |
'<table class="pd-deadlines"><thead><tr>'
|
| 205 |
-
|
|
|
|
| 206 |
f"<tbody>{rows}</tbody></table></section>"
|
| 207 |
)
|
| 208 |
|
| 209 |
-
speed = f" · {tok_s:.1f}
|
| 210 |
-
parts.append(
|
| 211 |
-
f'<div class="pd-meta">Оброблено локально, без інтернет-сервісів{speed}</div>'
|
| 212 |
-
)
|
| 213 |
return '<div class="pd-result">' + "".join(parts) + "</div>"
|
| 214 |
|
| 215 |
|
|
@@ -225,40 +280,42 @@ def do_ocr(image_path):
|
|
| 225 |
return text
|
| 226 |
|
| 227 |
|
| 228 |
-
def decode(letter_text: str):
|
|
|
|
|
|
|
|
|
|
| 229 |
letter_text = (letter_text or "").strip()
|
| 230 |
if not letter_text or letter_text.startswith("(Не вдалося"):
|
| 231 |
-
yield render_error(
|
| 232 |
-
"Спочатку додайте лист: сфотографуйте його або вставте текст у поле зліва."
|
| 233 |
-
)
|
| 234 |
return
|
| 235 |
letter_text = letter_text[:MAX_LETTER_CHARS]
|
| 236 |
|
| 237 |
-
yield render_status("
|
| 238 |
|
| 239 |
acc, completion_tokens, elapsed = "", 0, 0.0
|
| 240 |
try:
|
| 241 |
last_yield = 0.0
|
| 242 |
-
for acc, completion_tokens, elapsed in stream_completion(letter_text):
|
| 243 |
if elapsed - last_yield >= 1.0:
|
| 244 |
last_yield = elapsed
|
| 245 |
yield render_status(
|
| 246 |
-
"
|
|
|
|
| 247 |
)
|
| 248 |
except requests.RequestException as e:
|
| 249 |
-
yield render_error(f"
|
| 250 |
return
|
| 251 |
|
| 252 |
data = parse_model_json(acc)
|
| 253 |
if data is None:
|
| 254 |
yield (
|
| 255 |
-
render_error("
|
| 256 |
+ f'<pre class="pd-raw">{esc(acc[:2000])}</pre>'
|
| 257 |
)
|
| 258 |
return
|
| 259 |
|
| 260 |
tok_s = (completion_tokens / elapsed) if completion_tokens and elapsed > 0 else None
|
| 261 |
-
yield render_result(data, tok_s)
|
| 262 |
|
| 263 |
|
| 264 |
# --------------------------------------------------------------------- UI
|
|
@@ -378,6 +435,11 @@ with gr.Blocks(title="Paper Decoder — Розшифровувач папері
|
|
| 378 |
placeholder="…або вставте текст листа сюди",
|
| 379 |
lines=10,
|
| 380 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
with gr.Row():
|
| 382 |
decode_btn = gr.Button("Розшифрувати лист", variant="primary", size="lg")
|
| 383 |
clear_btn = gr.ClearButton([image, letter], value="Очистити")
|
|
@@ -388,12 +450,13 @@ with gr.Blocks(title="Paper Decoder — Розшифровувач папері
|
|
| 388 |
)
|
| 389 |
with gr.Column(scale=6):
|
| 390 |
result = gr.HTML(
|
| 391 |
-
'<div class="pd-meta">Тут з\'явиться розшифровка листа.
|
|
|
|
| 392 |
)
|
| 393 |
|
| 394 |
image.upload(do_ocr, inputs=[image], outputs=[letter])
|
| 395 |
-
decode_btn.click(decode, inputs=[letter], outputs=[result])
|
| 396 |
-
letter.submit(decode, inputs=[letter], outputs=[result])
|
| 397 |
|
| 398 |
if __name__ == "__main__":
|
| 399 |
demo.launch(css=CSS)
|
|
|
|
| 4 |
summary, required actions, deadlines, and scam flags out. Fully local:
|
| 5 |
Tesseract OCR + Qwen3-4B-Instruct-2507 (Q4_K_M) via llama.cpp. No cloud calls.
|
| 6 |
|
| 7 |
+
Answer-language toggle (UK/EN): the product is Ukrainian-first for my parents;
|
| 8 |
+
English answers exist so hackathon judges can evaluate output quality.
|
| 9 |
+
|
| 10 |
+
Build Small Hackathon 2026 · Backyard AI track.
|
| 11 |
"""
|
| 12 |
|
| 13 |
import html
|
|
|
|
| 45 |
|
| 46 |
ПРАВИЛА ДАТ І СУМ: використовуй лише дати та суми, які є в листі, і копіюй їх точно. Відносні строки («протягом 10 календарних днів») залишай відносними — не перетворюй їх на конкретні дати. Дати у "summary" мають дослівно збігатися з датами у "deadlines". Нічого не вигадуй."""
|
| 47 |
|
| 48 |
+
EN_SUFFIX = """
|
| 49 |
+
|
| 50 |
+
OUTPUT LANGUAGE OVERRIDE: Write the values of "summary", "actions", "what" and "pattern" in ENGLISH (translate the allowed pattern names). Keep dates, amounts and phone numbers exactly as written in the letter. Keep "evidence" quotes verbatim in their original language — they are citations."""
|
| 51 |
+
|
| 52 |
+
UK = "Українська"
|
| 53 |
+
EN = "English"
|
| 54 |
+
|
| 55 |
+
T = {
|
| 56 |
+
"uk": {
|
| 57 |
+
"stamp_danger": "Схоже на шахрайство",
|
| 58 |
+
"stamp_ok": "Виглядає як справжній лист",
|
| 59 |
+
"h_summary": "Про що цей лист",
|
| 60 |
+
"h_flags": "Чому це підозріло",
|
| 61 |
+
"h_actions": "Що робити",
|
| 62 |
+
"h_deadlines": "Дати та суми",
|
| 63 |
+
"col_when": "Коли",
|
| 64 |
+
"col_what": "Що",
|
| 65 |
+
"col_amount": "Сума",
|
| 66 |
+
"meta": "Оброблено локально, без інтернет-сервісів",
|
| 67 |
+
"status_read": "Читаю лист…",
|
| 68 |
+
"status_read_d": "Перший запит після запуску може тривати довше.",
|
| 69 |
+
"status_work": "Розшифровую…",
|
| 70 |
+
"status_chars": "символів за",
|
| 71 |
+
"err_empty": "Спочатку додайте лист: сфотографуйте його або вставте текст у поле зліва.",
|
| 72 |
+
"err_conn": "Помилка зв'язку з моделлю:",
|
| 73 |
+
"err_json": "Модель повернула некоректну відповідь. Спробуйте ще раз.",
|
| 74 |
+
},
|
| 75 |
+
"en": {
|
| 76 |
+
"stamp_danger": "Looks like a scam",
|
| 77 |
+
"stamp_ok": "Looks like a genuine letter",
|
| 78 |
+
"h_summary": "What this letter says",
|
| 79 |
+
"h_flags": "Why it is suspicious",
|
| 80 |
+
"h_actions": "What to do",
|
| 81 |
+
"h_deadlines": "Dates and amounts",
|
| 82 |
+
"col_when": "When",
|
| 83 |
+
"col_what": "What",
|
| 84 |
+
"col_amount": "Amount",
|
| 85 |
+
"meta": "Processed locally, no internet services",
|
| 86 |
+
"status_read": "Reading the letter…",
|
| 87 |
+
"status_read_d": "The first request after startup can take longer.",
|
| 88 |
+
"status_work": "Decoding…",
|
| 89 |
+
"status_chars": "characters in",
|
| 90 |
+
"err_empty": "Add a letter first: photograph it or paste the text on the left.",
|
| 91 |
+
"err_conn": "Model connection error:",
|
| 92 |
+
"err_json": "The model returned a malformed answer. Try again.",
|
| 93 |
+
},
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def lang_code(choice: str) -> str:
|
| 98 |
+
return "en" if choice == EN else "uk"
|
| 99 |
+
|
| 100 |
+
|
| 101 |
# ---------------------------------------------------------------- OCR path
|
| 102 |
|
| 103 |
|
|
|
|
| 118 |
# ------------------------------------------------------------- model call
|
| 119 |
|
| 120 |
|
| 121 |
+
def stream_completion(letter_text: str, lang: str):
|
| 122 |
"""Yield (accumulated_text, completion_tokens, elapsed_s) while streaming."""
|
| 123 |
+
system = SYSTEM_PROMPT + (EN_SUFFIX if lang == "en" else "")
|
| 124 |
payload = {
|
| 125 |
"model": "paper-decoder",
|
| 126 |
"temperature": 0.2,
|
| 127 |
"seed": 42,
|
| 128 |
"max_tokens": 900,
|
| 129 |
+
"cache_prompt": True,
|
| 130 |
"response_format": {"type": "json_object"},
|
| 131 |
"stream": True,
|
| 132 |
"stream_options": {"include_usage": True},
|
| 133 |
"messages": [
|
| 134 |
+
{"role": "system", "content": system},
|
| 135 |
{"role": "user", "content": letter_text},
|
| 136 |
],
|
| 137 |
}
|
|
|
|
| 169 |
if candidate.startswith("json"):
|
| 170 |
candidate = candidate[4:]
|
| 171 |
candidate = candidate.strip()
|
| 172 |
+
try:
|
| 173 |
+
obj = json.loads(candidate)
|
| 174 |
+
if isinstance(obj, dict):
|
| 175 |
+
return obj
|
| 176 |
+
except (json.JSONDecodeError, ValueError):
|
| 177 |
+
pass
|
|
|
|
| 178 |
start, end = candidate.find("{"), candidate.rfind("}")
|
| 179 |
if start != -1 and end > start:
|
| 180 |
try:
|
|
|
|
| 205 |
return f'<div class="pd-error">{esc(message)}</div>'
|
| 206 |
|
| 207 |
|
| 208 |
+
def render_result(data: dict, tok_s: float | None, t: dict) -> str:
|
| 209 |
flags = data.get("scam_flags") or []
|
| 210 |
is_scam = len(flags) > 0
|
| 211 |
|
| 212 |
if is_scam:
|
| 213 |
+
stamp = f'<div class="pd-stamp pd-stamp-danger">{esc(t["stamp_danger"])}</div>'
|
| 214 |
else:
|
| 215 |
+
stamp = f'<div class="pd-stamp pd-stamp-ok">{esc(t["stamp_ok"])}</div>'
|
| 216 |
|
| 217 |
parts = [f'<div class="pd-stamp-row">{stamp}</div>']
|
| 218 |
|
| 219 |
summary = data.get("summary")
|
| 220 |
if summary:
|
| 221 |
parts.append(
|
| 222 |
+
f'<section class="pd-card"><h3>{esc(t["h_summary"])}</h3>'
|
| 223 |
f'<p class="pd-summary">{esc(summary)}</p></section>'
|
| 224 |
)
|
| 225 |
|
|
|
|
| 231 |
if isinstance(f, dict)
|
| 232 |
)
|
| 233 |
parts.append(
|
| 234 |
+
f'<section class="pd-card pd-card-danger"><h3>{esc(t["h_flags"])}</h3>'
|
| 235 |
f'<ul class="pd-flags">{rows}</ul></section>'
|
| 236 |
)
|
| 237 |
|
|
|
|
| 239 |
if actions:
|
| 240 |
rows = "".join(f"<li>{esc(a)}</li>" for a in actions)
|
| 241 |
parts.append(
|
| 242 |
+
f'<section class="pd-card"><h3>{esc(t["h_actions"])}</h3>'
|
| 243 |
f'<ol class="pd-actions">{rows}</ol></section>'
|
| 244 |
)
|
| 245 |
|
|
|
|
| 256 |
f'<td>{esc(d.get("what", ""))}</td>{amount_html}</tr>'
|
| 257 |
)
|
| 258 |
parts.append(
|
| 259 |
+
f'<section class="pd-card"><h3>{esc(t["h_deadlines"])}</h3>'
|
| 260 |
'<table class="pd-deadlines"><thead><tr>'
|
| 261 |
+
f'<th>{esc(t["col_when"])}</th><th>{esc(t["col_what"])}</th><th>{esc(t["col_amount"])}</th>'
|
| 262 |
+
"</tr></thead>"
|
| 263 |
f"<tbody>{rows}</tbody></table></section>"
|
| 264 |
)
|
| 265 |
|
| 266 |
+
speed = f" · {tok_s:.1f} tok/s" if tok_s else ""
|
| 267 |
+
parts.append(f'<div class="pd-meta">{esc(t["meta"])}{speed}</div>')
|
|
|
|
|
|
|
| 268 |
return '<div class="pd-result">' + "".join(parts) + "</div>"
|
| 269 |
|
| 270 |
|
|
|
|
| 280 |
return text
|
| 281 |
|
| 282 |
|
| 283 |
+
def decode(letter_text: str, lang_choice: str):
|
| 284 |
+
lang = lang_code(lang_choice)
|
| 285 |
+
t = T[lang]
|
| 286 |
+
|
| 287 |
letter_text = (letter_text or "").strip()
|
| 288 |
if not letter_text or letter_text.startswith("(Не вдалося"):
|
| 289 |
+
yield render_error(t["err_empty"])
|
|
|
|
|
|
|
| 290 |
return
|
| 291 |
letter_text = letter_text[:MAX_LETTER_CHARS]
|
| 292 |
|
| 293 |
+
yield render_status(t["status_read"], t["status_read_d"])
|
| 294 |
|
| 295 |
acc, completion_tokens, elapsed = "", 0, 0.0
|
| 296 |
try:
|
| 297 |
last_yield = 0.0
|
| 298 |
+
for acc, completion_tokens, elapsed in stream_completion(letter_text, lang):
|
| 299 |
if elapsed - last_yield >= 1.0:
|
| 300 |
last_yield = elapsed
|
| 301 |
yield render_status(
|
| 302 |
+
t["status_work"],
|
| 303 |
+
f"{len(acc)} {t['status_chars']} {elapsed:.0f} s",
|
| 304 |
)
|
| 305 |
except requests.RequestException as e:
|
| 306 |
+
yield render_error(f"{t['err_conn']} {e}")
|
| 307 |
return
|
| 308 |
|
| 309 |
data = parse_model_json(acc)
|
| 310 |
if data is None:
|
| 311 |
yield (
|
| 312 |
+
render_error(t["err_json"])
|
| 313 |
+ f'<pre class="pd-raw">{esc(acc[:2000])}</pre>'
|
| 314 |
)
|
| 315 |
return
|
| 316 |
|
| 317 |
tok_s = (completion_tokens / elapsed) if completion_tokens and elapsed > 0 else None
|
| 318 |
+
yield render_result(data, tok_s, t)
|
| 319 |
|
| 320 |
|
| 321 |
# --------------------------------------------------------------------- UI
|
|
|
|
| 435 |
placeholder="…або вставте текст листа сюди",
|
| 436 |
lines=10,
|
| 437 |
)
|
| 438 |
+
lang = gr.Radio(
|
| 439 |
+
choices=[UK, EN],
|
| 440 |
+
value=UK,
|
| 441 |
+
label="Мова відповіді / Answer language",
|
| 442 |
+
)
|
| 443 |
with gr.Row():
|
| 444 |
decode_btn = gr.Button("Розшифрувати лист", variant="primary", size="lg")
|
| 445 |
clear_btn = gr.ClearButton([image, letter], value="Очистити")
|
|
|
|
| 450 |
)
|
| 451 |
with gr.Column(scale=6):
|
| 452 |
result = gr.HTML(
|
| 453 |
+
'<div class="pd-meta">Тут з\'явиться розшифровка листа. '
|
| 454 |
+
"/ The decoded letter will appear here.</div>"
|
| 455 |
)
|
| 456 |
|
| 457 |
image.upload(do_ocr, inputs=[image], outputs=[letter])
|
| 458 |
+
decode_btn.click(decode, inputs=[letter, lang], outputs=[result])
|
| 459 |
+
letter.submit(decode, inputs=[letter, lang], outputs=[result])
|
| 460 |
|
| 461 |
if __name__ == "__main__":
|
| 462 |
demo.launch(css=CSS)
|