braillematesystem committed on
Commit
a3fe23b
·
verified ·
1 Parent(s): 8c334d2

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +454 -79
main.py CHANGED
@@ -70,17 +70,17 @@ CACHE_DIR.mkdir(parents=True, exist_ok=True)
70
  # =========================================================
71
  SUMMARY_ENABLED = os.getenv("SUMMARY_ENABLED", "true").lower() == "true"
72
  SUMMARY_MODEL = os.getenv("SUMMARY_MODEL", "weijiahaha/t5-small-summarization")
73
- SUMMARY_MAX_INPUT_CHARS = int(os.getenv("SUMMARY_MAX_INPUT_CHARS", "1200"))
74
- SUMMARY_MAX_NEW_TOKENS = int(os.getenv("SUMMARY_MAX_NEW_TOKENS", "48"))
75
  SUMMARY_MIN_TEXT_LEN = int(os.getenv("SUMMARY_MIN_TEXT_LEN", "80"))
76
- SUMMARY_NUM_BEAMS = int(os.getenv("SUMMARY_NUM_BEAMS", "2"))
77
- SUMMARY_MAX_CHARS = int(os.getenv("SUMMARY_MAX_CHARS", "220"))
78
  TORCH_NUM_THREADS = int(os.getenv("TORCH_NUM_THREADS", "1"))
79
 
80
  # =========================================================
81
  # APP
82
  # =========================================================
83
- app = FastAPI(title=APP_NAME, version="1.5.0")
84
  app.add_middleware(
85
  CORSMiddleware,
86
  allow_origins=["*"],
@@ -151,7 +151,7 @@ class ScanRequest(BaseModel):
151
  uuid: str
152
 
153
  # =========================================================
154
- # HELPERS
155
  # =========================================================
156
  def _now() -> float:
157
  return time.time()
@@ -277,6 +277,7 @@ def basic_cleanup(text: str) -> str:
277
  text = re.sub(r"\s+([,.;:!?])", r"\1", text)
278
  text = re.sub(r"([,.;:!?])([A-Za-z])", r"\1 \2", text)
279
  text = re.sub(r"[ \t]+", " ", text)
 
280
  return text.strip()
281
 
282
  def looks_bad(text: str, avg_conf: Optional[float]) -> bool:
@@ -290,6 +291,9 @@ def looks_bad(text: str, avg_conf: Optional[float]) -> bool:
290
  )
291
  return (weird / max(1, len(text))) > 0.20
292
 
 
 
 
293
  def get_paddle_ocr() -> PaddleOCR:
294
  global _OCR_PADDLE
295
  if _OCR_PADDLE is not None:
@@ -334,6 +338,9 @@ def paddle_ocr_extract(img: Image.Image) -> Tuple[str, float]:
334
  avg_conf = float(sum(confs) / len(confs)) if confs else 0.0
335
  return full_text, avg_conf
336
 
 
 
 
337
  def get_hf_corrector():
338
  global _HF_CORRECTOR
339
  if _HF_CORRECTOR is not None:
@@ -355,7 +362,7 @@ def hf_correct_text(text: str) -> str:
355
  return text
356
 
357
  # =========================================================
358
- # SUMMARY HELPERS
359
  # =========================================================
360
  def _truncate_text(text: str, max_chars: int) -> str:
361
  text = text.strip()
@@ -367,32 +374,246 @@ def _truncate_text(text: str, max_chars: int) -> str:
367
  cut = cut[:last_space]
368
  return cut.rstrip(" ,;:-") + "..."
369
 
370
- def prepare_text_for_summary(text: str) -> str:
371
- text = text.replace("\r", "\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
- raw_lines = [ln.strip() for ln in text.split("\n")]
 
374
  raw_lines = [ln for ln in raw_lines if ln]
375
 
376
  if not raw_lines:
377
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
 
379
  merged_parts: List[str] = []
380
  buffer = ""
381
 
382
- for line in raw_lines:
 
 
 
 
 
 
383
  if not buffer:
384
  buffer = line
385
  continue
386
 
387
  prev_end = buffer[-1] if buffer else ""
 
 
 
 
388
  should_join = True
389
 
390
  if prev_end in ".!?:":
391
  should_join = False
392
 
393
- if len(buffer) < 20 and len(line) < 20:
394
  should_join = False
395
 
 
 
 
 
 
 
396
  if should_join:
397
  buffer = f"{buffer} {line}"
398
  else:
@@ -402,46 +623,152 @@ def prepare_text_for_summary(text: str) -> str:
402
  if buffer:
403
  merged_parts.append(buffer.strip())
404
 
405
- text = " ".join(merged_parts)
406
- text = re.sub(r"\s+", " ", text)
407
- text = re.sub(r"\s+([,.;:!?])", r"\1", text)
408
- text = re.sub(r"([,.;:!?])([A-Za-z])", r"\1 \2", text)
409
- text = re.sub(r"[.]{2,}", ".", text)
410
- text = re.sub(r"[?]{2,}", "?", text)
411
- text = re.sub(r"[!]{2,}", "!", text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
 
413
- return text.strip()
414
-
415
- def _simple_summary_fallback(text: str, max_chars: int = 220) -> str:
416
- text = prepare_text_for_summary(text)
417
- text = basic_cleanup(text)
 
 
418
 
 
 
 
 
 
419
  if not text:
420
  return ""
 
 
 
 
421
 
422
- lower = text.lower()
423
- story_markers = [
424
- "once upon", "rabbit", "tortoise", "lion", "mouse",
425
- "crow", "fox", "race", "moral", "boasted", "won"
426
- ]
427
- story_hits = sum(1 for m in story_markers if m in lower)
428
 
429
- if story_hits >= 2:
430
- if "rabbit" in lower and "tortoise" in lower:
431
- return "Slow and steady wins the race."
432
- if "boast" in lower or "proud" in lower:
433
- return "Overconfidence can lead to failure."
434
- return "The story shows that patience and consistency can lead to success."
435
 
436
- parts = re.split(r'(?<=[.!?])\s+', text)
437
- parts = [p.strip() for p in parts if p.strip()]
 
 
 
438
 
439
- for p in parts:
440
- if len(p) >= 30 and len(re.findall(r"[A-Za-z]", p)) >= 15:
441
- return _truncate_text(p, max_chars)
 
 
442
 
443
- return _truncate_text(text, max_chars)
 
444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  def get_hf_summarizer():
446
  global _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
447
 
@@ -471,31 +798,64 @@ def get_hf_summarizer():
471
  print(f"[summary] model loaded: {SUMMARY_MODEL} on {device}")
472
  return _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
473
 
474
- def summarize_text(text: str) -> Tuple[str, str]:
475
- text = basic_cleanup(text)
476
-
 
 
477
  if not SUMMARY_ENABLED:
478
- return "", "disabled"
479
 
480
- if not text:
481
- return "", "empty"
 
 
 
 
 
 
482
 
483
- prepared = prepare_text_for_summary(text)
 
484
 
485
- if len(prepared) < SUMMARY_MIN_TEXT_LEN:
486
- return prepared, "source_short"
 
487
 
488
- source = _truncate_text(prepared, SUMMARY_MAX_INPUT_CHARS)
 
 
 
 
 
 
489
 
 
490
  lower = source.lower()
 
491
  looks_like_story = any(x in lower for x in [
492
- "once upon", "rabbit", "tortoise", "lion", "mouse", "fox", "crow", "race"
 
493
  ])
494
 
 
 
 
 
 
 
 
 
495
  if looks_like_story:
496
- prompt = f"summarize the story in one short clear sentence with the main lesson: {source}"
497
- else:
498
- prompt = f"summarize: {source}"
 
 
 
 
 
 
499
 
500
  try:
501
  tokenizer, model, torch = get_hf_summarizer()
@@ -513,7 +873,7 @@ def summarize_text(text: str) -> Tuple[str, str]:
513
  with torch.no_grad():
514
  output_ids = model.generate(
515
  **inputs,
516
- max_new_tokens=SUMMARY_MAX_NEW_TOKENS,
517
  num_beams=max(2, SUMMARY_NUM_BEAMS),
518
  do_sample=False,
519
  early_stopping=True,
@@ -524,31 +884,39 @@ def summarize_text(text: str) -> Tuple[str, str]:
524
 
525
  summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
526
  summary = basic_cleanup(summary)
 
527
  summary = re.sub(r"^(summary|summarize|main idea|moral)\s*:\s*", "", summary, flags=re.I).strip()
528
 
529
- if not summary:
530
- return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_empty"
531
-
532
- if len(summary) < 12:
533
- return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_too_short"
534
-
535
- alpha_count = len(re.findall(r"[A-Za-z]", summary))
536
- word_count = len(re.findall(r"[A-Za-z']+", summary))
537
- if alpha_count < 8 or word_count < 3:
538
- return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_fragment"
539
-
540
- if summary.lower() in prepared.lower() and len(summary) < 20:
541
- return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_raw_fragment"
542
-
543
- if summary.endswith((" the", " and", " or", " of", " to", " in", " a")):
544
- return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_incomplete"
545
-
546
- summary = _truncate_text(summary, SUMMARY_MAX_CHARS)
547
- return summary, "t5_small_summarization"
 
 
 
 
 
 
 
548
 
549
  except Exception as e:
550
  print(f"[summary] failed: {type(e).__name__}: {e}")
551
- return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), f"fallback:{type(e).__name__}"
552
 
553
  # =========================================================
554
  # STARTUP
@@ -665,7 +1033,14 @@ async def ocr_endpoint(
665
  if len(final_text) > MAX_TEXT_LEN:
666
  final_text = final_text[:MAX_TEXT_LEN]
667
 
668
- summary, summary_method = summarize_text(final_text)
 
 
 
 
 
 
 
669
 
670
  payload = {
671
  "uuid": uuid,
 
70
  # =========================================================
71
  SUMMARY_ENABLED = os.getenv("SUMMARY_ENABLED", "true").lower() == "true"
72
  SUMMARY_MODEL = os.getenv("SUMMARY_MODEL", "weijiahaha/t5-small-summarization")
73
+ SUMMARY_MAX_INPUT_CHARS = int(os.getenv("SUMMARY_MAX_INPUT_CHARS", "1600"))
74
+ SUMMARY_MAX_NEW_TOKENS = int(os.getenv("SUMMARY_MAX_NEW_TOKENS", "72"))
75
  SUMMARY_MIN_TEXT_LEN = int(os.getenv("SUMMARY_MIN_TEXT_LEN", "80"))
76
+ SUMMARY_NUM_BEAMS = int(os.getenv("SUMMARY_NUM_BEAMS", "3"))
77
+ SUMMARY_MAX_CHARS = int(os.getenv("SUMMARY_MAX_CHARS", "260"))
78
  TORCH_NUM_THREADS = int(os.getenv("TORCH_NUM_THREADS", "1"))
79
 
80
  # =========================================================
81
  # APP
82
  # =========================================================
83
+ app = FastAPI(title=APP_NAME, version="1.11.0")
84
  app.add_middleware(
85
  CORSMiddleware,
86
  allow_origins=["*"],
 
151
  uuid: str
152
 
153
  # =========================================================
154
+ # GENERAL HELPERS
155
  # =========================================================
156
  def _now() -> float:
157
  return time.time()
 
277
  text = re.sub(r"\s+([,.;:!?])", r"\1", text)
278
  text = re.sub(r"([,.;:!?])([A-Za-z])", r"\1 \2", text)
279
  text = re.sub(r"[ \t]+", " ", text)
280
+ text = text.replace("“", '"').replace("”", '"').replace("’", "'").replace("‘", "'")
281
  return text.strip()
282
 
283
  def looks_bad(text: str, avg_conf: Optional[float]) -> bool:
 
291
  )
292
  return (weird / max(1, len(text))) > 0.20
293
 
294
+ # =========================================================
295
+ # OCR
296
+ # =========================================================
297
  def get_paddle_ocr() -> PaddleOCR:
298
  global _OCR_PADDLE
299
  if _OCR_PADDLE is not None:
 
338
  avg_conf = float(sum(confs) / len(confs)) if confs else 0.0
339
  return full_text, avg_conf
340
 
341
+ # =========================================================
342
+ # OPTIONAL HF TEXT CORRECTION
343
+ # =========================================================
344
  def get_hf_corrector():
345
  global _HF_CORRECTOR
346
  if _HF_CORRECTOR is not None:
 
362
  return text
363
 
364
  # =========================================================
365
+ # SUMMARY REPAIR / NORMALIZATION
366
  # =========================================================
367
  def _truncate_text(text: str, max_chars: int) -> str:
368
  text = text.strip()
 
374
  cut = cut[:last_space]
375
  return cut.rstrip(" ,;:-") + "..."
376
 
377
+ def _sentence_split(text: str) -> List[str]:
378
+ parts = re.split(r'(?<=[.!?])\s+', text)
379
+ return [p.strip() for p in parts if p.strip()]
380
+
381
+ def _looks_like_title(line: str) -> bool:
382
+ s = line.strip()
383
+ if not s:
384
+ return False
385
+ if len(s) > 90:
386
+ return False
387
+ if re.fullmatch(r"[0-9]+", s):
388
+ return False
389
+
390
+ words = re.findall(r"[A-Za-z][A-Za-z'-]*", s)
391
+ if not words:
392
+ return False
393
+
394
+ titleish = sum(1 for w in words if w[:1].isupper())
395
+ ratio = titleish / max(1, len(words))
396
+ return ratio >= 0.6 and len(words) <= 10
397
+
398
def _clean_title(line: str) -> str:
    """Normalize a candidate title line.

    Runs the shared OCR cleanup, then strips a leading run of digits
    (page numbers), trailing dots, and collapses double spaces.
    """
    cleaned = basic_cleanup(line)
    cleaned = re.sub(r"^[0-9]+\s*", "", cleaned).strip()
    cleaned = re.sub(r"\.+$", "", cleaned).strip()
    return re.sub(r"\s{2,}", " ", cleaned)
404
+
405
def _extract_moral(text: str) -> Optional[str]:
    """Pull the 'moral' sentence out of a fable, if one is present.

    Tries the phrase "moral of the story" first, then a bare
    "moral:" / "moral-" prefix. Returns the first sentence after the
    marker (leading punctuation stripped), or None when no moral is found.
    """
    cleaned = basic_cleanup(text)

    match = re.search(r"(moral\s+of\s+the\s+story\.?\s*)(.+)$", cleaned, flags=re.I)
    if match:
        candidate = match.group(2).strip()
        candidate = re.split(r'(?<=[.!?])\s+', candidate)[0].strip()
        candidate = re.sub(r"^[\-\:\*\"\']+", "", candidate).strip()
        if candidate:
            return candidate

    match = re.search(r"\bmoral\s*[:\-]\s*(.+)$", cleaned, flags=re.I)
    if match:
        candidate = match.group(1).strip()
        candidate = re.split(r'(?<=[.!?])\s+', candidate)[0].strip()
        if candidate:
            return candidate

    return None
424
 
425
def _extract_title_and_context_lines(text: str) -> Tuple[Optional[str], Optional[str], List[str]]:
    """Split OCR text into (title, context_line, body_lines).

    - title: consecutive title-looking lines from the top of the page,
      joined into one string (or None).
    - context_line: a short line right after the title that mentions a
      month name, a 4-digit year, or contains a comma/hyphen — e.g. a
      date or byline (or None).
    - body_lines: the remaining lines with the detected title/context
      lines removed from the front.
    """
    # Normalize CR to LF, then keep only non-empty trimmed lines.
    raw_lines = [ln.strip() for ln in text.replace("\r", "\n").split("\n")]
    raw_lines = [ln for ln in raw_lines if ln]

    if not raw_lines:
        return None, None, []

    title = None
    context_line = None

    # Only look at the first 6 lines for a title; drop bare page numbers.
    filtered = [ln for ln in raw_lines[:6] if not re.fullmatch(r"[0-9]+", ln.strip())]

    if filtered:
        # Collect leading consecutive title-like lines (at most 3).
        title_parts: List[str] = []
        for ln in filtered[:3]:
            if _looks_like_title(ln):
                title_parts.append(_clean_title(ln))
            else:
                break

        if title_parts:
            title = " ".join([p for p in title_parts if p]).strip()
            title = re.sub(r"\s+", " ", title).strip()

        if title:
            # The 1-2 lines after the title may be a date/byline context line.
            for ln in filtered[len(title_parts):len(title_parts) + 2]:
                c = _clean_title(ln)
                if len(c) <= 60 and (
                    re.search(r"\b(january|february|march|april|may|june|july|august|september|october|november|december)\b", c, re.I)
                    or re.search(r"\b\d{4}\b", c)
                    or "," in c
                    or "-" in c
                ):
                    context_line = c
                    break

    body_lines = list(raw_lines)

    # Remove from the front every line that shares one of the first two
    # title words (handles the title spanning multiple OCR lines).
    # NOTE(review): substring matching on short words can over-remove
    # body lines — confirm against real OCR pages.
    remove_count = 0
    if title:
        title_words = title.lower().split()
        while remove_count < len(body_lines):
            candidate = _clean_title(body_lines[remove_count]).lower()
            if not candidate:
                remove_count += 1
                continue
            if any(w in candidate for w in title_words[:2]):
                remove_count += 1
            else:
                break

    # Also drop the context line itself if it is the next body line.
    if context_line and remove_count < len(body_lines):
        if _clean_title(body_lines[remove_count]).lower() == context_line.lower():
            remove_count += 1

    body_lines = body_lines[remove_count:]
    return title, context_line, body_lines
482
+
483
+ def _extract_person_name(text: str) -> Optional[str]:
484
+ if not text:
485
+ return None
486
+
487
+ text = re.sub(r"\barren Buffett\b", "Warren Buffett", text)
488
+ text = re.sub(r"\barren\b", "Warren", text)
489
+
490
+ patterns = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})\b", text)
491
+ blacklist = {
492
+ "The Less", "Long Term", "Capital Management", "Coca Cola",
493
+ "Howard Plain", "Wells Fargo", "Pulitzer Prize"
494
+ }
495
+
496
+ for p in patterns:
497
+ if p not in blacklist:
498
+ return p
499
+
500
+ return None
501
+
502
+ def _looks_like_biography_or_profile(text: str) -> bool:
503
+ lower = text.lower()
504
+ markers = [
505
+ "office", "headquarters", "photographs", "memorabilia",
506
+ "appearance", "chair", "desk", "eyebrow", "glasses",
507
+ "shirt", "suit jacket", "surrounded by", "buffett",
508
+ "berkshire", "omaha"
509
+ ]
510
+ return sum(1 for m in markers if m in lower) >= 3
511
+
512
def _light_ocr_word_fixes(text: str) -> str:
    """Very lightweight OCR cleanup for summary source only.

    Applies a fixed table of regex -> replacement substitutions in
    insertion order (all case-insensitive), then removes a stray period
    between two capitalized words and collapses whitespace.
    Keep this small and safe.
    """
    # NOTE(review): entries like r"\btion\b" -> "lion" and
    # r"\blen\b" -> "left" are aggressive and can corrupt legitimate
    # words; they are tuned to specific known OCR samples — verify
    # before widening usage. Order matters: earlier patterns can
    # create/destroy matches for later ones.
    fixes = {
        r"\barren Buffett\b": "Warren Buffett",
        r"\barren\b": "Warren",
        r"\btion\b": "lion",
        r"\ba tion\b": "a lion",
        r"\brabblt\b": "rabbit",
        r"\btortoiso\b": "tortoise",
        r"\btme\b": "time",
        r"\bwnole\b": "whole",
        r"\bwoko\b": "woke",
        r"\bho saw\b": "he saw",
        r"\bseep\b": "sleep",
        r"\bIarge\b": "large",
        r"\bIace\b": "lace",
        r"\bbascball\b": "baseball",
        r"\bfinishlng\b": "finishing",
        r"\bgocd\b": "good",
        r"\bortoise ks\b": "tortoise is",
        r"\blen\b": "left",
        r"\bandd\b": "and",
        r"\bselfassurance\b": "self assurance",
        r"\bthe'mouse's\b": "the mouse's",
        r"\bin\.\s+distress\b": "in distress",
        r"\bThey had\.\s+him\b": "They had him",
        r"\blet him\.\s+go\b": "let him go",
        r"\bThe Lion andd\b": "The Lion and the Mouse",
        r"\bThe Mouse\b": "the mouse",
        r"\bMean while\b": "Meanwhile",
        r"\bA tortoise\b": "a tortoise",
        r"\bRabbit and a tortoise\b": "rabbit and a tortoise",
    }

    out = text
    for pat, repl in fixes.items():
        out = re.sub(pat, repl, out, flags=re.I)

    # Drop a spurious sentence break OCR'd between two capitalized words,
    # e.g. "Warren. Buffett" -> "Warren Buffett".
    out = re.sub(r"\b([A-Z][a-z]{2,})\.\s+([A-Z][a-z]{2,})\b", r"\1 \2", out)
    out = re.sub(r"\s+", " ", out).strip()

    return out
557
+
558
+ def repair_text_for_summary(text: str) -> Dict[str, Optional[str]]:
559
+ """
560
+ Repairs OCR text for summarization without changing OCR extraction behavior.
561
+ """
562
+ text = basic_cleanup(text)
563
+ title, context_line, body_lines = _extract_title_and_context_lines(text)
564
+
565
+ if not body_lines and text:
566
+ body_lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
567
+
568
+ repaired_lines: List[str] = []
569
+
570
+ i = 0
571
+ while i < len(body_lines):
572
+ line = basic_cleanup(body_lines[i])
573
+
574
+ if line.endswith("-") and i + 1 < len(body_lines):
575
+ nxt = basic_cleanup(body_lines[i + 1])
576
+ line = line[:-1] + nxt
577
+ repaired_lines.append(line)
578
+ i += 2
579
+ continue
580
+
581
+ repaired_lines.append(line)
582
+ i += 1
583
 
584
  merged_parts: List[str] = []
585
  buffer = ""
586
 
587
+ for line in repaired_lines:
588
+ if not line:
589
+ continue
590
+
591
+ line = re.sub(r"\s+", " ", line).strip()
592
+ line = re.sub(r"^[\*\•\-\_]+\s*", "", line)
593
+
594
  if not buffer:
595
  buffer = line
596
  continue
597
 
598
  prev_end = buffer[-1] if buffer else ""
599
+ starts_lower = bool(re.match(r"^[a-z]", line))
600
+ starts_common = bool(re.match(r"^(and|but|or|so|then|when|while|because|if|that|who|which|where)\b", line, re.I))
601
+ starts_short = len(line.split()) <= 4
602
+
603
  should_join = True
604
 
605
  if prev_end in ".!?:":
606
  should_join = False
607
 
608
+ if _looks_like_title(line):
609
  should_join = False
610
 
611
+ if starts_lower or starts_common:
612
+ should_join = True
613
+
614
+ if starts_short and prev_end not in ".!?":
615
+ should_join = True
616
+
617
  if should_join:
618
  buffer = f"{buffer} {line}"
619
  else:
 
623
  if buffer:
624
  merged_parts.append(buffer.strip())
625
 
626
+ repaired_text = " ".join(merged_parts)
627
+
628
+ repaired_text = repaired_text.replace("..", ".")
629
+ repaired_text = repaired_text.replace(" .", ".")
630
+ repaired_text = repaired_text.replace(" ,", ",")
631
+ repaired_text = repaired_text.replace(" ;", ";")
632
+ repaired_text = repaired_text.replace(" :", ":")
633
+ repaired_text = repaired_text.replace(" !", "!")
634
+ repaired_text = repaired_text.replace(" ?", "?")
635
+ repaired_text = re.sub(r"([A-Za-z])'at'([A-Za-z])", r"\1 \2", repaired_text)
636
+ repaired_text = re.sub(r"([A-Za-z])'([A-Za-z])", r"\1'\2", repaired_text)
637
+ repaired_text = re.sub(r"\s+", " ", repaired_text).strip()
638
+ repaired_text = re.sub(r"[.]{2,}", ".", repaired_text)
639
+ repaired_text = re.sub(r"[?]{2,}", "?", repaired_text)
640
+ repaired_text = re.sub(r"[!]{2,}", "!", repaired_text)
641
+
642
+ repaired_text = _light_ocr_word_fixes(repaired_text)
643
+ moral = _extract_moral(text)
644
+
645
+ repaired_no_moral = repaired_text
646
+ repaired_no_moral = re.sub(r"moral\s+of\s+the\s+story\.?\s*.*$", "", repaired_no_moral, flags=re.I).strip()
647
+ repaired_no_moral = re.sub(r"\bmoral\s*[:\-]\s*.*$", "", repaired_no_moral, flags=re.I).strip()
648
+
649
+ lead_sentence = None
650
+ for sent in _sentence_split(repaired_no_moral or repaired_text):
651
+ if len(sent) >= 25 and len(re.findall(r"[A-Za-z]", sent)) >= 12:
652
+ lead_sentence = sent
653
+ break
654
 
655
+ return {
656
+ "title": title,
657
+ "context_line": context_line,
658
+ "moral": moral,
659
+ "repaired_text": repaired_no_moral or repaired_text,
660
+ "lead_sentence": lead_sentence,
661
+ }
662
 
663
def sanitize_summary_text(text: str) -> str:
    """Make a summary braille-friendly: letters, digits and single spaces only.

    Every character outside [A-Za-z0-9] and whitespace (punctuation,
    smart quotes, symbols) becomes a space, then whitespace runs are
    collapsed and the result is trimmed. Falsy input yields "".
    """
    if not text:
        return ""
    # The previous explicit smart-quote -> ASCII-quote replacements were
    # dead code: the character class below maps those quotes (and the
    # ASCII ones) to a space regardless.
    text = re.sub(r"[^A-Za-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()
674
 
675
+ def _is_bad_model_summary(summary: str, repaired_text: str) -> Optional[str]:
676
+ if not summary:
677
+ return "empty"
678
+ if len(summary) < 18:
679
+ return "too_short"
 
680
 
681
+ alpha_count = len(re.findall(r"[A-Za-z]", summary))
682
+ word_count = len(re.findall(r"[A-Za-z']+", summary))
683
+ if alpha_count < 10 or word_count < 4:
684
+ return "fragment"
 
 
685
 
686
+ if re.search(r"\b(the|and|or|of|to|in|a)$", summary.strip(), re.I):
687
+ return "incomplete"
688
+
689
+ if summary.lower() in repaired_text.lower() and len(summary) < 40:
690
+ return "raw_fragment"
691
 
692
+ bad_tokens = ["tion was", "rabblt", "tortoiso", "wnole", "woko", "len the", "andd"]
693
+ if any(tok in summary.lower() for tok in bad_tokens):
694
+ return "ocr_noise"
695
+
696
+ return None
697
 
698
def _story_fallback(title: Optional[str], moral: Optional[str], repaired_text: str, max_chars: int) -> str:
    """Build a deterministic summary for fable-like text.

    Uses a canned one-liner for the two recognized fables (lion & mouse,
    rabbit & tortoise); otherwise takes the first two sentences. The
    extracted moral (or a default one) and the title are prepended or
    appended, and the result is truncated to *max_chars*.
    """
    lowered = repaired_text.lower()

    canned = None
    default_moral = None
    if "lion" in lowered and "mouse" in lowered:
        canned = "A lion spared a mouse and later the mouse freed the lion from hunters"
        default_moral = " Moral kindness can be repaid"
    elif "rabbit" in lowered and "tortoise" in lowered:
        canned = "A fast rabbit lost a race to a steady tortoise after stopping to rest"
        default_moral = " Moral slow and steady wins the race"

    if canned is not None:
        summary = canned + (f" Moral {moral}" if moral else default_moral)
        if title:
            summary = f"{title} {summary}"
        return _truncate_text(summary, max_chars)

    # Generic story: lead with the first two sentences.
    summary = " ".join(_sentence_split(repaired_text)[:2]).strip()
    if moral and moral.lower() not in summary.lower():
        summary = f"{summary} Moral {moral}"
    if title:
        summary = f"{title} {summary}"
    return _truncate_text(summary, max_chars)
728
+
729
def _structured_summary_fallback(title: Optional[str], context_line: Optional[str], repaired_text: str, moral: Optional[str], max_chars: int) -> str:
    """Better fallback for noisy non-story text.

    For biography/profile-looking text with a detectable person name,
    emits a templated description sentence; otherwise composes
    title + context line + first substantial sentence + moral,
    truncated to *max_chars*.
    """
    person = _extract_person_name(repaired_text)
    is_profile = _looks_like_biography_or_profile(repaired_text)

    if is_profile and person:
        parts = []
        if title:
            parts.append(title)
        if context_line:
            parts.append(context_line)

        header = " ".join(parts).strip()
        # Templated profile sentence; punctuation-free on purpose
        # (downstream sanitizer keeps letters/digits/spaces only).
        core = f"{person} is described through his appearance manner and surroundings"
        if "office" in repaired_text.lower() or "desk" in repaired_text.lower():
            core += " in his office"

        result = f"{header} {core}".strip() if header else core
        return _truncate_text(result, max_chars)

    # Non-profile: pick the first sentence that is long enough to be
    # meaningful (>=30 chars, >=15 letters).
    lead = None
    for sent in _sentence_split(repaired_text):
        if len(sent) >= 30 and len(re.findall(r"[A-Za-z]", sent)) >= 15:
            lead = sent
            break

    parts = []
    if title:
        parts.append(title)
    if context_line:
        parts.append(context_line)
    if lead:
        parts.append(lead)
    if moral:
        parts.append(f"Moral {moral}")

    return _truncate_text(" ".join(parts).strip(), max_chars)
768
+
769
+ # =========================================================
770
+ # SUMMARIZER MODEL
771
+ # =========================================================
772
  def get_hf_summarizer():
773
  global _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
774
 
 
798
  print(f"[summary] model loaded: {SUMMARY_MODEL} on {device}")
799
  return _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
800
 
801
+ def summarize_text(text: str) -> Tuple[str, str, str]:
802
+ """
803
+ Returns:
804
+ summary, summary_method, repaired_text_used_for_summary
805
+ """
806
  if not SUMMARY_ENABLED:
807
+ return "", "disabled", ""
808
 
809
+ if not text.strip():
810
+ return "", "empty", ""
811
+
812
+ repaired = repair_text_for_summary(text)
813
+ title = repaired["title"]
814
+ context_line = repaired["context_line"]
815
+ moral = repaired["moral"]
816
+ repaired_text = repaired["repaired_text"] or ""
817
 
818
+ if not repaired_text:
819
+ return "", "empty_repaired", ""
820
 
821
+ # stronger post-repair cleanup
822
+ repaired_text = _light_ocr_word_fixes(repaired_text)
823
+ repaired_text = basic_cleanup(repaired_text)
824
 
825
+ # infer title if missing and story entities obvious
826
+ lower_full = repaired_text.lower()
827
+ if not title:
828
+ if "lion" in lower_full and "mouse" in lower_full:
829
+ title = "The Lion and the Mouse"
830
+ elif "rabbit" in lower_full and "tortoise" in lower_full:
831
+ title = "The Rabbit and the Tortoise"
832
 
833
+ source = _truncate_text(repaired_text, SUMMARY_MAX_INPUT_CHARS)
834
  lower = source.lower()
835
+
836
  looks_like_story = any(x in lower for x in [
837
+ "once upon", "rabbit", "tortoise", "lion", "mouse",
838
+ "fox", "crow", "race", "hunters", "jungle", "forest"
839
  ])
840
 
841
+ looks_like_profile = _looks_like_biography_or_profile(source)
842
+
843
+ # 1) biography/profile -> structured fallback first
844
+ if looks_like_profile:
845
+ summary = _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS)
846
+ return summary, "structured_profile_fallback", repaired_text
847
+
848
+ # 2) story/fable -> ALWAYS use structured story fallback
849
  if looks_like_story:
850
+ summary = _story_fallback(title, moral, repaired_text, SUMMARY_MAX_CHARS)
851
+ return summary, "structured_story_fallback", repaired_text
852
+
853
+ # 3) short non-story -> fallback
854
+ if len(source) < SUMMARY_MIN_TEXT_LEN:
855
+ return _structured_summary_fallback(title, context_line, source, moral, SUMMARY_MAX_CHARS), "fallback_short", repaired_text
856
+
857
+ # 4) model only for non-story text
858
+ prompt = f"summarize: {source}"
859
 
860
  try:
861
  tokenizer, model, torch = get_hf_summarizer()
 
873
  with torch.no_grad():
874
  output_ids = model.generate(
875
  **inputs,
876
+ max_new_tokens=max(40, SUMMARY_MAX_NEW_TOKENS),
877
  num_beams=max(2, SUMMARY_NUM_BEAMS),
878
  do_sample=False,
879
  early_stopping=True,
 
884
 
885
  summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
886
  summary = basic_cleanup(summary)
887
+ summary = _light_ocr_word_fixes(summary)
888
  summary = re.sub(r"^(summary|summarize|main idea|moral)\s*:\s*", "", summary, flags=re.I).strip()
889
 
890
+ # reject prompt echo junk
891
+ prompt_echo_markers = [
892
+ "the story in one short clear sentence",
893
+ "if there is a lesson include it briefly",
894
+ "summarize this story",
895
+ "summarize this text",
896
+ ]
897
+ if any(m in summary.lower() for m in prompt_echo_markers):
898
+ return _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS), "fallback_prompt_echo", repaired_text
899
+
900
+ bad_reason = _is_bad_model_summary(summary, repaired_text)
901
+ if bad_reason:
902
+ return _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS), f"fallback_generic_{bad_reason}", repaired_text
903
+
904
+ enriched = summary
905
+
906
+ if title and title.lower() not in enriched.lower():
907
+ if context_line and context_line.lower() not in enriched.lower():
908
+ enriched = f"{title} {context_line} {enriched}"
909
+ else:
910
+ enriched = f"{title} {enriched}"
911
+ elif context_line and context_line.lower() not in enriched.lower():
912
+ enriched = f"{context_line} {enriched}"
913
+
914
+ enriched = _truncate_text(enriched, SUMMARY_MAX_CHARS)
915
+ return enriched, "t5_small_summarization_repaired", repaired_text
916
 
917
  except Exception as e:
918
  print(f"[summary] failed: {type(e).__name__}: {e}")
919
+ return _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS), f"fallback_generic:{type(e).__name__}", repaired_text
920
 
921
  # =========================================================
922
  # STARTUP
 
1033
  if len(final_text) > MAX_TEXT_LEN:
1034
  final_text = final_text[:MAX_TEXT_LEN]
1035
 
1036
+ summary, summary_method, summary_source_text = summarize_text(final_text)
1037
+
1038
+ # Replace final_text with repaired readable text
1039
+ if summary_source_text:
1040
+ final_text = summary_source_text
1041
+
1042
+ # Sanitize summary only
1043
+ summary = sanitize_summary_text(summary)
1044
 
1045
  payload = {
1046
  "uuid": uuid,