Spaces:

braillematesystem
/

ai-scanner

Running

App Files Files Community

braillematesystem committed on Mar 10

Commit

a3fe23b

verified ·

1 Parent(s): 8c334d2

Update main.py

Browse files

Files changed (1) hide show

main.py +454 -79

main.py CHANGED Viewed

@@ -70,17 +70,17 @@ CACHE_DIR.mkdir(parents=True, exist_ok=True)
 # =========================================================
 SUMMARY_ENABLED = os.getenv("SUMMARY_ENABLED", "true").lower() == "true"
 SUMMARY_MODEL = os.getenv("SUMMARY_MODEL", "weijiahaha/t5-small-summarization")
-SUMMARY_MAX_INPUT_CHARS = int(os.getenv("SUMMARY_MAX_INPUT_CHARS", "1200"))
-SUMMARY_MAX_NEW_TOKENS = int(os.getenv("SUMMARY_MAX_NEW_TOKENS", "48"))
 SUMMARY_MIN_TEXT_LEN = int(os.getenv("SUMMARY_MIN_TEXT_LEN", "80"))
-SUMMARY_NUM_BEAMS = int(os.getenv("SUMMARY_NUM_BEAMS", "2"))
-SUMMARY_MAX_CHARS = int(os.getenv("SUMMARY_MAX_CHARS", "220"))
 TORCH_NUM_THREADS = int(os.getenv("TORCH_NUM_THREADS", "1"))
 # =========================================================
 # APP
 # =========================================================
-app = FastAPI(title=APP_NAME, version="1.5.0")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -151,7 +151,7 @@ class ScanRequest(BaseModel):
     uuid: str
 # =========================================================
-# HELPERS
 # =========================================================
 def _now() -> float:
     return time.time()
@@ -277,6 +277,7 @@ def basic_cleanup(text: str) -> str:
     text = re.sub(r"\s+([,.;:!?])", r"\1", text)
     text = re.sub(r"([,.;:!?])([A-Za-z])", r"\1 \2", text)
     text = re.sub(r"[ \t]+", " ", text)
     return text.strip()
 def looks_bad(text: str, avg_conf: Optional[float]) -> bool:
@@ -290,6 +291,9 @@ def looks_bad(text: str, avg_conf: Optional[float]) -> bool:
     )
     return (weird / max(1, len(text))) > 0.20
 def get_paddle_ocr() -> PaddleOCR:
     global _OCR_PADDLE
     if _OCR_PADDLE is not None:
@@ -334,6 +338,9 @@ def paddle_ocr_extract(img: Image.Image) -> Tuple[str, float]:
     avg_conf = float(sum(confs) / len(confs)) if confs else 0.0
     return full_text, avg_conf
 def get_hf_corrector():
     global _HF_CORRECTOR
     if _HF_CORRECTOR is not None:
@@ -355,7 +362,7 @@ def hf_correct_text(text: str) -> str:
     return text
 # =========================================================
-# SUMMARY HELPERS
 # =========================================================
 def _truncate_text(text: str, max_chars: int) -> str:
     text = text.strip()
@@ -367,32 +374,246 @@ def _truncate_text(text: str, max_chars: int) -> str:
         cut = cut[:last_space]
     return cut.rstrip(" ,;:-") + "..."
-def prepare_text_for_summary(text: str) -> str:
-    text = text.replace("\r", "\n")
-    raw_lines = [ln.strip() for ln in text.split("\n")]
     raw_lines = [ln for ln in raw_lines if ln]
     if not raw_lines:
-        return ""
     merged_parts: List[str] = []
     buffer = ""
-    for line in raw_lines:
         if not buffer:
             buffer = line
             continue
         prev_end = buffer[-1] if buffer else ""
         should_join = True
         if prev_end in ".!?:":
             should_join = False
-        if len(buffer) < 20 and len(line) < 20:
             should_join = False
         if should_join:
             buffer = f"{buffer} {line}"
         else:
@@ -402,46 +623,152 @@ def prepare_text_for_summary(text: str) -> str:
     if buffer:
         merged_parts.append(buffer.strip())
-    text = " ".join(merged_parts)
-    text = re.sub(r"\s+", " ", text)
-    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
-    text = re.sub(r"([,.;:!?])([A-Za-z])", r"\1 \2", text)
-    text = re.sub(r"[.]{2,}", ".", text)
-    text = re.sub(r"[?]{2,}", "?", text)
-    text = re.sub(r"[!]{2,}", "!", text)
-    return text.strip()
-def _simple_summary_fallback(text: str, max_chars: int = 220) -> str:
-    text = prepare_text_for_summary(text)
-    text = basic_cleanup(text)
     if not text:
         return ""
-    lower = text.lower()
-    story_markers = [
-        "once upon", "rabbit", "tortoise", "lion", "mouse",
-        "crow", "fox", "race", "moral", "boasted", "won"
-    ]
-    story_hits = sum(1 for m in story_markers if m in lower)
-    if story_hits >= 2:
-        if "rabbit" in lower and "tortoise" in lower:
-            return "Slow and steady wins the race."
-        if "boast" in lower or "proud" in lower:
-            return "Overconfidence can lead to failure."
-        return "The story shows that patience and consistency can lead to success."
-    parts = re.split(r'(?<=[.!?])\s+', text)
-    parts = [p.strip() for p in parts if p.strip()]
-    for p in parts:
-        if len(p) >= 30 and len(re.findall(r"[A-Za-z]", p)) >= 15:
-            return _truncate_text(p, max_chars)
-    return _truncate_text(text, max_chars)
 def get_hf_summarizer():
     global _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
@@ -471,31 +798,64 @@ def get_hf_summarizer():
     print(f"[summary] model loaded: {SUMMARY_MODEL} on {device}")
     return _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
-def summarize_text(text: str) -> Tuple[str, str]:
-    text = basic_cleanup(text)
     if not SUMMARY_ENABLED:
-        return "", "disabled"
-    if not text:
-        return "", "empty"
-    prepared = prepare_text_for_summary(text)
-    if len(prepared) < SUMMARY_MIN_TEXT_LEN:
-        return prepared, "source_short"
-    source = _truncate_text(prepared, SUMMARY_MAX_INPUT_CHARS)
     lower = source.lower()
     looks_like_story = any(x in lower for x in [
-        "once upon", "rabbit", "tortoise", "lion", "mouse", "fox", "crow", "race"
     ])
     if looks_like_story:
-        prompt = f"summarize the story in one short clear sentence with the main lesson: {source}"
-    else:
-        prompt = f"summarize: {source}"
     try:
         tokenizer, model, torch = get_hf_summarizer()
@@ -513,7 +873,7 @@ def summarize_text(text: str) -> Tuple[str, str]:
         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
-                max_new_tokens=SUMMARY_MAX_NEW_TOKENS,
                 num_beams=max(2, SUMMARY_NUM_BEAMS),
                 do_sample=False,
                 early_stopping=True,
@@ -524,31 +884,39 @@ def summarize_text(text: str) -> Tuple[str, str]:
         summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
         summary = basic_cleanup(summary)
         summary = re.sub(r"^(summary|summarize|main idea|moral)\s*:\s*", "", summary, flags=re.I).strip()
-        if not summary:
-            return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_empty"
-        if len(summary) < 12:
-            return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_too_short"
-        alpha_count = len(re.findall(r"[A-Za-z]", summary))
-        word_count = len(re.findall(r"[A-Za-z']+", summary))
-        if alpha_count < 8 or word_count < 3:
-            return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_fragment"
-        if summary.lower() in prepared.lower() and len(summary) < 20:
-            return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_raw_fragment"
-        if summary.endswith((" the", " and", " or", " of", " to", " in", " a")):
-            return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), "fallback_incomplete"
-        summary = _truncate_text(summary, SUMMARY_MAX_CHARS)
-        return summary, "t5_small_summarization"
     except Exception as e:
         print(f"[summary] failed: {type(e).__name__}: {e}")
-        return _simple_summary_fallback(prepared, SUMMARY_MAX_CHARS), f"fallback:{type(e).__name__}"
 # =========================================================
 # STARTUP
@@ -665,7 +1033,14 @@ async def ocr_endpoint(
     if len(final_text) > MAX_TEXT_LEN:
         final_text = final_text[:MAX_TEXT_LEN]
-    summary, summary_method = summarize_text(final_text)
     payload = {
         "uuid": uuid,

 # =========================================================
 SUMMARY_ENABLED = os.getenv("SUMMARY_ENABLED", "true").lower() == "true"
 SUMMARY_MODEL = os.getenv("SUMMARY_MODEL", "weijiahaha/t5-small-summarization")
+SUMMARY_MAX_INPUT_CHARS = int(os.getenv("SUMMARY_MAX_INPUT_CHARS", "1600"))
+SUMMARY_MAX_NEW_TOKENS = int(os.getenv("SUMMARY_MAX_NEW_TOKENS", "72"))
 SUMMARY_MIN_TEXT_LEN = int(os.getenv("SUMMARY_MIN_TEXT_LEN", "80"))
+SUMMARY_NUM_BEAMS = int(os.getenv("SUMMARY_NUM_BEAMS", "3"))
+SUMMARY_MAX_CHARS = int(os.getenv("SUMMARY_MAX_CHARS", "260"))
 TORCH_NUM_THREADS = int(os.getenv("TORCH_NUM_THREADS", "1"))
 # =========================================================
 # APP
 # =========================================================
+app = FastAPI(title=APP_NAME, version="1.11.0")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     uuid: str
 # =========================================================
+# GENERAL HELPERS
 # =========================================================
 def _now() -> float:
     return time.time()
     text = re.sub(r"\s+([,.;:!?])", r"\1", text)
     text = re.sub(r"([,.;:!?])([A-Za-z])", r"\1 \2", text)
     text = re.sub(r"[ \t]+", " ", text)
+    text = text.replace("“", '"').replace("”", '"').replace("’", "'").replace("‘", "'")
     return text.strip()
 def looks_bad(text: str, avg_conf: Optional[float]) -> bool:
     )
     return (weird / max(1, len(text))) > 0.20
+# =========================================================
+# OCR
+# =========================================================
 def get_paddle_ocr() -> PaddleOCR:
     global _OCR_PADDLE
     if _OCR_PADDLE is not None:
     avg_conf = float(sum(confs) / len(confs)) if confs else 0.0
     return full_text, avg_conf
+# =========================================================
+# OPTIONAL HF TEXT CORRECTION
+# =========================================================
 def get_hf_corrector():
     global _HF_CORRECTOR
     if _HF_CORRECTOR is not None:
     return text
 # =========================================================
+# SUMMARY REPAIR / NORMALIZATION
 # =========================================================
 def _truncate_text(text: str, max_chars: int) -> str:
     text = text.strip()
         cut = cut[:last_space]
     return cut.rstrip(" ,;:-") + "..."
+def _sentence_split(text: str) -> List[str]:
+    parts = re.split(r'(?<=[.!?])\s+', text)
+    return [p.strip() for p in parts if p.strip()]
+def _looks_like_title(line: str) -> bool:
+    s = line.strip()
+    if not s:
+        return False
+    if len(s) > 90:
+        return False
+    if re.fullmatch(r"[0-9]+", s):
+        return False
+    words = re.findall(r"[A-Za-z][A-Za-z'-]*", s)
+    if not words:
+        return False
+    titleish = sum(1 for w in words if w[:1].isupper())
+    ratio = titleish / max(1, len(words))
+    return ratio >= 0.6 and len(words) <= 10
def _clean_title(line: str) -> str:
    """Normalise a heading-like line.

    Runs ``basic_cleanup`` on the line, then removes a leading run of
    digits (with any following whitespace) and trailing periods, stripping
    after each step. Finally collapses runs of two or more whitespace
    characters into a single space.
    """
    cleaned = basic_cleanup(line)
    for pattern in (r"^[0-9]+\s*", r"\.+$"):
        cleaned = re.sub(pattern, "", cleaned).strip()
    return re.sub(r"\s{2,}", " ", cleaned)
def _extract_moral(text: str) -> Optional[str]:
    """Return the first sentence that follows a "moral" marker, if any.

    Two markers are tried in order, both case-insensitively:

    1. ``moral of the story`` (optionally followed by a period), after which
       leading ``- : * " '`` characters are stripped from the result;
    2. ``moral:`` / ``moral-``.

    The text after the marker is cut at the first sentence boundary
    (whitespace following ``.``, ``!`` or ``?``).

    The patterns are matched with ``re.S`` so the text after the marker may
    span line breaks. Without it, ``.+$`` only matched when the marker sat on
    the very last line, so a moral followed by a page number or footer line,
    or wrapped across two lines, was silently missed whenever the cleaned
    text still contained newlines.

    Returns ``None`` when no marker is found or the extracted text is empty.
    """
    cleaned = basic_cleanup(text)

    # (pattern, strip leading punctuation from the captured text?)
    markers = (
        (r"moral\s+of\s+the\s+story\.?\s*(.+)$", True),
        (r"\bmoral\s*[:\-]\s*(.+)$", False),
    )
    for pattern, strip_leading_punct in markers:
        match = re.search(pattern, cleaned, flags=re.I | re.S)
        if not match:
            continue
        moral = match.group(1).strip()
        moral = re.split(r'(?<=[.!?])\s+', moral)[0].strip()
        if strip_leading_punct:
            moral = re.sub(r"^[\-\:\*\"\']+", "", moral).strip()
        # A wrapped moral may still contain a line break; flatten it.
        moral = re.sub(r"\s+", " ", moral).strip()
        if moral:
            return moral
    return None
def _extract_title_and_context_lines(text: str) -> Tuple[Optional[str], Optional[str], List[str]]:
    """Split OCR text into ``(title, context_line, body_lines)``.

    * ``title``: up to three consecutive heading-like lines (per
      ``_looks_like_title``) taken from the first six non-empty lines, with
      purely numeric lines ignored. The lines are cleaned with
      ``_clean_title`` and joined by spaces. ``None`` when nothing qualifies.
    * ``context_line``: only looked for when a title exists. It is the first
      of the next two candidate lines that is at most 60 characters long and
      contains a month name, a four-digit number, a comma or a hyphen.
    * ``body_lines``: the stripped non-empty input lines with the leading
      header removed.

    Header removal drops leading lines only while they are empty after
    ``_clean_title`` (e.g. bare page numbers) or equal to one of the detected
    title lines. The previous rule dropped any leading line that merely
    *contained* one of the first two title words as a substring; for a title
    starting with "The" that deleted ordinary body sentences.
    """
    raw_lines = [ln.strip() for ln in text.replace("\r", "\n").split("\n")]
    raw_lines = [ln for ln in raw_lines if ln]
    if not raw_lines:
        return None, None, []

    title: Optional[str] = None
    context_line: Optional[str] = None
    title_parts: List[str] = []

    # Only the top of the page is inspected; bare numbers are skipped.
    head = [ln for ln in raw_lines[:6] if not re.fullmatch(r"[0-9]+", ln)]
    for ln in head[:3]:
        if not _looks_like_title(ln):
            break
        title_parts.append(_clean_title(ln))

    if title_parts:
        joined = " ".join(p for p in title_parts if p)
        # Normalise an all-empty join to None; callers only test truthiness.
        title = re.sub(r"\s+", " ", joined).strip() or None

    if title:
        for ln in head[len(title_parts):len(title_parts) + 2]:
            c = _clean_title(ln)
            if len(c) <= 60 and (
                re.search(r"\b(january|february|march|april|may|june|july|august|september|october|november|december)\b", c, re.I)
                or re.search(r"\b\d{4}\b", c)
                or "," in c
                or "-" in c
            ):
                context_line = c
                break

    remove_count = 0
    if title:
        title_keys = {p.lower() for p in title_parts if p}
        while remove_count < len(raw_lines):
            candidate = _clean_title(raw_lines[remove_count]).lower()
            # Stop at the first line that is neither blank-after-cleaning
            # nor one of the lines the title was built from.
            if candidate and candidate not in title_keys:
                break
            remove_count += 1

    if context_line and remove_count < len(raw_lines):
        if _clean_title(raw_lines[remove_count]).lower() == context_line.lower():
            remove_count += 1

    return title, context_line, raw_lines[remove_count:]
+def _extract_person_name(text: str) -> Optional[str]:
+    if not text:
+        return None
+    text = re.sub(r"\barren Buffett\b", "Warren Buffett", text)
+    text = re.sub(r"\barren\b", "Warren", text)
+    patterns = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})\b", text)
+    blacklist = {
+        "The Less", "Long Term", "Capital Management", "Coca Cola",
+        "Howard Plain", "Wells Fargo", "Pulitzer Prize"
+    }
+    for p in patterns:
+        if p not in blacklist:
+            return p
+    return None
+def _looks_like_biography_or_profile(text: str) -> bool:
+    lower = text.lower()
+    markers = [
+        "office", "headquarters", "photographs", "memorabilia",
+        "appearance", "chair", "desk", "eyebrow", "glasses",
+        "shirt", "suit jacket", "surrounded by", "buffett",
+        "berkshire", "omaha"
+    ]
+    return sum(1 for m in markers if m in lower) >= 3
+def _light_ocr_word_fixes(text: str) -> str:
+    """
+    Very lightweight OCR cleanup for summary source only.
+    Keep this small and safe.
+    """
+    fixes = {
+        r"\barren Buffett\b": "Warren Buffett",
+        r"\barren\b": "Warren",
+        r"\btion\b": "lion",
+        r"\ba tion\b": "a lion",
+        r"\brabblt\b": "rabbit",
+        r"\btortoiso\b": "tortoise",
+        r"\btme\b": "time",
+        r"\bwnole\b": "whole",
+        r"\bwoko\b": "woke",
+        r"\bho saw\b": "he saw",
+        r"\bseep\b": "sleep",
+        r"\bIarge\b": "large",
+        r"\bIace\b": "lace",
+        r"\bbascball\b": "baseball",
+        r"\bfinishlng\b": "finishing",
+        r"\bgocd\b": "good",
+        r"\bortoise ks\b": "tortoise is",
+        r"\blen\b": "left",
+        r"\bandd\b": "and",
+        r"\bselfassurance\b": "self assurance",
+        r"\bthe'mouse's\b": "the mouse's",
+        r"\bin\.\s+distress\b": "in distress",
+        r"\bThey had\.\s+him\b": "They had him",
+        r"\blet him\.\s+go\b": "let him go",
+        r"\bThe Lion andd\b": "The Lion and the Mouse",
+        r"\bThe Mouse\b": "the mouse",
+        r"\bMean while\b": "Meanwhile",
+        r"\bA tortoise\b": "a tortoise",
+        r"\bRabbit and a tortoise\b": "rabbit and a tortoise",
+    }
+    out = text
+    for pat, repl in fixes.items():
+        out = re.sub(pat, repl, out, flags=re.I)
+    out = re.sub(r"\b([A-Z][a-z]{2,})\.\s+([A-Z][a-z]{2,})\b", r"\1 \2", out)
+    out = re.sub(r"\s+", " ", out).strip()
+    return out
+def repair_text_for_summary(text: str) -> Dict[str, Optional[str]]:
+    """
+    Repairs OCR text for summarization without changing OCR extraction behavior.
+    """
+    text = basic_cleanup(text)
+    title, context_line, body_lines = _extract_title_and_context_lines(text)
+    if not body_lines and text:
+        body_lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
+    repaired_lines: List[str] = []
+    i = 0
+    while i < len(body_lines):
+        line = basic_cleanup(body_lines[i])
+        if line.endswith("-") and i + 1 < len(body_lines):
+            nxt = basic_cleanup(body_lines[i + 1])
+            line = line[:-1] + nxt
+            repaired_lines.append(line)
+            i += 2
+            continue
+        repaired_lines.append(line)
+        i += 1
     merged_parts: List[str] = []
     buffer = ""
+    for line in repaired_lines:
+        if not line:
+            continue
+        line = re.sub(r"\s+", " ", line).strip()
+        line = re.sub(r"^[\*\•\-\_]+\s*", "", line)
         if not buffer:
             buffer = line
             continue
         prev_end = buffer[-1] if buffer else ""
+        starts_lower = bool(re.match(r"^[a-z]", line))
+        starts_common = bool(re.match(r"^(and|but|or|so|then|when|while|because|if|that|who|which|where)\b", line, re.I))
+        starts_short = len(line.split()) <= 4
         should_join = True
         if prev_end in ".!?:":
             should_join = False
+        if _looks_like_title(line):
             should_join = False
+        if starts_lower or starts_common:
+            should_join = True
+        if starts_short and prev_end not in ".!?":
+            should_join = True
         if should_join:
             buffer = f"{buffer} {line}"
         else:
     if buffer:
         merged_parts.append(buffer.strip())
+    repaired_text = " ".join(merged_parts)
+    repaired_text = repaired_text.replace("..", ".")
+    repaired_text = repaired_text.replace(" .", ".")
+    repaired_text = repaired_text.replace(" ,", ",")
+    repaired_text = repaired_text.replace(" ;", ";")
+    repaired_text = repaired_text.replace(" :", ":")
+    repaired_text = repaired_text.replace(" !", "!")
+    repaired_text = repaired_text.replace(" ?", "?")
+    repaired_text = re.sub(r"([A-Za-z])'at'([A-Za-z])", r"\1 \2", repaired_text)
+    repaired_text = re.sub(r"([A-Za-z])'([A-Za-z])", r"\1'\2", repaired_text)
+    repaired_text = re.sub(r"\s+", " ", repaired_text).strip()
+    repaired_text = re.sub(r"[.]{2,}", ".", repaired_text)
+    repaired_text = re.sub(r"[?]{2,}", "?", repaired_text)
+    repaired_text = re.sub(r"[!]{2,}", "!", repaired_text)
+    repaired_text = _light_ocr_word_fixes(repaired_text)
+    moral = _extract_moral(text)
+    repaired_no_moral = repaired_text
+    repaired_no_moral = re.sub(r"moral\s+of\s+the\s+story\.?\s*.*$", "", repaired_no_moral, flags=re.I).strip()
+    repaired_no_moral = re.sub(r"\bmoral\s*[:\-]\s*.*$", "", repaired_no_moral, flags=re.I).strip()
+    lead_sentence = None
+    for sent in _sentence_split(repaired_no_moral or repaired_text):
+        if len(sent) >= 25 and len(re.findall(r"[A-Za-z]", sent)) >= 12:
+            lead_sentence = sent
+            break
+    return {
+        "title": title,
+        "context_line": context_line,
+        "moral": moral,
+        "repaired_text": repaired_no_moral or repaired_text,
+        "lead_sentence": lead_sentence,
+    }
def sanitize_summary_text(text: str) -> str:
    """Reduce a summary to ASCII letters, digits and single spaces.

    Curly quotes are first mapped to straight ones, then every character
    that is not ``A-Z``, ``a-z``, ``0-9`` or whitespace becomes a space, and
    whitespace runs are collapsed. Falsy input yields an empty string.
    """
    if not text:
        return ""

    for curly, straight in (("’", "'"), ("‘", "'"), ("“", '"'), ("”", '"')):
        text = text.replace(curly, straight)

    letters_digits_only = re.sub(r"[^A-Za-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", letters_digits_only).strip()
+def _is_bad_model_summary(summary: str, repaired_text: str) -> Optional[str]:
+    if not summary:
+        return "empty"
+    if len(summary) < 18:
+        return "too_short"
+    alpha_count = len(re.findall(r"[A-Za-z]", summary))
+    word_count = len(re.findall(r"[A-Za-z']+", summary))
+    if alpha_count < 10 or word_count < 4:
+        return "fragment"
+    if re.search(r"\b(the|and|or|of|to|in|a)$", summary.strip(), re.I):
+        return "incomplete"
+    if summary.lower() in repaired_text.lower() and len(summary) < 40:
+        return "raw_fragment"
+    bad_tokens = ["tion was", "rabblt", "tortoiso", "wnole", "woko", "len the", "andd"]
+    if any(tok in summary.lower() for tok in bad_tokens):
+        return "ocr_noise"
+    return None
def _story_fallback(title: Optional[str], moral: Optional[str], repaired_text: str, max_chars: int) -> str:
    """Build a rule-based summary for story-like text.

    Two fables are recognised by keyword pairs and receive a canned plot
    line plus either the extracted *moral* or a default one. Any other text
    is summarised by its first two sentences, with the moral appended when
    it is not already present. A non-empty *title* is prefixed in every
    case, and the result is shortened with ``_truncate_text``.
    """
    lowered = repaired_text.lower()

    # (keywords that must all appear, canned plot, default moral suffix)
    known_fables = (
        (
            ("lion", "mouse"),
            "A lion spared a mouse and later the mouse freed the lion from hunters",
            " Moral kindness can be repaid",
        ),
        (
            ("rabbit", "tortoise"),
            "A fast rabbit lost a race to a steady tortoise after stopping to rest",
            " Moral slow and steady wins the race",
        ),
    )
    for keywords, plot, default_moral in known_fables:
        if all(word in lowered for word in keywords):
            summary = plot + (f" Moral {moral}" if moral else default_moral)
            if title:
                summary = f"{title} {summary}"
            return _truncate_text(summary, max_chars)

    summary = " ".join(_sentence_split(repaired_text)[:2]).strip()
    if moral and moral.lower() not in summary.lower():
        summary = f"{summary} Moral {moral}"
    if title:
        summary = f"{title} {summary}"
    return _truncate_text(summary, max_chars)
def _structured_summary_fallback(title: Optional[str], context_line: Optional[str], repaired_text: str, moral: Optional[str], max_chars: int) -> str:
    """Build a rule-based summary for noisy non-story text.

    When the text looks like a profile and a person name can be extracted,
    the result is the title and context line (when present) followed by a
    fixed description sentence about that person. Otherwise it is the title,
    context line, the first sentence with at least 30 characters and 15
    letters, and the moral, each included only when available. The result is
    shortened with ``_truncate_text``.
    """
    heading = [piece for piece in (title, context_line) if piece]
    lowered = repaired_text.lower()

    person = _extract_person_name(repaired_text)
    if _looks_like_biography_or_profile(repaired_text) and person:
        core = f"{person} is described through his appearance manner and surroundings"
        if "office" in lowered or "desk" in lowered:
            core += " in his office"
        header = " ".join(heading).strip()
        result = f"{header} {core}".strip() if header else core
        return _truncate_text(result, max_chars)

    lead = next(
        (
            sent
            for sent in _sentence_split(repaired_text)
            if len(sent) >= 30 and len(re.findall(r"[A-Za-z]", sent)) >= 15
        ),
        None,
    )
    pieces = list(heading)
    if lead:
        pieces.append(lead)
    if moral:
        pieces.append(f"Moral {moral}")
    return _truncate_text(" ".join(pieces).strip(), max_chars)
+# =========================================================
+# SUMMARIZER MODEL
+# =========================================================
 def get_hf_summarizer():
     global _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
     print(f"[summary] model loaded: {SUMMARY_MODEL} on {device}")
     return _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
+def summarize_text(text: str) -> Tuple[str, str, str]:
+    """
+    Returns:
+      summary, summary_method, repaired_text_used_for_summary
+    """
     if not SUMMARY_ENABLED:
+        return "", "disabled", ""
+    if not text.strip():
+        return "", "empty", ""
+    repaired = repair_text_for_summary(text)
+    title = repaired["title"]
+    context_line = repaired["context_line"]
+    moral = repaired["moral"]
+    repaired_text = repaired["repaired_text"] or ""
+    if not repaired_text:
+        return "", "empty_repaired", ""
+    # stronger post-repair cleanup
+    repaired_text = _light_ocr_word_fixes(repaired_text)
+    repaired_text = basic_cleanup(repaired_text)
+    # infer title if missing and story entities obvious
+    lower_full = repaired_text.lower()
+    if not title:
+        if "lion" in lower_full and "mouse" in lower_full:
+            title = "The Lion and the Mouse"
+        elif "rabbit" in lower_full and "tortoise" in lower_full:
+            title = "The Rabbit and the Tortoise"
+    source = _truncate_text(repaired_text, SUMMARY_MAX_INPUT_CHARS)
     lower = source.lower()
     looks_like_story = any(x in lower for x in [
+        "once upon", "rabbit", "tortoise", "lion", "mouse",
+        "fox", "crow", "race", "hunters", "jungle", "forest"
     ])
+    looks_like_profile = _looks_like_biography_or_profile(source)
+    # 1) biography/profile -> structured fallback first
+    if looks_like_profile:
+        summary = _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS)
+        return summary, "structured_profile_fallback", repaired_text
+    # 2) story/fable -> ALWAYS use structured story fallback
     if looks_like_story:
+        summary = _story_fallback(title, moral, repaired_text, SUMMARY_MAX_CHARS)
+        return summary, "structured_story_fallback", repaired_text
+    # 3) short non-story -> fallback
+    if len(source) < SUMMARY_MIN_TEXT_LEN:
+        return _structured_summary_fallback(title, context_line, source, moral, SUMMARY_MAX_CHARS), "fallback_short", repaired_text
+    # 4) model only for non-story text
+    prompt = f"summarize: {source}"
     try:
         tokenizer, model, torch = get_hf_summarizer()
         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
+                max_new_tokens=max(40, SUMMARY_MAX_NEW_TOKENS),
                 num_beams=max(2, SUMMARY_NUM_BEAMS),
                 do_sample=False,
                 early_stopping=True,
         summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
         summary = basic_cleanup(summary)
+        summary = _light_ocr_word_fixes(summary)
         summary = re.sub(r"^(summary|summarize|main idea|moral)\s*:\s*", "", summary, flags=re.I).strip()
+        # reject prompt echo junk
+        prompt_echo_markers = [
+            "the story in one short clear sentence",
+            "if there is a lesson include it briefly",
+            "summarize this story",
+            "summarize this text",
+        ]
+        if any(m in summary.lower() for m in prompt_echo_markers):
+            return _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS), "fallback_prompt_echo", repaired_text
+        bad_reason = _is_bad_model_summary(summary, repaired_text)
+        if bad_reason:
+            return _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS), f"fallback_generic_{bad_reason}", repaired_text
+        enriched = summary
+        if title and title.lower() not in enriched.lower():
+            if context_line and context_line.lower() not in enriched.lower():
+                enriched = f"{title} {context_line} {enriched}"
+            else:
+                enriched = f"{title} {enriched}"
+        elif context_line and context_line.lower() not in enriched.lower():
+            enriched = f"{context_line} {enriched}"
+        enriched = _truncate_text(enriched, SUMMARY_MAX_CHARS)
+        return enriched, "t5_small_summarization_repaired", repaired_text
     except Exception as e:
         print(f"[summary] failed: {type(e).__name__}: {e}")
+        return _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS), f"fallback_generic:{type(e).__name__}", repaired_text
 # =========================================================
 # STARTUP
     if len(final_text) > MAX_TEXT_LEN:
         final_text = final_text[:MAX_TEXT_LEN]
+    summary, summary_method, summary_source_text = summarize_text(final_text)
+    # Replace final_text with repaired readable text
+    if summary_source_text:
+        final_text = summary_source_text
+    # Sanitize summary only
+    summary = sanitize_summary_text(summary)
     payload = {
         "uuid": uuid,