Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -70,17 +70,17 @@ CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
|
| 70 |
# =========================================================
|
| 71 |
SUMMARY_ENABLED = os.getenv("SUMMARY_ENABLED", "true").lower() == "true"
|
| 72 |
SUMMARY_MODEL = os.getenv("SUMMARY_MODEL", "weijiahaha/t5-small-summarization")
|
| 73 |
-
SUMMARY_MAX_INPUT_CHARS = int(os.getenv("SUMMARY_MAX_INPUT_CHARS", "
|
| 74 |
-
SUMMARY_MAX_NEW_TOKENS = int(os.getenv("SUMMARY_MAX_NEW_TOKENS", "
|
| 75 |
SUMMARY_MIN_TEXT_LEN = int(os.getenv("SUMMARY_MIN_TEXT_LEN", "80"))
|
| 76 |
-
SUMMARY_NUM_BEAMS = int(os.getenv("SUMMARY_NUM_BEAMS", "
|
| 77 |
-
SUMMARY_MAX_CHARS = int(os.getenv("SUMMARY_MAX_CHARS", "
|
| 78 |
TORCH_NUM_THREADS = int(os.getenv("TORCH_NUM_THREADS", "1"))
|
| 79 |
|
| 80 |
# =========================================================
|
| 81 |
# APP
|
| 82 |
# =========================================================
|
| 83 |
-
app = FastAPI(title=APP_NAME, version="1.
|
| 84 |
app.add_middleware(
|
| 85 |
CORSMiddleware,
|
| 86 |
allow_origins=["*"],
|
|
@@ -151,7 +151,7 @@ class ScanRequest(BaseModel):
|
|
| 151 |
uuid: str
|
| 152 |
|
| 153 |
# =========================================================
|
| 154 |
-
# HELPERS
|
| 155 |
# =========================================================
|
| 156 |
def _now() -> float:
|
| 157 |
return time.time()
|
|
@@ -277,6 +277,7 @@ def basic_cleanup(text: str) -> str:
|
|
| 277 |
text = re.sub(r"\s+([,.;:!?])", r"\1", text)
|
| 278 |
text = re.sub(r"([,.;:!?])([A-Za-z])", r"\1 \2", text)
|
| 279 |
text = re.sub(r"[ \t]+", " ", text)
|
|
|
|
| 280 |
return text.strip()
|
| 281 |
|
| 282 |
def looks_bad(text: str, avg_conf: Optional[float]) -> bool:
|
|
@@ -290,6 +291,9 @@ def looks_bad(text: str, avg_conf: Optional[float]) -> bool:
|
|
| 290 |
)
|
| 291 |
return (weird / max(1, len(text))) > 0.20
|
| 292 |
|
|
|
|
|
|
|
|
|
|
| 293 |
def get_paddle_ocr() -> PaddleOCR:
|
| 294 |
global _OCR_PADDLE
|
| 295 |
if _OCR_PADDLE is not None:
|
|
@@ -334,6 +338,9 @@ def paddle_ocr_extract(img: Image.Image) -> Tuple[str, float]:
|
|
| 334 |
avg_conf = float(sum(confs) / len(confs)) if confs else 0.0
|
| 335 |
return full_text, avg_conf
|
| 336 |
|
|
|
|
|
|
|
|
|
|
| 337 |
def get_hf_corrector():
|
| 338 |
global _HF_CORRECTOR
|
| 339 |
if _HF_CORRECTOR is not None:
|
|
@@ -355,7 +362,7 @@ def hf_correct_text(text: str) -> str:
|
|
| 355 |
return text
|
| 356 |
|
| 357 |
# =========================================================
|
| 358 |
-
# SUMMARY
|
| 359 |
# =========================================================
|
| 360 |
def _truncate_text(text: str, max_chars: int) -> str:
|
| 361 |
text = text.strip()
|
|
@@ -367,32 +374,246 @@ def _truncate_text(text: str, max_chars: int) -> str:
|
|
| 367 |
cut = cut[:last_space]
|
| 368 |
return cut.rstrip(" ,;:-") + "..."
|
| 369 |
|
| 370 |
-
def
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
-
|
|
|
|
| 374 |
raw_lines = [ln for ln in raw_lines if ln]
|
| 375 |
|
| 376 |
if not raw_lines:
|
| 377 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
|
| 379 |
merged_parts: List[str] = []
|
| 380 |
buffer = ""
|
| 381 |
|
| 382 |
-
for line in
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
if not buffer:
|
| 384 |
buffer = line
|
| 385 |
continue
|
| 386 |
|
| 387 |
prev_end = buffer[-1] if buffer else ""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
should_join = True
|
| 389 |
|
| 390 |
if prev_end in ".!?:":
|
| 391 |
should_join = False
|
| 392 |
|
| 393 |
-
if
|
| 394 |
should_join = False
|
| 395 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
if should_join:
|
| 397 |
buffer = f"{buffer} {line}"
|
| 398 |
else:
|
|
@@ -402,46 +623,152 @@ def prepare_text_for_summary(text: str) -> str:
|
|
| 402 |
if buffer:
|
| 403 |
merged_parts.append(buffer.strip())
|
| 404 |
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
-
return
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
|
|
|
|
|
|
| 418 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
if not text:
|
| 420 |
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
story_hits = sum(1 for m in story_markers if m in lower)
|
| 428 |
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
return "Overconfidence can lead to failure."
|
| 434 |
-
return "The story shows that patience and consistency can lead to success."
|
| 435 |
|
| 436 |
-
|
| 437 |
-
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
|
|
|
|
|
|
| 442 |
|
| 443 |
-
|
|
|
|
| 444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
def get_hf_summarizer():
|
| 446 |
global _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
|
| 447 |
|
|
@@ -471,31 +798,64 @@ def get_hf_summarizer():
|
|
| 471 |
print(f"[summary] model loaded: {SUMMARY_MODEL} on {device}")
|
| 472 |
return _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
|
| 473 |
|
| 474 |
-
def summarize_text(text: str) -> Tuple[str, str]:
|
| 475 |
-
|
| 476 |
-
|
|
|
|
|
|
|
| 477 |
if not SUMMARY_ENABLED:
|
| 478 |
-
return "", "disabled"
|
| 479 |
|
| 480 |
-
if not text:
|
| 481 |
-
return "", "empty"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
-
|
|
|
|
| 484 |
|
| 485 |
-
|
| 486 |
-
|
|
|
|
| 487 |
|
| 488 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
|
|
|
|
| 490 |
lower = source.lower()
|
|
|
|
| 491 |
looks_like_story = any(x in lower for x in [
|
| 492 |
-
"once upon", "rabbit", "tortoise", "lion", "mouse",
|
|
|
|
| 493 |
])
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
if looks_like_story:
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
try:
|
| 501 |
tokenizer, model, torch = get_hf_summarizer()
|
|
@@ -513,7 +873,7 @@ def summarize_text(text: str) -> Tuple[str, str]:
|
|
| 513 |
with torch.no_grad():
|
| 514 |
output_ids = model.generate(
|
| 515 |
**inputs,
|
| 516 |
-
max_new_tokens=SUMMARY_MAX_NEW_TOKENS,
|
| 517 |
num_beams=max(2, SUMMARY_NUM_BEAMS),
|
| 518 |
do_sample=False,
|
| 519 |
early_stopping=True,
|
|
@@ -524,31 +884,39 @@ def summarize_text(text: str) -> Tuple[str, str]:
|
|
| 524 |
|
| 525 |
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 526 |
summary = basic_cleanup(summary)
|
|
|
|
| 527 |
summary = re.sub(r"^(summary|summarize|main idea|moral)\s*:\s*", "", summary, flags=re.I).strip()
|
| 528 |
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
if
|
| 541 |
-
return
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
|
| 549 |
except Exception as e:
|
| 550 |
print(f"[summary] failed: {type(e).__name__}: {e}")
|
| 551 |
-
return
|
| 552 |
|
| 553 |
# =========================================================
|
| 554 |
# STARTUP
|
|
@@ -665,7 +1033,14 @@ async def ocr_endpoint(
|
|
| 665 |
if len(final_text) > MAX_TEXT_LEN:
|
| 666 |
final_text = final_text[:MAX_TEXT_LEN]
|
| 667 |
|
| 668 |
-
summary, summary_method = summarize_text(final_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 669 |
|
| 670 |
payload = {
|
| 671 |
"uuid": uuid,
|
|
|
|
| 70 |
# =========================================================
|
| 71 |
SUMMARY_ENABLED = os.getenv("SUMMARY_ENABLED", "true").lower() == "true"
|
| 72 |
SUMMARY_MODEL = os.getenv("SUMMARY_MODEL", "weijiahaha/t5-small-summarization")
|
| 73 |
+
SUMMARY_MAX_INPUT_CHARS = int(os.getenv("SUMMARY_MAX_INPUT_CHARS", "1600"))
|
| 74 |
+
SUMMARY_MAX_NEW_TOKENS = int(os.getenv("SUMMARY_MAX_NEW_TOKENS", "72"))
|
| 75 |
SUMMARY_MIN_TEXT_LEN = int(os.getenv("SUMMARY_MIN_TEXT_LEN", "80"))
|
| 76 |
+
SUMMARY_NUM_BEAMS = int(os.getenv("SUMMARY_NUM_BEAMS", "3"))
|
| 77 |
+
SUMMARY_MAX_CHARS = int(os.getenv("SUMMARY_MAX_CHARS", "260"))
|
| 78 |
TORCH_NUM_THREADS = int(os.getenv("TORCH_NUM_THREADS", "1"))
|
| 79 |
|
| 80 |
# =========================================================
|
| 81 |
# APP
|
| 82 |
# =========================================================
|
| 83 |
+
app = FastAPI(title=APP_NAME, version="1.11.0")
|
| 84 |
app.add_middleware(
|
| 85 |
CORSMiddleware,
|
| 86 |
allow_origins=["*"],
|
|
|
|
| 151 |
uuid: str
|
| 152 |
|
| 153 |
# =========================================================
|
| 154 |
+
# GENERAL HELPERS
|
| 155 |
# =========================================================
|
| 156 |
def _now() -> float:
|
| 157 |
return time.time()
|
|
|
|
| 277 |
text = re.sub(r"\s+([,.;:!?])", r"\1", text)
|
| 278 |
text = re.sub(r"([,.;:!?])([A-Za-z])", r"\1 \2", text)
|
| 279 |
text = re.sub(r"[ \t]+", " ", text)
|
| 280 |
+
text = text.replace("“", '"').replace("”", '"').replace("’", "'").replace("‘", "'")
|
| 281 |
return text.strip()
|
| 282 |
|
| 283 |
def looks_bad(text: str, avg_conf: Optional[float]) -> bool:
|
|
|
|
| 291 |
)
|
| 292 |
return (weird / max(1, len(text))) > 0.20
|
| 293 |
|
| 294 |
+
# =========================================================
|
| 295 |
+
# OCR
|
| 296 |
+
# =========================================================
|
| 297 |
def get_paddle_ocr() -> PaddleOCR:
|
| 298 |
global _OCR_PADDLE
|
| 299 |
if _OCR_PADDLE is not None:
|
|
|
|
| 338 |
avg_conf = float(sum(confs) / len(confs)) if confs else 0.0
|
| 339 |
return full_text, avg_conf
|
| 340 |
|
| 341 |
+
# =========================================================
|
| 342 |
+
# OPTIONAL HF TEXT CORRECTION
|
| 343 |
+
# =========================================================
|
| 344 |
def get_hf_corrector():
|
| 345 |
global _HF_CORRECTOR
|
| 346 |
if _HF_CORRECTOR is not None:
|
|
|
|
| 362 |
return text
|
| 363 |
|
| 364 |
# =========================================================
|
| 365 |
+
# SUMMARY REPAIR / NORMALIZATION
|
| 366 |
# =========================================================
|
| 367 |
def _truncate_text(text: str, max_chars: int) -> str:
|
| 368 |
text = text.strip()
|
|
|
|
| 374 |
cut = cut[:last_space]
|
| 375 |
return cut.rstrip(" ,;:-") + "..."
|
| 376 |
|
| 377 |
+
def _sentence_split(text: str) -> List[str]:
|
| 378 |
+
parts = re.split(r'(?<=[.!?])\s+', text)
|
| 379 |
+
return [p.strip() for p in parts if p.strip()]
|
| 380 |
+
|
| 381 |
+
def _looks_like_title(line: str) -> bool:
|
| 382 |
+
s = line.strip()
|
| 383 |
+
if not s:
|
| 384 |
+
return False
|
| 385 |
+
if len(s) > 90:
|
| 386 |
+
return False
|
| 387 |
+
if re.fullmatch(r"[0-9]+", s):
|
| 388 |
+
return False
|
| 389 |
+
|
| 390 |
+
words = re.findall(r"[A-Za-z][A-Za-z'-]*", s)
|
| 391 |
+
if not words:
|
| 392 |
+
return False
|
| 393 |
+
|
| 394 |
+
titleish = sum(1 for w in words if w[:1].isupper())
|
| 395 |
+
ratio = titleish / max(1, len(words))
|
| 396 |
+
return ratio >= 0.6 and len(words) <= 10
|
| 397 |
+
|
| 398 |
+
def _clean_title(line: str) -> str:
    """
    Normalise a candidate title line.

    Runs basic_cleanup, then drops a leading number (e.g. a page or chapter
    number), drops trailing periods, and collapses runs of whitespace.
    """
    cleaned = basic_cleanup(line)
    # Leading digits first, trailing dots second; trim after each step.
    for pattern in (r"^[0-9]+\s*", r"\.+$"):
        cleaned = re.sub(pattern, "", cleaned).strip()
    return re.sub(r"\s{2,}", " ", cleaned)
|
| 404 |
+
|
| 405 |
+
def _extract_moral(text: str) -> Optional[str]:
    """
    Pull the moral sentence out of a story, if one is announced.

    Two markers are recognised, in this order:
      1. "moral of the story" (optionally followed by a period)
      2. "moral:" / "moral -"

    The first sentence after the marker is returned with leading
    punctuation and quotes removed, or None when no marker is found.
    """
    # Flatten line breaks first: '.' does not cross newlines and '$' only
    # anchors at the end of the string, so on multi-line OCR text a moral
    # that is line-wrapped or followed by further lines would never match.
    flat = re.sub(r"\s+", " ", basic_cleanup(text)).strip()

    marker_patterns = (
        r"moral\s+of\s+the\s+story\.?\s*(.+)$",
        r"\bmoral\s*[:\-]\s*(.+)$",
    )

    for pattern in marker_patterns:
        found = re.search(pattern, flat, flags=re.I)
        if not found:
            continue
        # Keep only the first sentence that follows the marker.
        moral = re.split(r'(?<=[.!?])\s+', found.group(1).strip())[0].strip()
        # Same leading-punctuation cleanup for both markers.
        moral = re.sub(r"^[\-\:\*\"\']+", "", moral).strip()
        if moral:
            return moral

    return None
|
| 424 |
|
| 425 |
+
def _extract_title_and_context_lines(text: str) -> Tuple[Optional[str], Optional[str], List[str]]:
    """
    Split OCR text into a title, an optional context line and the body lines.

    The title is built from up to three consecutive leading lines (taken from
    the first six lines, ignoring purely numeric ones) that pass
    _looks_like_title. The context line is one of the two lines following the
    title that is at most 60 characters long and mentions a month name or a
    four-digit number, or contains a comma or a dash.

    Returns:
        (title, context_line, body_lines). title and context_line are None
        when nothing suitable was found; body_lines are the non-empty input
        lines with the leading title lines (and the context line, when it
        directly follows them) removed.
    """
    raw_lines = [ln.strip() for ln in text.replace("\r", "\n").split("\n")]
    raw_lines = [ln for ln in raw_lines if ln]

    if not raw_lines:
        return None, None, []

    title: Optional[str] = None
    context_line: Optional[str] = None
    title_parts: List[str] = []

    # Purely numeric lines near the top are ignored for title detection.
    filtered = [ln for ln in raw_lines[:6] if not re.fullmatch(r"[0-9]+", ln)]

    for ln in filtered[:3]:
        if not _looks_like_title(ln):
            break
        title_parts.append(_clean_title(ln))

    if title_parts:
        joined = " ".join(p for p in title_parts if p)
        # Normalise an empty title to None so callers see one "missing" value.
        title = re.sub(r"\s+", " ", joined).strip() or None

    if title:
        for ln in filtered[len(title_parts):len(title_parts) + 2]:
            c = _clean_title(ln)
            if len(c) <= 60 and (
                re.search(r"\b(january|february|march|april|may|june|july|august|september|october|november|december)\b", c, re.I)
                or re.search(r"\b\d{4}\b", c)
                or "," in c
                or "-" in c
            ):
                context_line = c
                break

    body_lines = list(raw_lines)
    remove_count = 0

    if title:
        # Remove exactly the lines that were consumed as title parts. The
        # previous substring test on the first two title words ("the", "a",
        # ...) matched almost any sentence and stripped real body text.
        expected = [part.lower() for part in title_parts if part]
        matched = 0
        while remove_count < len(body_lines):
            candidate = _clean_title(body_lines[remove_count]).lower()
            if not candidate:
                # Lines that clean to nothing (e.g. bare page numbers).
                remove_count += 1
                continue
            if matched < len(expected) and candidate == expected[matched]:
                matched += 1
                remove_count += 1
                continue
            break

        if context_line and remove_count < len(body_lines):
            if _clean_title(body_lines[remove_count]).lower() == context_line.lower():
                remove_count += 1

    return title, context_line, body_lines[remove_count:]
|
| 482 |
+
|
| 483 |
+
def _extract_person_name(text: str) -> Optional[str]:
|
| 484 |
+
if not text:
|
| 485 |
+
return None
|
| 486 |
+
|
| 487 |
+
text = re.sub(r"\barren Buffett\b", "Warren Buffett", text)
|
| 488 |
+
text = re.sub(r"\barren\b", "Warren", text)
|
| 489 |
+
|
| 490 |
+
patterns = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})\b", text)
|
| 491 |
+
blacklist = {
|
| 492 |
+
"The Less", "Long Term", "Capital Management", "Coca Cola",
|
| 493 |
+
"Howard Plain", "Wells Fargo", "Pulitzer Prize"
|
| 494 |
+
}
|
| 495 |
+
|
| 496 |
+
for p in patterns:
|
| 497 |
+
if p not in blacklist:
|
| 498 |
+
return p
|
| 499 |
+
|
| 500 |
+
return None
|
| 501 |
+
|
| 502 |
+
def _looks_like_biography_or_profile(text: str) -> bool:
|
| 503 |
+
lower = text.lower()
|
| 504 |
+
markers = [
|
| 505 |
+
"office", "headquarters", "photographs", "memorabilia",
|
| 506 |
+
"appearance", "chair", "desk", "eyebrow", "glasses",
|
| 507 |
+
"shirt", "suit jacket", "surrounded by", "buffett",
|
| 508 |
+
"berkshire", "omaha"
|
| 509 |
+
]
|
| 510 |
+
return sum(1 for m in markers if m in lower) >= 3
|
| 511 |
+
|
| 512 |
+
def _light_ocr_word_fixes(text: str) -> str:
|
| 513 |
+
"""
|
| 514 |
+
Very lightweight OCR cleanup for summary source only.
|
| 515 |
+
Keep this small and safe.
|
| 516 |
+
"""
|
| 517 |
+
fixes = {
|
| 518 |
+
r"\barren Buffett\b": "Warren Buffett",
|
| 519 |
+
r"\barren\b": "Warren",
|
| 520 |
+
r"\btion\b": "lion",
|
| 521 |
+
r"\ba tion\b": "a lion",
|
| 522 |
+
r"\brabblt\b": "rabbit",
|
| 523 |
+
r"\btortoiso\b": "tortoise",
|
| 524 |
+
r"\btme\b": "time",
|
| 525 |
+
r"\bwnole\b": "whole",
|
| 526 |
+
r"\bwoko\b": "woke",
|
| 527 |
+
r"\bho saw\b": "he saw",
|
| 528 |
+
r"\bseep\b": "sleep",
|
| 529 |
+
r"\bIarge\b": "large",
|
| 530 |
+
r"\bIace\b": "lace",
|
| 531 |
+
r"\bbascball\b": "baseball",
|
| 532 |
+
r"\bfinishlng\b": "finishing",
|
| 533 |
+
r"\bgocd\b": "good",
|
| 534 |
+
r"\bortoise ks\b": "tortoise is",
|
| 535 |
+
r"\blen\b": "left",
|
| 536 |
+
r"\bandd\b": "and",
|
| 537 |
+
r"\bselfassurance\b": "self assurance",
|
| 538 |
+
r"\bthe'mouse's\b": "the mouse's",
|
| 539 |
+
r"\bin\.\s+distress\b": "in distress",
|
| 540 |
+
r"\bThey had\.\s+him\b": "They had him",
|
| 541 |
+
r"\blet him\.\s+go\b": "let him go",
|
| 542 |
+
r"\bThe Lion andd\b": "The Lion and the Mouse",
|
| 543 |
+
r"\bThe Mouse\b": "the mouse",
|
| 544 |
+
r"\bMean while\b": "Meanwhile",
|
| 545 |
+
r"\bA tortoise\b": "a tortoise",
|
| 546 |
+
r"\bRabbit and a tortoise\b": "rabbit and a tortoise",
|
| 547 |
+
}
|
| 548 |
+
|
| 549 |
+
out = text
|
| 550 |
+
for pat, repl in fixes.items():
|
| 551 |
+
out = re.sub(pat, repl, out, flags=re.I)
|
| 552 |
+
|
| 553 |
+
out = re.sub(r"\b([A-Z][a-z]{2,})\.\s+([A-Z][a-z]{2,})\b", r"\1 \2", out)
|
| 554 |
+
out = re.sub(r"\s+", " ", out).strip()
|
| 555 |
+
|
| 556 |
+
return out
|
| 557 |
+
|
| 558 |
+
def repair_text_for_summary(text: str) -> Dict[str, Optional[str]]:
|
| 559 |
+
"""
|
| 560 |
+
Repairs OCR text for summarization without changing OCR extraction behavior.
|
| 561 |
+
"""
|
| 562 |
+
text = basic_cleanup(text)
|
| 563 |
+
title, context_line, body_lines = _extract_title_and_context_lines(text)
|
| 564 |
+
|
| 565 |
+
if not body_lines and text:
|
| 566 |
+
body_lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
|
| 567 |
+
|
| 568 |
+
repaired_lines: List[str] = []
|
| 569 |
+
|
| 570 |
+
i = 0
|
| 571 |
+
while i < len(body_lines):
|
| 572 |
+
line = basic_cleanup(body_lines[i])
|
| 573 |
+
|
| 574 |
+
if line.endswith("-") and i + 1 < len(body_lines):
|
| 575 |
+
nxt = basic_cleanup(body_lines[i + 1])
|
| 576 |
+
line = line[:-1] + nxt
|
| 577 |
+
repaired_lines.append(line)
|
| 578 |
+
i += 2
|
| 579 |
+
continue
|
| 580 |
+
|
| 581 |
+
repaired_lines.append(line)
|
| 582 |
+
i += 1
|
| 583 |
|
| 584 |
merged_parts: List[str] = []
|
| 585 |
buffer = ""
|
| 586 |
|
| 587 |
+
for line in repaired_lines:
|
| 588 |
+
if not line:
|
| 589 |
+
continue
|
| 590 |
+
|
| 591 |
+
line = re.sub(r"\s+", " ", line).strip()
|
| 592 |
+
line = re.sub(r"^[\*\•\-\_]+\s*", "", line)
|
| 593 |
+
|
| 594 |
if not buffer:
|
| 595 |
buffer = line
|
| 596 |
continue
|
| 597 |
|
| 598 |
prev_end = buffer[-1] if buffer else ""
|
| 599 |
+
starts_lower = bool(re.match(r"^[a-z]", line))
|
| 600 |
+
starts_common = bool(re.match(r"^(and|but|or|so|then|when|while|because|if|that|who|which|where)\b", line, re.I))
|
| 601 |
+
starts_short = len(line.split()) <= 4
|
| 602 |
+
|
| 603 |
should_join = True
|
| 604 |
|
| 605 |
if prev_end in ".!?:":
|
| 606 |
should_join = False
|
| 607 |
|
| 608 |
+
if _looks_like_title(line):
|
| 609 |
should_join = False
|
| 610 |
|
| 611 |
+
if starts_lower or starts_common:
|
| 612 |
+
should_join = True
|
| 613 |
+
|
| 614 |
+
if starts_short and prev_end not in ".!?":
|
| 615 |
+
should_join = True
|
| 616 |
+
|
| 617 |
if should_join:
|
| 618 |
buffer = f"{buffer} {line}"
|
| 619 |
else:
|
|
|
|
| 623 |
if buffer:
|
| 624 |
merged_parts.append(buffer.strip())
|
| 625 |
|
| 626 |
+
repaired_text = " ".join(merged_parts)
|
| 627 |
+
|
| 628 |
+
repaired_text = repaired_text.replace("..", ".")
|
| 629 |
+
repaired_text = repaired_text.replace(" .", ".")
|
| 630 |
+
repaired_text = repaired_text.replace(" ,", ",")
|
| 631 |
+
repaired_text = repaired_text.replace(" ;", ";")
|
| 632 |
+
repaired_text = repaired_text.replace(" :", ":")
|
| 633 |
+
repaired_text = repaired_text.replace(" !", "!")
|
| 634 |
+
repaired_text = repaired_text.replace(" ?", "?")
|
| 635 |
+
repaired_text = re.sub(r"([A-Za-z])'at'([A-Za-z])", r"\1 \2", repaired_text)
|
| 636 |
+
repaired_text = re.sub(r"([A-Za-z])'([A-Za-z])", r"\1'\2", repaired_text)
|
| 637 |
+
repaired_text = re.sub(r"\s+", " ", repaired_text).strip()
|
| 638 |
+
repaired_text = re.sub(r"[.]{2,}", ".", repaired_text)
|
| 639 |
+
repaired_text = re.sub(r"[?]{2,}", "?", repaired_text)
|
| 640 |
+
repaired_text = re.sub(r"[!]{2,}", "!", repaired_text)
|
| 641 |
+
|
| 642 |
+
repaired_text = _light_ocr_word_fixes(repaired_text)
|
| 643 |
+
moral = _extract_moral(text)
|
| 644 |
+
|
| 645 |
+
repaired_no_moral = repaired_text
|
| 646 |
+
repaired_no_moral = re.sub(r"moral\s+of\s+the\s+story\.?\s*.*$", "", repaired_no_moral, flags=re.I).strip()
|
| 647 |
+
repaired_no_moral = re.sub(r"\bmoral\s*[:\-]\s*.*$", "", repaired_no_moral, flags=re.I).strip()
|
| 648 |
+
|
| 649 |
+
lead_sentence = None
|
| 650 |
+
for sent in _sentence_split(repaired_no_moral or repaired_text):
|
| 651 |
+
if len(sent) >= 25 and len(re.findall(r"[A-Za-z]", sent)) >= 12:
|
| 652 |
+
lead_sentence = sent
|
| 653 |
+
break
|
| 654 |
|
| 655 |
+
return {
|
| 656 |
+
"title": title,
|
| 657 |
+
"context_line": context_line,
|
| 658 |
+
"moral": moral,
|
| 659 |
+
"repaired_text": repaired_no_moral or repaired_text,
|
| 660 |
+
"lead_sentence": lead_sentence,
|
| 661 |
+
}
|
| 662 |
|
| 663 |
+
def sanitize_summary_text(text: str) -> str:
    """
    Keep summary braille-friendly:
    letters numbers spaces only

    Accented letters are folded to their ASCII base letter ("é" -> "e")
    rather than dropped. Every other character outside A-Z, a-z, 0-9 becomes
    a space, and whitespace runs are collapsed to a single space.
    """
    import unicodedata

    if not text:
        return ""

    # Decompose so accents become separate combining marks, then drop the
    # marks; otherwise "Café" would lose its last letter entirely.
    decomposed = unicodedata.normalize("NFKD", text)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Typographic quotes need no special handling: like all punctuation they
    # are replaced by a space here.
    folded = re.sub(r"[^A-Za-z0-9\s]", " ", folded)
    return re.sub(r"\s+", " ", folded).strip()
|
| 674 |
|
| 675 |
+
def _is_bad_model_summary(summary: str, repaired_text: str) -> Optional[str]:
|
| 676 |
+
if not summary:
|
| 677 |
+
return "empty"
|
| 678 |
+
if len(summary) < 18:
|
| 679 |
+
return "too_short"
|
|
|
|
| 680 |
|
| 681 |
+
alpha_count = len(re.findall(r"[A-Za-z]", summary))
|
| 682 |
+
word_count = len(re.findall(r"[A-Za-z']+", summary))
|
| 683 |
+
if alpha_count < 10 or word_count < 4:
|
| 684 |
+
return "fragment"
|
|
|
|
|
|
|
| 685 |
|
| 686 |
+
if re.search(r"\b(the|and|or|of|to|in|a)$", summary.strip(), re.I):
|
| 687 |
+
return "incomplete"
|
| 688 |
+
|
| 689 |
+
if summary.lower() in repaired_text.lower() and len(summary) < 40:
|
| 690 |
+
return "raw_fragment"
|
| 691 |
|
| 692 |
+
bad_tokens = ["tion was", "rabblt", "tortoiso", "wnole", "woko", "len the", "andd"]
|
| 693 |
+
if any(tok in summary.lower() for tok in bad_tokens):
|
| 694 |
+
return "ocr_noise"
|
| 695 |
+
|
| 696 |
+
return None
|
| 697 |
|
| 698 |
+
def _story_fallback(title: Optional[str], moral: Optional[str], repaired_text: str, max_chars: int) -> str:
    """
    Build a rule-based summary for story-like text without using the model.

    Two well-known fables are recognised by keyword pairs and get a fixed
    plot line plus the extracted moral (or a default one). Any other text is
    summarised by its first two sentences, with the moral appended when it is
    not already part of them. The title, when present, is put in front and
    the result is cut to max_chars.
    """
    haystack = repaired_text.lower()

    # (keywords that must all occur, fixed plot line, default moral)
    known_fables = (
        (
            ("lion", "mouse"),
            "A lion spared a mouse and later the mouse freed the lion from hunters",
            "kindness can be repaid",
        ),
        (
            ("rabbit", "tortoise"),
            "A fast rabbit lost a race to a steady tortoise after stopping to rest",
            "slow and steady wins the race",
        ),
    )

    body = None
    for keywords, plot, default_moral in known_fables:
        if all(word in haystack for word in keywords):
            body = f"{plot} Moral {moral if moral else default_moral}"
            break

    if body is None:
        body = " ".join(_sentence_split(repaired_text)[:2]).strip()
        if moral and moral.lower() not in body.lower():
            body = f"{body} Moral {moral}"

    pieces = [title, body] if title else [body]
    return _truncate_text(" ".join(pieces), max_chars)
|
| 728 |
+
|
| 729 |
+
def _structured_summary_fallback(title: Optional[str], context_line: Optional[str], repaired_text: str, moral: Optional[str], max_chars: int) -> str:
    """
    Better fallback for noisy non-story text.

    When the text looks like a profile and a person name is found, returns a
    fixed description of that person, prefixed by title and context line.
    Otherwise joins title, context line, the first substantial sentence
    (at least 30 characters and 15 letters) and the moral, skipping whatever
    is missing. The result is cut to max_chars.
    """
    header_bits = [piece for piece in (title, context_line) if piece]

    subject = _extract_person_name(repaired_text)
    profile_like = _looks_like_biography_or_profile(repaired_text)

    if profile_like and subject:
        sentence = f"{subject} is described through his appearance manner and surroundings"
        text_lc = repaired_text.lower()
        if any(word in text_lc for word in ("office", "desk")):
            sentence += " in his office"

        heading = " ".join(header_bits).strip()
        combined = f"{heading} {sentence}".strip() if heading else sentence
        return _truncate_text(combined, max_chars)

    # First sentence long enough to carry real content, if any.
    lead = next(
        (
            sent
            for sent in _sentence_split(repaired_text)
            if len(sent) >= 30 and len(re.findall(r"[A-Za-z]", sent)) >= 15
        ),
        None,
    )

    pieces = list(header_bits)
    if lead:
        pieces.append(lead)
    if moral:
        pieces.append(f"Moral {moral}")

    return _truncate_text(" ".join(pieces).strip(), max_chars)
|
| 768 |
+
|
| 769 |
+
# =========================================================
|
| 770 |
+
# SUMMARIZER MODEL
|
| 771 |
+
# =========================================================
|
| 772 |
def get_hf_summarizer():
|
| 773 |
global _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
|
| 774 |
|
|
|
|
| 798 |
print(f"[summary] model loaded: {SUMMARY_MODEL} on {device}")
|
| 799 |
return _SUMMARY_TOKENIZER, _SUMMARY_MODEL_OBJ, _TORCH
|
| 800 |
|
| 801 |
+
def summarize_text(text: str) -> Tuple[str, str, str]:
|
| 802 |
+
"""
|
| 803 |
+
Returns:
|
| 804 |
+
summary, summary_method, repaired_text_used_for_summary
|
| 805 |
+
"""
|
| 806 |
if not SUMMARY_ENABLED:
|
| 807 |
+
return "", "disabled", ""
|
| 808 |
|
| 809 |
+
if not text.strip():
|
| 810 |
+
return "", "empty", ""
|
| 811 |
+
|
| 812 |
+
repaired = repair_text_for_summary(text)
|
| 813 |
+
title = repaired["title"]
|
| 814 |
+
context_line = repaired["context_line"]
|
| 815 |
+
moral = repaired["moral"]
|
| 816 |
+
repaired_text = repaired["repaired_text"] or ""
|
| 817 |
|
| 818 |
+
if not repaired_text:
|
| 819 |
+
return "", "empty_repaired", ""
|
| 820 |
|
| 821 |
+
# stronger post-repair cleanup
|
| 822 |
+
repaired_text = _light_ocr_word_fixes(repaired_text)
|
| 823 |
+
repaired_text = basic_cleanup(repaired_text)
|
| 824 |
|
| 825 |
+
# infer title if missing and story entities obvious
|
| 826 |
+
lower_full = repaired_text.lower()
|
| 827 |
+
if not title:
|
| 828 |
+
if "lion" in lower_full and "mouse" in lower_full:
|
| 829 |
+
title = "The Lion and the Mouse"
|
| 830 |
+
elif "rabbit" in lower_full and "tortoise" in lower_full:
|
| 831 |
+
title = "The Rabbit and the Tortoise"
|
| 832 |
|
| 833 |
+
source = _truncate_text(repaired_text, SUMMARY_MAX_INPUT_CHARS)
|
| 834 |
lower = source.lower()
|
| 835 |
+
|
| 836 |
looks_like_story = any(x in lower for x in [
|
| 837 |
+
"once upon", "rabbit", "tortoise", "lion", "mouse",
|
| 838 |
+
"fox", "crow", "race", "hunters", "jungle", "forest"
|
| 839 |
])
|
| 840 |
|
| 841 |
+
looks_like_profile = _looks_like_biography_or_profile(source)
|
| 842 |
+
|
| 843 |
+
# 1) biography/profile -> structured fallback first
|
| 844 |
+
if looks_like_profile:
|
| 845 |
+
summary = _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS)
|
| 846 |
+
return summary, "structured_profile_fallback", repaired_text
|
| 847 |
+
|
| 848 |
+
# 2) story/fable -> ALWAYS use structured story fallback
|
| 849 |
if looks_like_story:
|
| 850 |
+
summary = _story_fallback(title, moral, repaired_text, SUMMARY_MAX_CHARS)
|
| 851 |
+
return summary, "structured_story_fallback", repaired_text
|
| 852 |
+
|
| 853 |
+
# 3) short non-story -> fallback
|
| 854 |
+
if len(source) < SUMMARY_MIN_TEXT_LEN:
|
| 855 |
+
return _structured_summary_fallback(title, context_line, source, moral, SUMMARY_MAX_CHARS), "fallback_short", repaired_text
|
| 856 |
+
|
| 857 |
+
# 4) model only for non-story text
|
| 858 |
+
prompt = f"summarize: {source}"
|
| 859 |
|
| 860 |
try:
|
| 861 |
tokenizer, model, torch = get_hf_summarizer()
|
|
|
|
| 873 |
with torch.no_grad():
|
| 874 |
output_ids = model.generate(
|
| 875 |
**inputs,
|
| 876 |
+
max_new_tokens=max(40, SUMMARY_MAX_NEW_TOKENS),
|
| 877 |
num_beams=max(2, SUMMARY_NUM_BEAMS),
|
| 878 |
do_sample=False,
|
| 879 |
early_stopping=True,
|
|
|
|
| 884 |
|
| 885 |
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 886 |
summary = basic_cleanup(summary)
|
| 887 |
+
summary = _light_ocr_word_fixes(summary)
|
| 888 |
summary = re.sub(r"^(summary|summarize|main idea|moral)\s*:\s*", "", summary, flags=re.I).strip()
|
| 889 |
|
| 890 |
+
# reject prompt echo junk
|
| 891 |
+
prompt_echo_markers = [
|
| 892 |
+
"the story in one short clear sentence",
|
| 893 |
+
"if there is a lesson include it briefly",
|
| 894 |
+
"summarize this story",
|
| 895 |
+
"summarize this text",
|
| 896 |
+
]
|
| 897 |
+
if any(m in summary.lower() for m in prompt_echo_markers):
|
| 898 |
+
return _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS), "fallback_prompt_echo", repaired_text
|
| 899 |
+
|
| 900 |
+
bad_reason = _is_bad_model_summary(summary, repaired_text)
|
| 901 |
+
if bad_reason:
|
| 902 |
+
return _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS), f"fallback_generic_{bad_reason}", repaired_text
|
| 903 |
+
|
| 904 |
+
enriched = summary
|
| 905 |
+
|
| 906 |
+
if title and title.lower() not in enriched.lower():
|
| 907 |
+
if context_line and context_line.lower() not in enriched.lower():
|
| 908 |
+
enriched = f"{title} {context_line} {enriched}"
|
| 909 |
+
else:
|
| 910 |
+
enriched = f"{title} {enriched}"
|
| 911 |
+
elif context_line and context_line.lower() not in enriched.lower():
|
| 912 |
+
enriched = f"{context_line} {enriched}"
|
| 913 |
+
|
| 914 |
+
enriched = _truncate_text(enriched, SUMMARY_MAX_CHARS)
|
| 915 |
+
return enriched, "t5_small_summarization_repaired", repaired_text
|
| 916 |
|
| 917 |
except Exception as e:
|
| 918 |
print(f"[summary] failed: {type(e).__name__}: {e}")
|
| 919 |
+
return _structured_summary_fallback(title, context_line, repaired_text, moral, SUMMARY_MAX_CHARS), f"fallback_generic:{type(e).__name__}", repaired_text
|
| 920 |
|
| 921 |
# =========================================================
|
| 922 |
# STARTUP
|
|
|
|
| 1033 |
if len(final_text) > MAX_TEXT_LEN:
|
| 1034 |
final_text = final_text[:MAX_TEXT_LEN]
|
| 1035 |
|
| 1036 |
+
summary, summary_method, summary_source_text = summarize_text(final_text)
|
| 1037 |
+
|
| 1038 |
+
# Replace final_text with repaired readable text
|
| 1039 |
+
if summary_source_text:
|
| 1040 |
+
final_text = summary_source_text
|
| 1041 |
+
|
| 1042 |
+
# Sanitize summary only
|
| 1043 |
+
summary = sanitize_summary_text(summary)
|
| 1044 |
|
| 1045 |
payload = {
|
| 1046 |
"uuid": uuid,
|