File size: 2,813 Bytes
34b531b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | from __future__ import annotations
import hashlib
import html
import re
import uuid
from app.processing.constants import BOILERPLATE_LINES, TOKEN_PATTERN
def stable_id(*parts: str) -> str:
    """Return a deterministic UUID string derived from ``parts``.

    The parts are joined with ``|``, the UTF-8 bytes of the joined string are
    hashed with SHA-1, and the resulting hex digest is used as the name of a
    version-5 UUID in the URL namespace. The same arguments always produce
    the same identifier.

    Args:
        *parts: String fragments that together identify an item.

    Returns:
        The canonical string form of the derived UUID.
    """
    payload = "|".join(parts).encode("utf-8")
    name = hashlib.sha1(payload).hexdigest()
    derived = uuid.uuid5(uuid.NAMESPACE_URL, name)
    return str(derived)
def tokenize(text: str) -> list[str]:
    """Split ``text`` into tokens using ``TOKEN_PATTERN``.

    ``TOKEN_PATTERN`` is imported from ``app.processing.constants``; what
    counts as a token is decided entirely by that pattern.

    Args:
        text: The string to tokenize.

    Returns:
        Every non-overlapping match of ``TOKEN_PATTERN`` in ``text``, in
        order of appearance.
    """
    matches = TOKEN_PATTERN.findall(text)
    return matches
def token_count(text: str) -> int:
    """Return how many tokens ``tokenize`` finds in ``text``.

    Args:
        text: The string to measure.

    Returns:
        The length of the token list produced by ``tokenize(text)``.
    """
    tokens = tokenize(text)
    return len(tokens)
def detokenize(tokens: list[str]) -> str:
    """Join ``tokens`` back into text with tidy punctuation spacing.

    Tokens are joined with single spaces, then two clean-up passes run in
    order: whitespace before closing punctuation or brackets is removed, and
    whitespace after an opening bracket is removed. The result is stripped.

    Args:
        tokens: Token strings in reading order.

    Returns:
        The reassembled text; an empty string for an empty token list.
    """
    # (pattern, replacement) pairs, applied in this order.
    spacing_rules = (
        # Drop whitespace in front of , . ; : ! ? % ) ] }
        (r"\s+([,.;:!?%)\]\}])", r"\1"),
        # Drop whitespace right after ( [ {
        (r"([\(\[\{])\s+", r"\1"),
    )
    result = " ".join(tokens)
    for pattern, replacement in spacing_rules:
        result = re.sub(pattern, replacement, result)
    return result.strip()
def normalize_text(text: str) -> str:
    """Unescape HTML entities and normalize whitespace in ``text``.

    Steps, in order:

    1. HTML character references are converted with ``html.unescape``.
    2. ``\\r\\n`` and lone ``\\r`` line endings become ``\\n``.
    3. Runs of spaces and tabs collapse to a single space.
    4. The space left in front of a newline is removed, so trailing
       whitespace disappears and whitespace-only lines become truly empty.
    5. Three or more consecutive newlines collapse to two.
    6. Leading and trailing whitespace is stripped.

    Args:
        text: Raw text, possibly containing HTML entities and mixed
            line endings.

    Returns:
        The normalized text.
    """
    text = html.unescape(text)
    text = re.sub(r"\r\n?", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    # After the collapse above at most one space can precede a newline.
    # Removing it lets lines that held only spaces/tabs count as blank;
    # otherwise "\n \n \n" would slip past the blank-line collapse below.
    text = text.replace(" \n", "\n")
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def rows_to_table_text(rows: list[list[str]]) -> str:
    """Render table rows as pipe-separated lines of text.

    Every cell is passed through ``normalize_text``. Cells that are empty
    after normalization are dropped, and rows left with no cells are skipped.

    Args:
        rows: Table rows, each a list of cell strings.

    Returns:
        One line per non-empty row with cells joined by ``" | "``, lines
        joined by newlines. Empty string when no row has any content.
    """
    lines = []
    for row in rows:
        # Normalize each cell exactly once, then filter on the result
        # (previously every cell was normalized twice).
        normalized_cells = (normalize_text(cell) for cell in row)
        cleaned = [cell for cell in normalized_cells if cell]
        if cleaned:
            lines.append(" | ".join(cleaned))
    return "\n".join(lines)
def looks_like_heading(line: str) -> bool:
    """Heuristically decide whether ``line`` reads like a section heading.

    Checks run in this order:

    * A line starting with ``#`` is a heading.
    * A line longer than 90 characters, or with more than 14 tokens
      (per ``tokenize``), is not.
    * A line starting with digits followed by ``)``, ``.``, whitespace or
      ``-`` is a heading.
    * Otherwise the line is a heading when it has at least three letters
      and upper-casing those letters leaves them unchanged.

    Args:
        line: A single line of text.

    Returns:
        ``True`` if the line matches one of the heading heuristics.
    """
    if line.startswith("#"):
        return True

    # Length guards: long lines are treated as body text.
    if len(line) > 90:
        return False
    if len(tokenize(line)) > 14:
        return False

    # Numbered items such as "1." / "2)" / "3 -".
    if re.match(r"^\d+[\).\s-]+", line) is not None:
        return True

    # Keep only ASCII letters and characters in the range À–ỹ
    # (presumably to cover accented Latin / Vietnamese letters).
    letters = re.sub(r"[^A-Za-zÀ-ỹ]", "", line)
    if len(letters) < 3:
        return False
    return letters == letters.upper()
def looks_like_table(line: str) -> bool:
    """Heuristically decide whether ``line`` looks like a table row.

    A line qualifies when it contains a tab, at least two ``|`` characters,
    or at least four commas.

    Args:
        line: A single line of text.

    Returns:
        ``True`` if any of the delimiter thresholds is met.
    """
    if "\t" in line:
        return True
    if line.count("|") >= 2:
        return True
    return line.count(",") >= 4
def looks_like_widget(line: str) -> bool:
    """Heuristically decide whether ``line`` looks like widget/metric text.

    A line qualifies when either:

    * it contains a colon followed (after optional whitespace) by at least
      one non-whitespace character, i.e. a ``key: value`` shape; or
    * it contains three or more numbers, where a number is a run of digits
      with an optional ``.``/``,`` fractional part and an optional ``%``.

    Args:
        line: A single line of text.

    Returns:
        ``True`` if either heuristic matches.
    """
    # NOTE(review): the character class lists ":" twice; possibly one of
    # them was meant to be a different colon variant — confirm upstream.
    if re.search(r"[::]\s*\S+", line) is not None:
        return True
    numbers = re.findall(r"\d+(?:[.,]\d+)?%?", line)
    return len(numbers) >= 3
def is_noise_line(line: str) -> bool:
    """Decide whether ``line`` is boilerplate or markup residue to discard.

    The line is stripped and lower-cased, then treated as noise when any of
    the following holds:

    * it is empty;
    * it is a member of ``BOILERPLATE_LINES``;
    * it starts with a JavaScript-looking prefix (``window[``,
      ``function ``, ``var ``, ``const ``, ``let ``);
    * it mentions Google Tag Manager (``googletagmanager.com`` / ``_gtm_``);
    * it starts with a markup/script fragment (``{``, ``};``, ``])``,
      ``</``, ``<script``) and the original line is longer than 40
      characters;
    * it contains one of several Vietnamese site-chrome phrases (QR-code
      scan and extension-install prompts, social-network licence and
      content-responsibility footers).

    Args:
        line: A single line of text.

    Returns:
        ``True`` if the line should be dropped, ``False`` otherwise.
    """
    script_prefixes = ("window[", "function ", "var ", "const ", "let ")
    markup_prefixes = ("{", "};", "])", "</", "<script")
    noise_fragments = (
        "googletagmanager.com",
        "_gtm_",
        "quét mã qr",
        "cài đặt tiện ích",
        "số giấy phép mạng xã hội",
        "chịu trách nhiệm nội dung",
    )

    candidate = line.strip().lower()
    if not candidate:
        return True
    if candidate in BOILERPLATE_LINES:
        return True
    if candidate.startswith(script_prefixes):
        return True
    # The length test uses the raw line, not the stripped one.
    if len(line) > 40 and candidate.startswith(markup_prefixes):
        return True
    return any(fragment in candidate for fragment in noise_fragments)
def clean_document_text(text: str) -> str:
    """Remove noise lines from ``text`` and normalize what remains.

    Each line from ``text.splitlines()`` is normalized with
    ``normalize_text`` only to test it with ``is_noise_line``. Lines that
    pass are kept in their original form, rejoined with newlines, and the
    combined text is normalized once at the end.

    Args:
        text: The full document text.

    Returns:
        The normalized text with noise lines removed.
    """
    kept = []
    for raw_line in text.splitlines():
        if is_noise_line(normalize_text(raw_line)):
            continue
        kept.append(raw_line)
    return normalize_text("\n".join(kept))
|