File size: 2,813 Bytes
34b531b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from __future__ import annotations

import hashlib
import html
import re
import uuid

from app.processing.constants import BOILERPLATE_LINES, TOKEN_PATTERN


def stable_id(*parts: str) -> str:
    """Derive a repeatable UUID string from an ordered sequence of text parts.

    The parts are joined with ``|``, hashed with SHA-1, and the resulting hex
    digest is used as the *name* of a version-5 UUID in the URL namespace.
    The same parts in the same order always produce the same identifier.

    Note that the join is not escaped: ``stable_id("a", "b")`` and
    ``stable_id("a|b")`` yield the same value.

    Args:
        *parts: Text fragments that together identify the entity.

    Returns:
        The canonical 36-character string form of the derived UUID.
    """
    payload = "|".join(parts).encode("utf-8")
    fingerprint = hashlib.sha1(payload).hexdigest()
    derived = uuid.uuid5(uuid.NAMESPACE_URL, fingerprint)
    return str(derived)


def tokenize(text: str) -> list[str]:
    """Split ``text`` into tokens using the shared ``TOKEN_PATTERN``.

    Args:
        text: The string to scan.

    Returns:
        Every non-overlapping match of ``TOKEN_PATTERN`` in ``text``, in
        order of appearance.
    """
    # NOTE(review): assumes TOKEN_PATTERN is a compiled regex with at most one
    # capture group, so findall yields plain strings - confirm in constants.
    matches = TOKEN_PATTERN.findall(text)
    return matches


def token_count(text: str) -> int:
    """Count how many tokens ``tokenize`` finds in ``text``.

    Args:
        text: The string to measure.

    Returns:
        The number of tokens produced by ``tokenize(text)``.
    """
    tokens = tokenize(text)
    return len(tokens)


def detokenize(tokens: list[str]) -> str:
    """Rebuild readable text from a token list.

    Tokens are joined with single spaces, then spacing around punctuation is
    repaired: whitespace before closing punctuation (``, . ; : ! ? % ) ] }``)
    and whitespace after opening brackets (``( [ {``) is removed.

    Args:
        tokens: Tokens in reading order.

    Returns:
        The joined text with surrounding whitespace stripped; an empty string
        for an empty token list.
    """
    # (pattern, replacement) pairs, applied in order.
    spacing_fixes = (
        (r"\s+([,.;:!?%)\]\}])", r"\1"),  # no gap before closing punctuation
        (r"([\(\[\{])\s+", r"\1"),  # no gap after an opening bracket
    )
    joined = " ".join(tokens)
    for pattern, replacement in spacing_fixes:
        joined = re.sub(pattern, replacement, joined)
    return joined.strip()


def normalize_text(text: str) -> str:
    """Decode HTML entities and normalise whitespace in ``text``.

    Steps, in order:

    1. HTML entities are unescaped (``&amp;`` -> ``&``).
    2. ``\\r\\n`` and bare ``\\r`` become ``\\n``.
    3. Runs of spaces, tabs and non-breaking spaces collapse to one space.
       NBSP is included because unescaping ``&nbsp;`` yields U+00A0, which
       would otherwise survive as a hidden, uncollapsed character.
    4. A space directly before or after a newline is removed, so lines that
       contain only whitespace become truly empty.
    5. Three or more consecutive newlines collapse to exactly two. This only
       works reliably because of step 4: otherwise ``"\\n \\n \\n"`` would
       slip through as several "blank" lines.
    6. Leading and trailing whitespace is stripped.

    Args:
        text: Raw text, possibly containing HTML entities and mixed line
            endings.

    Returns:
        The cleaned text; an empty string if nothing but whitespace remains.
    """
    text = html.unescape(text)
    text = re.sub(r"\r\n?", "\n", text)
    text = re.sub(r"[ \t\xa0]+", " ", text)
    # After the collapse above at most one space can touch a newline.
    text = re.sub(r" ?\n ?", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def rows_to_table_text(rows: list[list[str]]) -> str:
    """Render table rows as pipe-separated text, one row per line.

    Each cell is passed through ``normalize_text``; cells that normalise to an
    empty string are dropped, and rows left with no cells are skipped.

    Args:
        rows: Table rows, each a list of cell strings.

    Returns:
        The surviving rows joined by newlines, with the cells of a row joined
        by ``" | "``. Returns an empty string when no row has content.
    """
    lines = []
    for row in rows:
        # Normalise each cell exactly once, then filter on the result, rather
        # than normalising once for the test and again for the value.
        normalized_cells = (normalize_text(cell) for cell in row)
        cleaned = [cell for cell in normalized_cells if cell]
        if cleaned:
            lines.append(" | ".join(cleaned))
    return "\n".join(lines)


def looks_like_heading(line: str) -> bool:
    """Heuristically decide whether ``line`` is a section heading.

    Rules, checked in order:

    * A line starting with ``#`` is always a heading.
    * A line longer than 90 characters or with more than 14 tokens is not.
    * A line that begins with digits followed by ``)``, ``.``, whitespace or
      ``-`` is a heading.
    * Otherwise the line is a heading when it contains at least three
      characters from the letter ranges ``A-Z``, ``a-z``, ``À-ỹ`` and
      upper-casing those characters leaves them unchanged.

    Args:
        line: A single line of text; it is not stripped here.

    Returns:
        ``True`` if the line matches one of the heading rules.
    """
    if line.startswith("#"):
        return True

    # Long lines are treated as body text; tokenising is skipped when the
    # character limit already rules the line out.
    too_long = len(line) > 90 or len(tokenize(line)) > 14
    if too_long:
        return False

    if re.match(r"^\d+[\).\s-]+", line) is not None:
        return True

    letters = re.sub(r"[^A-Za-zÀ-ỹ]", "", line)
    if len(letters) < 3:
        return False
    return letters == letters.upper()


def looks_like_table(line: str) -> bool:
    """Heuristically decide whether ``line`` is a row of tabular data.

    A line qualifies when it contains a tab character, at least two ``|``
    characters, or at least four commas.

    Args:
        line: A single line of text.

    Returns:
        ``True`` if any of the delimiter thresholds is met.
    """
    if "\t" in line:
        return True
    if line.count("|") >= 2:
        return True
    return line.count(",") >= 4


def looks_like_widget(line: str) -> bool:
    """Heuristically decide whether ``line`` looks like widget/stat content.

    A line qualifies when either:

    * it contains a colon (ASCII ``:`` or fullwidth U+FF1A) followed, after
      optional whitespace, by at least one non-space character - a
      ``key: value`` shape; or
    * it contains three or more numbers (optionally with a ``.``/``,``
      decimal part and a trailing ``%``).

    Args:
        line: A single line of text.

    Returns:
        ``True`` if the line is key/value shaped or numerically dense.
    """
    # NOTE(review): the class previously listed ':' twice, which looks like a
    # fullwidth colon lost to character normalisation. It is spelled as an
    # explicit escape here so it cannot be folded again - confirm intent.
    key_value = re.search(r"[:\uFF1A]\s*\S+", line) is not None
    numeric_dense = len(re.findall(r"\d+(?:[.,]\d+)?%?", line)) >= 3
    return key_value or numeric_dense


def is_noise_line(line: str) -> bool:
    """Decide whether ``line`` is boilerplate or script residue to discard.

    The line is stripped and lower-cased, then treated as noise when it:

    * is empty;
    * is a member of ``BOILERPLATE_LINES``;
    * starts with a script-like prefix (``window[``, ``function ``, ``var ``,
      ``const ``, ``let ``);
    * starts with a markup/brace prefix (``{``, ``};``, ``])``, ``</``,
      ``<script``) and the original, unstripped line is longer than 40
      characters;
    * contains a tag-manager marker or one of the fixed Vietnamese site
      footer/promo phrases listed below.

    Args:
        line: A single line of text.

    Returns:
        ``True`` if the line should be dropped, ``False`` otherwise.
    """
    probe = line.strip().lower()

    # Blank lines and exact boilerplate matches are always noise.
    if not probe or probe in BOILERPLATE_LINES:
        return True

    script_prefixes = ("window[", "function ", "var ", "const ", "let ")
    if probe.startswith(script_prefixes):
        return True

    # The length gate is measured on the raw line, not the stripped one.
    markup_prefixes = ("{", "};", "])", "</", "<script")
    if probe.startswith(markup_prefixes) and len(line) > 40:
        return True

    noisy_fragments = (
        "googletagmanager.com",
        "_gtm_",
        "quét mã qr",
        "cài đặt tiện ích",
        "số giấy phép mạng xã hội",
        "chịu trách nhiệm nội dung",
    )
    return any(fragment in probe for fragment in noisy_fragments)


def clean_document_text(text: str) -> str:
    """Strip noise lines from a document and normalise what remains.

    Each line is judged by ``is_noise_line`` on its normalised form, but the
    original line is what gets kept. Empty lines count as noise, so they are
    removed as well. The surviving lines are re-joined with newlines and the
    result is passed through ``normalize_text`` once more.

    Args:
        text: The raw document text.

    Returns:
        The cleaned, normalised document text.
    """
    kept = []
    for raw_line in text.splitlines():
        if is_noise_line(normalize_text(raw_line)):
            continue
        kept.append(raw_line)
    return normalize_text("\n".join(kept))