Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import re | |
| import unicodedata as ud | |
| from typing import List | |
| from . import config as CFG | |
# ASCII control characters other than tab (\x09), newline (\x0A) and
# carriage return (\x0D), plus DEL (\x7F).
CTRL_RE = re.compile(
    r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]"
)

# Two or more consecutive spaces, tabs, form feeds or carriage returns.
# Newlines are deliberately not part of the class.
MULTI_WS_RE = re.compile(
    r"[ \t\f\r]{2,}"
)

# One character followed by at least four copies of itself, i.e. a run of
# five or more. "." does not match a newline, so newline runs never match.
REPEAT_CHAR_RE = re.compile(
    r"(.)\1{4,}"
)

# Patterns whose matches are swapped for placeholders; the pattern sources
# are read from the project configuration.
URL_RE = re.compile(
    CFG.PROTECT_REGEX["url"]
)
USER_RE = re.compile(
    CFG.PROTECT_REGEX["user"]
)
def nfc(text: str) -> str:
    """Return *text* converted to Unicode Normalization Form C."""
    composed = ud.normalize("NFC", text)
    return composed
def strip_control(text: str) -> str:
    """Return *text* with every match of ``CTRL_RE`` deleted."""
    without_controls = CTRL_RE.sub("", text)
    return without_controls
def cap_repeats(text: str, max_repeat: int = CFG.MAX_REPEAT_CHARS) -> str:
    """Shorten every run of one repeated character to ``max_repeat`` copies.

    A run is a character immediately followed by copies of itself. Runs that
    are already ``max_repeat`` long or shorter are left untouched. Newlines
    are never treated as part of a run.

    Args:
        text: The string to process.
        max_repeat: Largest run length allowed in the result. Negative
            values are treated as 0, which removes every character except
            newlines.

    Returns:
        ``text`` with each over-long run cut down to ``max_repeat`` copies.
    """
    limit = max(max_repeat, 0)
    # The threshold has to follow ``max_repeat``: a fixed "5 or more" pattern
    # would miss runs of 3-4 when the cap is 2 and would *lengthen* a run of
    # 5 when the cap is 6. "(.)\1{limit,}" matches one character plus at
    # least ``limit`` copies, i.e. a run strictly longer than the cap.
    # The re module caches compiled patterns, so repeated calls with the
    # same cap do not recompile.
    run_re = re.compile(r"(.)\1{%d,}" % limit)
    return run_re.sub(lambda m: m.group(1) * limit, text)
def replace_placeholders(text: str) -> str:
    """Swap ``URL_RE`` and ``USER_RE`` matches for their configured placeholders.

    URL matches are replaced first, then user matches, each with the
    corresponding entry of ``CFG.PLACEHOLDERS``.
    """
    for pattern, placeholder_key in ((URL_RE, "URL"), (USER_RE, "USER")):
        text = pattern.sub(CFG.PLACEHOLDERS[placeholder_key], text)
    return text
def collapse_whitespace(text: str) -> str:
    """Squeeze whitespace runs inside each line and drop empty lines.

    The text is split on newlines. In every line each ``MULTI_WS_RE`` match
    becomes a single space and surrounding whitespace is stripped. Lines
    that end up empty are discarded; the rest are joined back with newlines.
    """
    kept_lines = []
    for raw_line in text.split("\n"):
        squeezed = MULTI_WS_RE.sub(" ", raw_line).strip()
        if squeezed:
            kept_lines.append(squeezed)
    return "\n".join(kept_lines)
def clean_text(text: str, lowercase: bool | None = None) -> str:
    """Run the full normalisation pipeline on *text*.

    Steps, in order: NFC normalisation, control-character removal,
    placeholder substitution, repeated-character capping, optional
    lowercasing, whitespace collapsing, and a final strip.

    Args:
        text: Input to clean. Anything that is not a ``str`` yields ``""``.
        lowercase: Whether to lowercase the text; ``None`` falls back to
            ``CFG.LOWERCASE``.

    Returns:
        The cleaned string.
    """
    do_lower = CFG.LOWERCASE if lowercase is None else lowercase
    if not isinstance(text, str):
        return ""

    cleaned = replace_placeholders(strip_control(nfc(text)))
    cleaned = cap_repeats(cleaned, CFG.MAX_REPEAT_CHARS)
    if do_lower:
        # NOTE(review): this runs after placeholder substitution, so the
        # inserted placeholder text is lowercased as well - confirm intended.
        cleaned = cleaned.lower()
    return collapse_whitespace(cleaned).strip()
def filter_short(lines: List[str], min_len: int = 1) -> List[str]:
    """Return the entries of *lines* whose length is at least *min_len*.

    Order is preserved and a new list is returned.
    """
    def long_enough(entry: str) -> bool:
        return len(entry) >= min_len

    return list(filter(long_enough, lines))