from __future__ import annotations import re import unicodedata as ud from typing import List from . import config as CFG CTRL_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]") MULTI_WS_RE = re.compile(r"[ \t\f\r]{2,}") REPEAT_CHAR_RE = re.compile(r"(.)\1{4,}") # 5+ -> cap to 4 total URL_RE = re.compile(CFG.PROTECT_REGEX["url"]) USER_RE = re.compile(CFG.PROTECT_REGEX["user"]) def nfc(text: str) -> str: return ud.normalize("NFC", text) def strip_control(text: str) -> str: return CTRL_RE.sub("", text) def cap_repeats(text: str, max_repeat: int = CFG.MAX_REPEAT_CHARS) -> str: # Replace runs longer than max_repeat with exactly max_repeat def repl(m): ch = m.group(1) return ch * max_repeat return REPEAT_CHAR_RE.sub(repl, text) def replace_placeholders(text: str) -> str: text = URL_RE.sub(CFG.PLACEHOLDERS["URL"], text) text = USER_RE.sub(CFG.PLACEHOLDERS["USER"], text) return text def collapse_whitespace(text: str) -> str: # Collapse spaces/tabs/etc but preserve newlines parts = text.split("\n") parts = [MULTI_WS_RE.sub(" ", p).strip() for p in parts] return "\n".join([p for p in parts if p != ""]) # drop blank lines from internal collapse def clean_text(text: str, lowercase: bool | None = None) -> str: if lowercase is None: lowercase = CFG.LOWERCASE if not isinstance(text, str): return "" s = text s = nfc(s) s = strip_control(s) s = replace_placeholders(s) s = cap_repeats(s, CFG.MAX_REPEAT_CHARS) if lowercase: s = s.lower() s = collapse_whitespace(s) return s.strip() def filter_short(lines: List[str], min_len: int = 1) -> List[str]: return [s for s in lines if len(s) >= min_len]