Twitch-BPE / src /clean.py
Soldier-Boy's picture
create: src files
c6e5251 verified
from __future__ import annotations
import re
import unicodedata as ud
from typing import List
from . import config as CFG
# C0 control characters and DEL, excluding tab (\x09), line feed (\x0A)
# and carriage return (\x0D).
CTRL_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
# Two or more consecutive spaces, tabs, form feeds or carriage returns.
# "\n" is not in the class, so a match never spans a line break.
MULTI_WS_RE = re.compile(r"[ \t\f\r]{2,}")
# One character followed by four or more copies of itself, i.e. a run of
# five or more. "." does not match "\n" here (no DOTALL flag).
REPEAT_CHAR_RE = re.compile(r"(.)\1{4,}") # 5+ -> cap to 4 total
# Patterns stored under the "url" and "user" keys of CFG.PROTECT_REGEX.
URL_RE = re.compile(CFG.PROTECT_REGEX["url"])
USER_RE = re.compile(CFG.PROTECT_REGEX["user"])
def nfc(text: str) -> str:
    """Return *text* converted to Unicode Normalization Form C."""
    composed = ud.normalize("NFC", text)
    return composed
def strip_control(text: str) -> str:
    """Return *text* with every character matched by ``CTRL_RE`` deleted."""
    return re.sub(CTRL_RE, "", text)
def cap_repeats(text: str, max_repeat: int | None = None) -> str:
    """Shorten runs of one repeated character to at most *max_repeat* copies.

    The match threshold is derived from *max_repeat*, so every run that is
    strictly longer than the cap is shortened to exactly *max_repeat*
    characters, and runs at or below the cap are never touched.

    Args:
        text: String to process.
        max_repeat: Longest run to keep. ``None`` (the default) reads
            ``CFG.MAX_REPEAT_CHARS`` at call time.

    Returns:
        *text* with over-long runs shortened.

    Raises:
        ValueError: If *max_repeat* is smaller than 1.
    """
    if max_repeat is None:
        max_repeat = CFG.MAX_REPEAT_CHARS
    if max_repeat < 1:
        raise ValueError(f"max_repeat must be >= 1, got {max_repeat}")

    # One character followed by `max_repeat` or more copies of itself is a
    # run strictly longer than the cap. "." does not match "\n", so runs of
    # newlines are left alone. `re` caches compiled patterns, so building
    # the pattern per call is cheap.
    run_re = re.compile(r"(.)\1{%d,}" % max_repeat)
    return run_re.sub(lambda m: m.group(1) * max_repeat, text)
def replace_placeholders(text: str) -> str:
    """Swap ``URL_RE`` matches, then ``USER_RE`` matches, for placeholders.

    The replacement strings are ``CFG.PLACEHOLDERS["URL"]`` and
    ``CFG.PLACEHOLDERS["USER"]``; URL substitution is applied first.
    """
    substitutions = (
        (URL_RE, CFG.PLACEHOLDERS["URL"]),
        (USER_RE, CFG.PLACEHOLDERS["USER"]),
    )
    for pattern, placeholder in substitutions:
        text = pattern.sub(placeholder, text)
    return text
def collapse_whitespace(text: str) -> str:
    """Squeeze whitespace runs within each line and drop empty lines.

    The text is split on "\\n". In every piece, each ``MULTI_WS_RE`` match
    becomes a single space and the piece is stripped of surrounding
    whitespace. Pieces that end up empty are discarded, and the rest are
    re-joined with "\\n".
    """
    squeezed = (
        MULTI_WS_RE.sub(" ", line).strip() for line in text.split("\n")
    )
    # filter(None, ...) keeps only the non-empty strings.
    return "\n".join(filter(None, squeezed))
def clean_text(text: str, lowercase: bool | None = None) -> str:
    """Run the full cleaning pipeline on one string.

    Steps, in order: NFC normalisation, control-character removal,
    placeholder substitution, optional lowercasing, repeat capping and
    whitespace collapsing.

    Args:
        text: Raw input. Any value that is not a ``str`` yields ``""``.
        lowercase: Whether to lowercase the text. ``None`` (the default)
            defers to ``CFG.LOWERCASE``.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    if lowercase is None:
        lowercase = CFG.LOWERCASE
    if not isinstance(text, str):
        return ""

    s = nfc(text)
    s = strip_control(s)
    s = replace_placeholders(s)
    # NOTE(review): lowercasing also applies to the placeholders inserted
    # above; confirm that downstream consumers expect that.
    if lowercase:
        s = s.lower()
    # Cap repeats only after lowercasing. Otherwise a mixed-case run such
    # as "OOOooo" is not seen as one run and comes out as an uncapped
    # "oooooo".
    s = cap_repeats(s, CFG.MAX_REPEAT_CHARS)
    s = collapse_whitespace(s)
    return s.strip()
def filter_short(lines: List[str], min_len: int = 1) -> List[str]:
    """Return the strings in *lines* whose length is at least *min_len*.

    Input order is preserved and a new list is returned.
    """
    def long_enough(line: str) -> bool:
        return len(line) >= min_len

    return list(filter(long_enough, lines))