Spaces:

Soldier-Boy
/

Twitch-BPE

Sleeping

App Files Files Community

Twitch-BPE / src /clean.py

Soldier-Boy

create: src files

c6e5251 verified about 2 months ago

raw

history blame contribute delete

1.82 kB

	from __future__ import annotations
	import re
	import unicodedata as ud
	from typing import List

	from . import config as CFG

	# C0 control characters plus DEL (\x7F); tab (\x09), newline (\x0A) and
	# carriage return (\x0D) are deliberately left out of the class.
	CTRL_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
	# A run of two or more spaces, tabs, form feeds or carriage returns.
	# Newlines are not in the class, and a single such character does not match.
	MULTI_WS_RE = re.compile(r"[ \t\f\r]{2,}")
	# One character followed by four or more copies of itself, i.e. a run of
	# five or more; "." does not match newlines.
	REPEAT_CHAR_RE = re.compile(r"(.)\1{4,}") # 5+ -> cap to 4 total
	# Compiled from the "url" and "user" entries of CFG.PROTECT_REGEX.
	URL_RE = re.compile(CFG.PROTECT_REGEX["url"])
	USER_RE = re.compile(CFG.PROTECT_REGEX["user"])


	def nfc(text: str) -> str:
	    """Return *text* in Unicode Normalization Form C."""
	    normalized = ud.normalize("NFC", text)
	    return normalized


	def strip_control(text: str) -> str:
	    """Delete every character of *text* that ``CTRL_RE`` matches."""
	    without_controls = CTRL_RE.sub("", text)
	    return without_controls


	def cap_repeats(text: str, max_repeat: int | None = None) -> str:
	    """Shorten runs of one repeated character to at most *max_repeat*.

	    Args:
	        text: String to process.
	        max_repeat: Longest run allowed in the result. ``None`` (the
	            default) reads ``CFG.MAX_REPEAT_CHARS`` at call time.
	            Values below 1 disable capping and return *text* unchanged.

	    Returns:
	        *text* with every run longer than *max_repeat* cut down to exactly
	        *max_repeat* characters. Newlines are never part of a run because
	        ``.`` does not match them.
	    """
	    if max_repeat is None:
	        max_repeat = CFG.MAX_REPEAT_CHARS
	    if max_repeat < 1:
	        return text
	    # Build the pattern from max_repeat instead of a fixed "5 or more"
	    # threshold: with a fixed threshold, a cap below 4 leaves runs of
	    # cap+1..4 untouched and a cap above 4 would lengthen a run of 5.
	    # For max_repeat == 4 this is the same pattern as before.
	    run_re = re.compile(r"(.)\1{%d,}" % max_repeat)
	    return run_re.sub(lambda m: m.group(1) * max_repeat, text)


	def replace_placeholders(text: str) -> str:
	    """Substitute configured placeholders for protected patterns.

	    Matches of ``URL_RE`` are replaced with ``CFG.PLACEHOLDERS["URL"]``
	    first, then matches of ``USER_RE`` with ``CFG.PLACEHOLDERS["USER"]``.
	    """
	    substitutions = ((URL_RE, "URL"), (USER_RE, "USER"))
	    for pattern, placeholder_key in substitutions:
	        text = pattern.sub(CFG.PLACEHOLDERS[placeholder_key], text)
	    return text


	def collapse_whitespace(text: str) -> str:
	    """Squeeze horizontal whitespace line by line and drop empty lines.

	    The text is split on ``"\\n"``. Within each line, every match of
	    ``MULTI_WS_RE`` becomes one space and the line is stripped; lines
	    that end up empty are discarded. The surviving lines are joined
	    with ``"\\n"`` again. A single tab inside a line is not a
	    ``MULTI_WS_RE`` match and is therefore left as is.
	    """
	    kept_lines = []
	    for raw_line in text.split("\n"):
	        squeezed = MULTI_WS_RE.sub(" ", raw_line).strip()
	        if squeezed:
	            kept_lines.append(squeezed)
	    return "\n".join(kept_lines)


	def clean_text(text: str, lowercase: bool | None = None) -> str:
	    """Run the full cleaning pipeline over one piece of text.

	    Steps, in order: NFC normalization, control-character removal,
	    placeholder substitution, repeated-character capping, optional
	    lowercasing, whitespace collapsing and a final strip.

	    Args:
	        text: Input to clean. Anything that is not a ``str`` yields ``""``.
	        lowercase: Whether to lowercase the text. ``None`` (the default)
	            defers to ``CFG.LOWERCASE``.

	    Returns:
	        The cleaned string.
	    """
	    if lowercase is None:
	        lowercase = CFG.LOWERCASE
	    if not isinstance(text, str):
	        return ""
	    s = text
	    s = nfc(s)
	    s = strip_control(s)
	    s = replace_placeholders(s)
	    s = cap_repeats(s, CFG.MAX_REPEAT_CHARS)
	    # NOTE(review): lowercasing happens after placeholder substitution and
	    # repeat capping, so placeholder text is lowercased as well and runs
	    # that only become uniform after lowercasing are not capped. Confirm
	    # that this order is intended.
	    if lowercase:
	        s = s.lower()
	    s = collapse_whitespace(s)
	    return s.strip()


	def filter_short(lines: List[str], min_len: int = 1) -> List[str]:
	    """Return the strings from *lines* whose length is at least *min_len*.

	    Order is preserved and the input list is not modified.
	    """
	    long_enough: List[str] = []
	    for candidate in lines:
	        if len(candidate) < min_len:
	            continue
	        long_enough.append(candidate)
	    return long_enough