from __future__ import annotations
import re
import unicodedata as ud
from typing import List

from . import config as CFG

# C0 control characters plus DEL (\x7F). Tab (\x09), line feed (\x0A) and
# carriage return (\x0D) are deliberately left out of the class.
CTRL_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
# A run of two or more characters drawn from space, tab, form feed and
# carriage return. Newline is not part of the class, and a single such
# character on its own does not match.
MULTI_WS_RE = re.compile(r"[ \t\f\r]{2,}")
# One captured character followed by four or more copies of itself, i.e. a run
# of five or more identical characters. "." is compiled without DOTALL, so a
# run of newlines never matches.
REPEAT_CHAR_RE = re.compile(r"(.)\1{4,}")  # 5+ -> cap to 4 total
# Patterns come from the project config (CFG.PROTECT_REGEX, keys "url" and
# "user"); their exact contents are not visible in this module.
URL_RE = re.compile(CFG.PROTECT_REGEX["url"])
USER_RE = re.compile(CFG.PROTECT_REGEX["user"])


def nfc(text: str) -> str:
    """Return *text* normalized to Unicode Normalization Form C (NFC)."""
    composed = ud.normalize("NFC", text)
    return composed


def strip_control(text: str) -> str:
    """Return *text* with every substring matched by ``CTRL_RE`` deleted."""
    # subn() performs the same substitution as sub(); the count is not needed.
    cleaned, _removed = CTRL_RE.subn("", text)
    return cleaned


def cap_repeats(text: str, max_repeat: int = CFG.MAX_REPEAT_CHARS) -> str:
    """Shorten every run of one repeated character to at most *max_repeat*.

    A run is a character immediately followed by copies of itself. Runs longer
    than *max_repeat* are replaced by exactly *max_repeat* copies; shorter runs
    are left untouched. Newlines are never treated as part of a run because
    ``.`` is used without ``re.DOTALL``.

    Args:
        text: The string to process.
        max_repeat: Maximum allowed run length; must be at least 1.

    Returns:
        The text with over-long runs truncated.

    Raises:
        ValueError: If *max_repeat* is smaller than 1.
    """
    if max_repeat < 1:
        raise ValueError(f"max_repeat must be >= 1, got {max_repeat!r}")
    # Build the pattern from max_repeat so the match threshold and the
    # replacement length always agree: one captured character followed by at
    # least `max_repeat` copies is a run of max_repeat + 1 or more. A fixed
    # threshold would miss shorter over-long runs for small caps and would
    # lengthen runs for large caps. The re module caches compiled patterns, so
    # repeated calls with the same cap do not recompile.
    run_re = re.compile(r"(.)\1{%d,}" % max_repeat)
    return run_re.sub(lambda m: m.group(1) * max_repeat, text)


def replace_placeholders(text: str) -> str:
    """Substitute config placeholders for URL and user matches in *text*.

    Matches of ``URL_RE`` are replaced with ``CFG.PLACEHOLDERS["URL"]`` first,
    then matches of ``USER_RE`` with ``CFG.PLACEHOLDERS["USER"]``.
    """
    # Order matters: URL substitution runs before user substitution.
    substitutions = (
        (URL_RE, "URL"),
        (USER_RE, "USER"),
    )
    result = text
    for pattern, key in substitutions:
        result = pattern.sub(CFG.PLACEHOLDERS[key], result)
    return result


def collapse_whitespace(text: str) -> str:
    """Squeeze horizontal whitespace line by line and drop empty lines.

    The text is split on ``"\\n"``. Within each line, every match of
    ``MULTI_WS_RE`` (a run of two or more whitespace characters from its
    class) becomes a single space, and the line is stripped at both ends.
    Lines that end up empty are discarded; the rest are rejoined with
    ``"\\n"``.
    """
    kept = []
    for raw_line in text.split("\n"):
        squeezed = MULTI_WS_RE.sub(" ", raw_line).strip()
        if squeezed != "":
            kept.append(squeezed)
    return "\n".join(kept)


def clean_text(text: str, lowercase: bool | None = None) -> str:
    """Run the full normalization pipeline over *text*.

    Steps, in order: NFC normalization, control-character removal,
    URL/user placeholder substitution, optional lowercasing, capping of
    repeated characters, and whitespace collapsing.

    Args:
        text: The raw input. Anything that is not a ``str`` yields ``""``.
        lowercase: Whether to lowercase the text. ``None`` (the default)
            defers to ``CFG.LOWERCASE``.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    if lowercase is None:
        lowercase = CFG.LOWERCASE
    if not isinstance(text, str):
        return ""
    s = nfc(text)
    s = strip_control(s)
    s = replace_placeholders(s)
    # Lowercase *before* capping repeats: a mixed-case run such as "OOOooo"
    # is not one run to the repeat regex, but after lowercasing it would be
    # six identical characters that escaped the cap.
    if lowercase:
        s = s.lower()
    s = cap_repeats(s, CFG.MAX_REPEAT_CHARS)
    s = collapse_whitespace(s)
    return s.strip()


def filter_short(lines: List[str], min_len: int = 1) -> List[str]:
    """Return the entries of *lines* whose length is at least *min_len*.

    Input order is preserved and a new list is always returned.
    """
    kept = []
    for entry in lines:
        if len(entry) >= min_len:
            kept.append(entry)
    return kept