# preprocessing.py
import re
import string
import unicodedata

# Ordered rewrite rules used by normalize(); each entry is
# (compiled pattern, replacement) and the order is significant.
_NORMALIZE_RULES = (
    # Tokens starting at "http" or "www", up to the next whitespace.
    (re.compile(r"http\S+|www\S+"), " "),
    # E-mail-like tokens: any whitespace-free run containing "@".
    (re.compile(r"\S+@\S+"), " "),
    # ASCII punctuation only (string.punctuation), so emoji are left intact.
    (re.compile(f"[{re.escape(string.punctuation)}]"), " "),
    # A letter repeated 3+ times in a row is shortened to exactly 2:
    # "ngonnnnn" -> "ngonn", "vuiiiii" -> "vuii"; "cc" and "đmm" are unchanged.
    (
        re.compile(
            r'([a-zàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ])\1{2,}'
        ),
        r"\1\1",
    ),
    # Any run of whitespace becomes a single space.
    (re.compile(r"\s+"), " "),
)


def normalize(text: str) -> str:
    """Return a lower-cased, cleaned-up version of *text*.

    The input is converted with ``str()`` and lower-cased, then the
    following rewrites are applied in order:

    1. URLs and e-mail-like tokens are replaced by a space.
    2. ASCII punctuation is replaced by a space (emoji are kept).
    3. Runs of three or more identical letters (ASCII or lower-case
       Vietnamese) are shortened to two.
    4. Whitespace runs are collapsed to one space.

    Leading and trailing whitespace is stripped from the result.
    """
    cleaned = str(text).lower()
    for pattern, replacement in _NORMALIZE_RULES:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()


# 🔥 HÀM CHÍNH ĐƯỢC APP.PY GỌI SỬ DỤNG
def preprocess_text_for_Visobert(text: str) -> str:
    """Convert *text* to Unicode NFC form and clean it with ``normalize``.

    The input is first passed through ``str()``. NFC composition makes
    decomposed (base letter + combining mark) and precomposed Vietnamese
    characters share one representation before the cleaning rules run.

    NOTE(review): the original comment says ``normalize`` must stay fully
    in sync with the Colab preprocessing; confirm before changing it.
    """
    composed = unicodedata.normalize("NFC", str(text))
    return normalize(composed)