"""Text normalization and tokenization helpers."""

import re
import unicodedata
from typing import Iterable

from .config import STOPWORDS

# Letters accepted inside a token: ASCII letters plus the accented letters of
# the Latin-1 Supplement block. The accented range is split in two on purpose:
# a plain "À-ÿ" range would also include the multiplication sign (U+00D7) and
# the division sign (U+00F7), which are symbols, not letters.
_LETTER = r"[a-zA-ZÀ-ÖØ-öø-ÿ]"

# A token is a run of letters, optionally followed by ONE hyphen or ASCII
# apostrophe and another run of letters (e.g. "well-known", "l'eau").
TOKEN_RE = re.compile(rf"{_LETTER}+(?:[-']{_LETTER}+)?")

# Lowercased label -> canonical capitalized spelling.
_CANONICAL_LABELS = {
    "positif": "Positif",
    "negatif": "Negatif",
    "netral": "Netral",
}


def normalize_text(value: object) -> str:
    """Return *value* as a whitespace-normalized, NFKC-normalized string.

    Args:
        value: Any object; it is converted with ``str()``. ``None`` is
            treated as empty text.

    Returns:
        The NFKC-normalized text with every run of whitespace collapsed to a
        single space and leading/trailing whitespace removed. Returns ``""``
        when *value* is ``None``.
    """
    if value is None:
        return ""
    text = unicodedata.normalize("NFKC", str(value))
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def normalize_label(value: object) -> str:
    """Map a label to its canonical spelling.

    The label is normalized with :func:`normalize_text` and compared
    case-insensitively against the known labels ``positif``, ``negatif`` and
    ``netral``.

    Args:
        value: The raw label; ``None`` is treated as empty text.

    Returns:
        ``"Positif"``, ``"Negatif"`` or ``"Netral"`` for a known label;
        otherwise the normalized text with its original casing.
    """
    text = normalize_text(value)
    return _CANONICAL_LABELS.get(text.lower(), text)


def tokenize(text: str, *, remove_stopwords: bool = False, min_len: int = 2) -> list[str]:
    """Split *text* into lowercase word tokens.

    The text is first passed through :func:`normalize_text`; tokens are the
    non-overlapping matches of ``TOKEN_RE``, lowercased.

    Args:
        text: The text to tokenize.
        remove_stopwords: When true, drop tokens found in ``STOPWORDS``.
        min_len: Minimum length (in characters) a lowercased token must have
            to be kept.

    Returns:
        The tokens in order of appearance.
    """
    lowered = (m.group(0).lower() for m in TOKEN_RE.finditer(normalize_text(text)))
    tokens = [t for t in lowered if len(t) >= min_len]
    if remove_stopwords:
        # NOTE(review): STOPWORDS comes from .config; presumably a set-like
        # collection of lowercase words -- confirm against that module.
        tokens = [t for t in tokens if t not in STOPWORDS]
    return tokens


def tokenized_documents(
    texts: Iterable[str],
    *,
    remove_stopwords: bool = False,
    min_len: int = 2,
) -> list[list[str]]:
    """Tokenize every text in *texts* with :func:`tokenize`.

    Args:
        texts: The documents to tokenize.
        remove_stopwords: Forwarded to :func:`tokenize`.
        min_len: Forwarded to :func:`tokenize`; defaults to the same value
            as ``tokenize`` itself.

    Returns:
        One token list per input text, in input order.
    """
    return [
        tokenize(text, remove_stopwords=remove_stopwords, min_len=min_len)
        for text in texts
    ]


def compact_for_key(text: str) -> str:
    """Return the tokens of *text* joined by single spaces.

    Stopwords are kept and single-character tokens are included
    (``min_len=1``), so the result is the lowercase word content of *text*
    with everything ``TOKEN_RE`` does not match removed. Presumably used as
    a comparison/lookup key -- confirm against callers.
    """
    return " ".join(tokenize(text, remove_stopwords=False, min_len=1))