# seedflora's picture
# Initial Space upload from matchaSentiment repo
# 4ce2b3e verified
import re
import unicodedata
from typing import Iterable
from .config import STOPWORDS
# Word pattern used by the tokenizer: a run of letters, optionally followed by
# a single hyphen or apostrophe and another run of letters ("deja-vu", "don't").
# "Letters" here are ASCII a-z/A-Z plus the Latin-1 block U+00C0-U+00FF
# (accented characters such as e-acute or u-umlaut).
# The range is written with \u escapes on purpose: the literal characters were
# previously corrupted by a bad re-encoding, which produced a reversed character
# range and made re.compile() fail at import time.
# NOTE(review): U+00C0-U+00FF also contains the multiplication sign (U+00D7)
# and the division sign (U+00F7); they are kept to preserve the original range.
TOKEN_RE = re.compile(
    r"[a-zA-Z\u00C0-\u00FF]+(?:[-'][a-zA-Z\u00C0-\u00FF]+)?"
)
def normalize_text(value: object) -> str:
    """Return *value* as a cleaned-up single-line string.

    ``None`` becomes the empty string. Any other value is converted with
    ``str()``, normalized to Unicode NFKC, has every run of whitespace
    collapsed into one space, and is stripped at both ends.
    """
    if value is None:
        return ""
    folded = unicodedata.normalize("NFKC", str(value))
    # Collapse runs of whitespace (including newlines and tabs) to one space.
    return re.sub(r"\s+", " ", folded).strip()
def normalize_label(value: object) -> str:
    """Return the canonical spelling of a label.

    The value is cleaned with ``normalize_text`` and compared
    case-insensitively against the three known labels; a match returns the
    capitalized form ("Positif", "Negatif", "Netral"). Any other value is
    returned as its cleaned text with its original casing.
    """
    canonical = {
        "positif": "Positif",
        "negatif": "Negatif",
        "netral": "Netral",
    }
    cleaned = normalize_text(value)
    # Unknown labels fall through unchanged (apart from text normalization).
    return canonical.get(cleaned.lower(), cleaned)
def tokenize(text: str, *, remove_stopwords: bool = False, min_len: int = 2) -> list[str]:
    """Split *text* into lowercase word tokens.

    The text is first cleaned with ``normalize_text``; every ``TOKEN_RE``
    match is lowercased and kept in order of appearance.

    Args:
        text: Raw input text.
        remove_stopwords: When true, drop tokens that appear in ``STOPWORDS``.
        min_len: Minimum token length (in characters) for a token to be kept.

    Returns:
        The surviving tokens, in their original order.
    """
    kept: list[str] = []
    for match in TOKEN_RE.finditer(normalize_text(text)):
        word = match.group(0).lower()
        # The length filter applies to the lowercased token.
        if len(word) < min_len:
            continue
        if remove_stopwords and word in STOPWORDS:
            continue
        kept.append(word)
    return kept
def tokenized_documents(texts: Iterable[str], *, remove_stopwords: bool = False) -> list[list[str]]:
    """Tokenize every text in *texts*.

    Each item is passed to ``tokenize`` with the given ``remove_stopwords``
    flag (and ``tokenize``'s default minimum length). The result has one
    token list per input text, in input order.
    """
    documents: list[list[str]] = []
    for raw in texts:
        documents.append(tokenize(raw, remove_stopwords=remove_stopwords))
    return documents
def compact_for_key(text: str) -> str:
    """Return the tokens of *text* joined by single spaces.

    Tokenization keeps stopwords and single-character tokens
    (``remove_stopwords=False``, ``min_len=1``), so the result is the
    lowercase word sequence of the text with everything that is not a
    token removed.
    """
    key_tokens = tokenize(text, remove_stopwords=False, min_len=1)
    return " ".join(key_tokens)