|
|
|
|
|
import os
import re
import string
from typing import List

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
def clean_text(text: str) -> str:
    """Normalize a raw string for vectorization.

    Lowercases the text, strips URL-like tokens, replaces digit runs with
    spaces, removes ASCII punctuation, and collapses whitespace runs into
    single spaces with no leading/trailing space.
    """
    lowered = text.lower()
    # Drop anything that looks like a URL (http://, https://, or www. prefixes).
    no_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    # Replace digit runs with a space so numbers don't glue adjacent words together.
    no_digits = re.sub(r"\d+", " ", no_urls)
    # Strip every ASCII punctuation character in a single translate pass.
    no_punct = no_digits.translate(str.maketrans("", "", string.punctuation))
    # Collapse all whitespace runs and trim both ends.
    return re.sub(r"\s+", " ", no_punct).strip()
|
|
|
|
|
def preprocess_texts(texts: List[str]) -> List[str]:
    """Run clean_text over every string in *texts*, preserving order."""
    return list(map(clean_text, texts))
|
|
|
|
|
def build_vectorizer(texts, max_features=15000):
    """Fit a unigram+bigram TF-IDF vectorizer on *texts*.

    Returns the fitted vectorizer together with the resulting
    document-term matrix.
    """
    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
    matrix = tfidf.fit_transform(texts)
    return tfidf, matrix
|
|
|
|
|
def save_vectorizer(vectorizer, path="model/vectorizer.joblib"):
    """Persist a fitted vectorizer to *path* with joblib.

    Creates the parent directory if it does not yet exist — joblib.dump
    raises FileNotFoundError otherwise (the default "model/" directory is
    not guaranteed to exist on a fresh checkout).
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    joblib.dump(vectorizer, path)
|
|
|