File size: 821 Bytes
30a2ab4 5c4388c 30a2ab4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# src/preprocessing.py
import re
import string
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
# Built once at import time so clean_text does not rebuild them on every call.
# "http\S+" already covers "https..." links, so no separate https alternative
# is needed. "www" is anchored at a word boundary so that elongated words such
# as "awwww" are not mistaken for URLs and truncated.
_URL_PATTERN = re.compile(r"http\S+|\bwww\S+")
_DIGITS_PATTERN = re.compile(r"\d+")
_WHITESPACE_PATTERN = re.compile(r"\s+")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text: str) -> str:
    """Normalize a raw text string for vectorization.

    The steps run in this order:

    1. Lowercase the text.
    2. Remove URL-like tokens: anything starting with "http", or a word
       starting with "www", up to the next whitespace.
    3. Replace each run of digits with a single space.
    4. Delete every character in ``string.punctuation`` (ASCII punctuation
       only). Characters are deleted, not replaced, so "don't" becomes
       "dont".
    5. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.
    """
    text = text.lower()
    text = _URL_PATTERN.sub("", text)
    text = _DIGITS_PATTERN.sub(" ", text)
    text = text.translate(_PUNCTUATION_TABLE)
    return _WHITESPACE_PATTERN.sub(" ", text).strip()
def preprocess_texts(texts: List[str]) -> List[str]:
    """Clean every document in ``texts``.

    Each entry is passed through ``clean_text``; the results are returned
    as a new list in the same order as the input.
    """
    return list(map(clean_text, texts))
def build_vectorizer(texts, max_features=15000, *, ngram_range=(1, 2)):
    """Fit a TF-IDF vectorizer on ``texts`` and transform them in one pass.

    Args:
        texts: The documents to fit on; handed unchanged to
            ``TfidfVectorizer.fit_transform``.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        ngram_range: ``(min_n, max_n)`` forwarded to
            ``TfidfVectorizer(ngram_range=...)``. Keyword-only. The default
            ``(1, 2)`` is the value that used to be hard-coded here.

    Returns:
        A ``(vectorizer, X)`` tuple: the fitted ``TfidfVectorizer`` and the
        matrix returned by ``fit_transform(texts)``.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
    )
    X = vectorizer.fit_transform(texts)
    return vectorizer, X
def save_vectorizer(vectorizer, path="model/vectorizer.joblib"):
    """Persist ``vectorizer`` to ``path`` using ``joblib.dump``.

    When ``path`` is a string or path-like object, its parent directory is
    created first if it does not exist. Without this, saving to the default
    "model/vectorizer.joblib" fails when "model/" is missing. Any other kind
    of target (for example an open file object) is passed to ``joblib.dump``
    untouched.

    Args:
        vectorizer: The object to serialize.
        path: Destination passed to ``joblib.dump``.
    """
    import os  # local import: only needed for the directory handling below

    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        # An empty parent means the target is in the current directory.
        if parent:
            os.makedirs(parent, exist_ok=True)
    joblib.dump(vectorizer, path)
|