| # src/preprocessing.py | |
| import re | |
| import string | |
| from typing import List | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| import joblib | |
# Built once at import time; clean_text runs once per document, so rebuilding
# these on every call is wasted work.
# NOTE: the original pattern had a third alternative, ``https\S+``, which could
# never match because ``http\S+`` already consumes any "https..." token.
_URL_RE = re.compile(r"http\S+|www\S+")
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")
_STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)


def clean_text(text: str) -> str:
    """Normalize a raw string into lowercase, space-separated word tokens.

    Steps, applied in this order:

    1. Lowercase the text.
    2. Delete URL-like tokens (anything starting with ``http`` or ``www``
       up to the next whitespace).
    3. Replace every run of digits with a single space.
    4. Delete every character in ``string.punctuation`` (ASCII punctuation
       only).
    5. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if nothing survives cleaning.

    >>> clean_text("Hello, World! See https://example.com 42 times")
    'hello world see times'
    """
    text = text.lower()
    # URLs must go before punctuation stripping, otherwise "https://a.b"
    # would already have been mangled into an ordinary-looking word.
    text = _URL_RE.sub("", text)
    text = _DIGITS_RE.sub(" ", text)
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab"
    # and "don't" becomes "dont".
    text = text.translate(_STRIP_PUNCTUATION)
    return _WHITESPACE_RE.sub(" ", text).strip()
def preprocess_texts(texts: List[str]) -> List[str]:
    """Run ``clean_text`` over every entry of ``texts``.

    Args:
        texts: The raw strings to clean.

    Returns:
        A new list holding the cleaned strings in input order.
    """
    cleaned = []
    for raw in texts:
        cleaned.append(clean_text(raw))
    return cleaned
def build_vectorizer(texts, max_features=15000):
    """Fit a TF-IDF vectorizer on ``texts`` and transform them in one pass.

    The vectorizer is configured with ``ngram_range=(1, 2)`` (unigrams and
    bigrams) and the given ``max_features`` limit.

    Args:
        texts: The documents to fit on, passed straight to ``fit_transform``.
        max_features: Forwarded to ``TfidfVectorizer`` as ``max_features``.

    Returns:
        A ``(vectorizer, X)`` tuple: the fitted vectorizer and the value
        returned by its ``fit_transform(texts)`` call.
    """
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=max_features)
    features = tfidf.fit_transform(texts)
    return tfidf, features
def save_vectorizer(vectorizer, path="model/vectorizer.joblib"):
    """Persist ``vectorizer`` to ``path`` with ``joblib.dump``.

    When ``path`` is a filesystem path, its parent directory is created
    first if missing, so the default ``model/`` location works on a fresh
    checkout instead of raising ``FileNotFoundError``.

    Args:
        vectorizer: The object to serialize.
        path: Destination handed to ``joblib.dump``. Anything that is not a
            ``str`` or ``os.PathLike`` (e.g. an open file object) is passed
            through untouched.
    """
    import os

    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        # A bare filename has no parent component; nothing to create then.
        if parent:
            os.makedirs(parent, exist_ok=True)
    joblib.dump(vectorizer, path)