File size: 821 Bytes
30a2ab4
 
 
 
 
 
 
 
 
5c4388c
 
30a2ab4
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# src/preprocessing.py
import re
import string
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Built once at import time so clean_text does not redo this work per call.
# The original pattern was r"http\S+|www\S+|https\S+"; its last alternative
# could never match on its own because "http\S+" already consumes every
# token that starts with "https", so it is dropped here without changing output.
# NOTE(review): the pattern has no word boundary or scheme check, so any run
# beginning at "http" or "www" (even mid-word, e.g. "awwww") is stripped up
# to the next whitespace. Kept as-is to preserve existing output.
_URL_RE = re.compile(r"http\S+|www\S+")
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text: str) -> str:
    """Return a lower-cased, de-noised version of *text*.

    Steps, applied in this order:

    1. Lower-case the input.
    2. Delete URL-like tokens (anything from ``http`` or ``www`` up to the
       next whitespace).
    3. Replace each run of digits with a single space.
    4. Delete every character in ``string.punctuation`` (ASCII punctuation
       only). Characters are removed, not replaced, so ``"don't"`` becomes
       ``"dont"``.
    5. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    text = text.lower()
    text = _URL_RE.sub("", text)
    # Digits become a space (not "") so "abc123def" splits into two tokens.
    text = _DIGITS_RE.sub(" ", text)
    text = text.translate(_PUNCT_TABLE)
    return _WHITESPACE_RE.sub(" ", text).strip()

def preprocess_texts(texts: List[str]) -> List[str]:
    """Clean every string in *texts* with ``clean_text``.

    Args:
        texts: The strings to clean.

    Returns:
        A new list holding the cleaned strings, in the same order as the
        input.
    """
    return list(map(clean_text, texts))

def build_vectorizer(texts, max_features=15000, ngram_range=(1, 2)):
    """Fit a ``TfidfVectorizer`` on *texts* and return it with the result.

    Args:
        texts: The documents passed to ``TfidfVectorizer.fit_transform``.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to
            ``TfidfVectorizer(ngram_range=...)``. The default ``(1, 2)``
            is the value that was previously hard-coded, so existing
            callers get the same result.

    Returns:
        A ``(vectorizer, X)`` tuple: the fitted vectorizer and the matrix
        returned by its ``fit_transform`` call on *texts*.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
    )
    X = vectorizer.fit_transform(texts)
    return vectorizer, X

def save_vectorizer(vectorizer, path="model/vectorizer.joblib"):
    """Persist *vectorizer* to *path* using ``joblib.dump``.

    If *path* is a filesystem path (``str`` or ``os.PathLike``), its parent
    directory is created first when missing, so the default
    ``model/vectorizer.joblib`` works in a fresh checkout instead of raising
    ``FileNotFoundError``. Any other kind of *path* (e.g. an open file
    object) is handed to ``joblib.dump`` untouched.

    Args:
        vectorizer: The object to serialize.
        path: Destination passed to ``joblib.dump``.

    Returns:
        None.
    """
    # Function-scope import: the module's top-level imports do not include os.
    import os

    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        # An empty parent means a bare filename in the current directory.
        if parent:
            os.makedirs(parent, exist_ok=True)
    joblib.dump(vectorizer, path)