spam-filter-app / utils /preprocessing.py
arifa-batool's picture
Update utils/preprocessing.py
d7a9803 verified
raw
history blame contribute delete
674 Bytes
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# NLTK data packages this module relies on, mapped from the downloader
# package id to the resource path used to look the package up locally.
_NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
}


def _ensure_nltk_resources() -> None:
    """Download each required NLTK package only when it is not installed.

    Calling ``nltk.download()`` unconditionally on every import contacts the
    NLTK index and prints to the console even when the data is already
    present. Looking the resource up first keeps repeat imports quiet and
    lets them work offline; a failed lookup falls back to the download.
    """
    for package, resource_path in _NLTK_RESOURCES.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)


_ensure_nltk_resources()

# Shared, build-once helpers used by preprocess_text: a set gives O(1)
# stop-word membership tests, and one lemmatizer instance is reused.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text: str) -> str:
    """Normalize raw text into a space-joined string of lemmatized tokens.

    The text is lowercased, every character that is not an ASCII letter,
    a digit, or whitespace is replaced by a space, and the result is
    tokenized with NLTK. Tokens found in ``stop_words`` or contained in
    ``string.punctuation`` are dropped; the rest are lemmatized.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives).
    """
    normalized = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())

    lemmas = []
    for token in nltk.word_tokenize(normalized):
        # Guard clause: skip stop words and punctuation-only tokens.
        if token in stop_words or token in string.punctuation:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)