# CrisisAI-Sentinal / app/data/preprocessor.py
# karthik119's picture
# Deploy CrisisAI Sentinel - Deep Learning NLP Dashboard
# 6328796 verified
"""
NLP Preprocessing pipeline for social media text.
Handles cleaning, tokenization, stopword removal, lemmatization, TF-IDF vectorization.
"""
import re
import string
import numpy as np
import pandas as pd
from datetime import datetime
# NLTK is optional: when it cannot be imported the pipeline falls back to
# plain whitespace tokenization and skips stop-word removal / lemmatization.
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    # Best-effort fetch of the resources used below. A failed download must
    # not prevent the module from loading, but only ordinary errors are
    # suppressed so that Ctrl-C / SystemExit still interrupt a stuck download.
    for pkg in ['punkt', 'stopwords', 'wordnet', 'punkt_tab']:
        try:
            nltk.download(pkg, quiet=True)
        except Exception:
            pass
    NLTK_AVAILABLE = True
except Exception:
    NLTK_AVAILABLE = False
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
class TextPreprocessor:
    """Clean, tokenize, lemmatize and vectorize social media text.

    NLTK is used for tokenization, stop-word removal and lemmatization when
    it is available; otherwise the pipeline degrades to whitespace splitting
    with no stop-word list and no lemmatization.
    """

    # Words that are never dropped by `tokenize`: they are removed from the
    # stop-word list and are exempt from the minimum-length filter, because
    # they matter for disaster detection (e.g. the negation "no").
    DOMAIN_KEEP = frozenset(
        {'not', 'no', 'very', 'too', 'more', 'most', 'need', 'help', 'urgent'}
    )

    # Patterns used by `clean_text` / `extract_features`, compiled once.
    _URL_RE = re.compile(r'http\S+|www\S+')
    _MENTION_RE = re.compile(r'@\w+')
    _HASHTAG_RE = re.compile(r'#(\w+)')
    _NON_WORD_RE = re.compile(r'[^\w\s]')
    _DIGITS_RE = re.compile(r'\d+')
    _WHITESPACE_RE = re.compile(r'\s+')
    _HTTP_URL_RE = re.compile(r'http\S+')

    def __init__(self):
        # Capture NLTK availability on the instance so the other methods
        # depend only on instance state.
        self._nltk_enabled = NLTK_AVAILABLE
        self.lemmatizer = WordNetLemmatizer() if NLTK_AVAILABLE else None
        try:
            self.stop_words = set(stopwords.words('english')) if NLTK_AVAILABLE else set()
        except Exception:
            # Stop-word corpus could not be loaded: continue without one.
            self.stop_words = set()
        # Domain-specific stopwords to keep (they matter for disaster detection)
        self.stop_words -= self.DOMAIN_KEEP
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=500,
            ngram_range=(1, 2),
            min_df=1,
            max_df=0.95,
            sublinear_tf=True
        )
        self.count_vectorizer = CountVectorizer(
            max_features=300,
            ngram_range=(1, 1),
            min_df=1
        )
        self.is_fitted = False
        self._texts_buffer = []

    def clean_text(self, text: str) -> str:
        """Remove URLs, mentions, hashtag symbols, special chars, digits; lowercase.

        Returns an empty string when `text` is not a `str`.
        """
        if not isinstance(text, str):
            return ""
        text = self._URL_RE.sub('', text)
        text = self._MENTION_RE.sub('', text)
        # Keep the hashtag word, drop only the leading '#'.
        text = self._HASHTAG_RE.sub(r'\1', text)
        text = self._NON_WORD_RE.sub(' ', text)
        text = self._DIGITS_RE.sub('', text)
        text = text.lower().strip()
        return self._WHITESPACE_RE.sub(' ', text)

    def tokenize(self, text: str) -> list:
        """Tokenize cleaned text and remove stopwords and short tokens.

        Tokens of one or two characters are dropped unless they are listed
        in `DOMAIN_KEEP`, so that "no" survives as intended.
        """
        cleaned = self.clean_text(text)
        tokens = None
        if self._nltk_enabled:
            try:
                tokens = word_tokenize(cleaned)
            except Exception:
                # Tokenizer failed (e.g. its data is missing): fall back to
                # whitespace splitting below.
                tokens = None
        if tokens is None:
            tokens = cleaned.split()
        return [
            t for t in tokens
            if t not in self.stop_words
            and (len(t) > 2 or t in self.DOMAIN_KEEP)
        ]

    def lemmatize(self, tokens: list) -> list:
        """Apply lemmatization; return `tokens` unchanged if it is unavailable."""
        if not self.lemmatizer:
            return tokens
        try:
            return [self.lemmatizer.lemmatize(t) for t in tokens]
        except LookupError:
            # Lemmatizer data could not be found: degrade gracefully, the
            # same way `tokenize` does when its tokenizer fails.
            return tokens

    def preprocess(self, text: str) -> str:
        """Full pipeline: clean → tokenize → lemmatize → rejoin with spaces."""
        tokens = self.tokenize(text)
        tokens = self.lemmatize(tokens)
        return ' '.join(tokens)

    def preprocess_batch(self, texts: list) -> list:
        """Preprocess a list of texts, returning one string per input."""
        return [self.preprocess(t) for t in texts]

    def _prepare_corpus(self, texts: list) -> list:
        """Preprocess `texts`, replacing empty results with 'unknown'."""
        return [
            doc if doc.strip() else 'unknown'
            for doc in self.preprocess_batch(texts)
        ]

    def fit_tfidf(self, texts: list):
        """Fit the TF-IDF and count vectorizers on a corpus; return `self`."""
        processed = self._prepare_corpus(texts)
        self.tfidf_vectorizer.fit(processed)
        self.count_vectorizer.fit(processed)
        self.is_fitted = True
        return self

    def transform_tfidf(self, texts: list) -> np.ndarray:
        """Transform texts to a dense TF-IDF matrix.

        If `fit_tfidf` has not been called, the TF-IDF vectorizer is fitted
        on this batch instead. `is_fitted` is left unchanged in that case,
        so each such call refits on the batch it is given.
        """
        processed = self._prepare_corpus(texts)
        if not self.is_fitted:
            return self.tfidf_vectorizer.fit_transform(processed).toarray()
        return self.tfidf_vectorizer.transform(processed).toarray()

    def get_vocabulary(self) -> list:
        """Return the TF-IDF vocabulary terms, or [] before `fit_tfidf`."""
        if self.is_fitted:
            return list(self.tfidf_vectorizer.vocabulary_)
        return []

    def get_count_matrix(self, texts: list):
        """Return a raw count matrix for LDA, or None if vectorization fails.

        The count vectorizer is refitted on `texts` on every call; the
        result is whatever `CountVectorizer.fit_transform` returns.
        """
        processed = self._prepare_corpus(texts)
        try:
            return self.count_vectorizer.fit_transform(processed)
        except Exception:
            # Callers get None instead of an exception when the batch
            # cannot be vectorized.
            return None

    def extract_features(self, tweet: dict) -> dict:
        """Extract numeric surface features from a tweet dict.

        Reads `tweet['text']`; a missing or non-string value is treated as
        an empty string so every feature is 0.
        """
        text = tweet.get('text', '')
        if not isinstance(text, str):
            text = ''
        features = {
            'text_length': len(text),
            'word_count': len(text.split()),
            'exclamation_count': text.count('!'),
            'question_count': text.count('?'),
            # max(..., 1) avoids division by zero for empty text.
            'caps_ratio': sum(1 for c in text if c.isupper()) / max(len(text), 1),
            'hashtag_count': text.count('#'),
            'mention_count': text.count('@'),
            'url_count': len(self._HTTP_URL_RE.findall(text)),
        }
        return features