"""NLP preprocessing pipeline for social media text.

Handles cleaning, tokenization, stopword removal, lemmatization and
TF-IDF vectorization.
"""
import logging
import re
import string
from datetime import datetime
from typing import Any, Optional

import numpy as np
import pandas as pd

_logger = logging.getLogger(__name__)

# NLTK is optional: if it (or its data) cannot be loaded, the pipeline falls
# back to whitespace tokenization with no stopword removal or lemmatization.
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

    for pkg in ['punkt', 'stopwords', 'wordnet', 'punkt_tab']:
        try:
            nltk.download(pkg, quiet=True)
        except Exception:
            # Best effort: the resource may already be installed locally.
            _logger.debug("Could not download NLTK resource %r", pkg,
                          exc_info=True)
    NLTK_AVAILABLE = True
except Exception:
    NLTK_AVAILABLE = False

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Patterns used by the cleaning / feature-extraction steps, compiled once.
_URL_RE = re.compile(r'http\S+|www\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#(\w+)')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_HTTP_URL_RE = re.compile(r'http\S+')

# Stand-in document for texts that are empty after preprocessing.
_EMPTY_DOC_PLACEHOLDER = 'unknown'


class TextPreprocessor:
    """Clean, tokenize, lemmatize and vectorize short social media texts.

    Attributes:
        lemmatizer: ``WordNetLemmatizer`` instance, or ``None`` when NLTK is
            unavailable (or its WordNet data turned out to be missing).
        stop_words: Set of tokens dropped by :meth:`tokenize`.
        tfidf_vectorizer: ``TfidfVectorizer`` over unigrams and bigrams,
            capped at 500 features.
        count_vectorizer: ``CountVectorizer`` over unigrams, capped at 300
            features.
        is_fitted: ``True`` once :meth:`fit_tfidf` has been called.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer() if NLTK_AVAILABLE else None
        try:
            self.stop_words = (
                set(stopwords.words('english')) if NLTK_AVAILABLE else set()
            )
        except Exception:
            # The stopword corpus is missing or unreadable: keep every token.
            _logger.debug("NLTK stopwords unavailable; stopword removal "
                          "disabled", exc_info=True)
            self.stop_words = set()
        # Domain-specific stopwords to keep (they matter for disaster detection)
        domain_keep = {'not', 'no', 'very', 'too', 'more', 'most', 'need',
                       'help', 'urgent'}
        self.stop_words -= domain_keep

        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=500,
            ngram_range=(1, 2),
            min_df=1,
            max_df=0.95,
            sublinear_tf=True,
        )
        self.count_vectorizer = CountVectorizer(
            max_features=300,
            ngram_range=(1, 1),
            min_df=1,
        )
        self.is_fitted = False
        self._texts_buffer = []

    def clean_text(self, text: str) -> str:
        """Remove URLs, mentions, hashtag symbols, special chars, lowercase.

        Args:
            text: Raw text. Any non-string value is treated as empty.

        Returns:
            Lowercased text with URLs, ``@mentions``, punctuation and digits
            removed, ``#`` stripped from hashtags (the tag word is kept) and
            whitespace runs collapsed to single spaces.
        """
        if not isinstance(text, str):
            return ""
        text = _URL_RE.sub('', text)
        text = _MENTION_RE.sub('', text)
        text = _HASHTAG_RE.sub(r'\1', text)
        text = _PUNCTUATION_RE.sub(' ', text)
        text = _DIGITS_RE.sub('', text)
        text = text.lower().strip()
        return _WHITESPACE_RE.sub(' ', text)

    def tokenize(self, text: str) -> list:
        """Tokenize and remove stopwords & short tokens.

        Args:
            text: Raw text; it is passed through :meth:`clean_text` first.

        Returns:
            Tokens that are not in ``self.stop_words`` and are longer than
            two characters. Uses NLTK's ``word_tokenize`` when available and
            falls back to whitespace splitting otherwise.
        """
        cleaned = self.clean_text(text)
        tokens = None
        if NLTK_AVAILABLE:
            try:
                tokens = word_tokenize(cleaned)
            except Exception:
                # Typically missing tokenizer data; degrade to split().
                _logger.debug("word_tokenize failed; using whitespace split",
                              exc_info=True)
        if tokens is None:
            tokens = cleaned.split()
        return [t for t in tokens if t not in self.stop_words and len(t) > 2]

    def lemmatize(self, tokens: list) -> list:
        """Apply lemmatization.

        Args:
            tokens: Tokens to lemmatize.

        Returns:
            The lemmatized tokens, or ``tokens`` unchanged when no lemmatizer
            is available.
        """
        if not self.lemmatizer:
            return tokens
        try:
            return [self.lemmatizer.lemmatize(t) for t in tokens]
        except LookupError:
            # NLTK raises LookupError when the WordNet corpus is not
            # installed (the import-time download is best effort). Disable
            # the lemmatizer so the failing lookup is not retried, and
            # logged, for every subsequent text.
            _logger.warning("WordNet data not found; lemmatization disabled")
            self.lemmatizer = None
            return tokens

    def preprocess(self, text: str) -> str:
        """Full pipeline: clean → tokenize → lemmatize → rejoin.

        Args:
            text: Raw text.

        Returns:
            The processed tokens joined by single spaces (possibly ``''``).
        """
        return ' '.join(self.lemmatize(self.tokenize(text)))

    def preprocess_batch(self, texts: list) -> list:
        """Preprocess a list of texts.

        Args:
            texts: Raw texts.

        Returns:
            One preprocessed string per input text, in the same order.
        """
        return [self.preprocess(t) for t in texts]

    def _prepare_documents(self, texts: list) -> list:
        """Preprocess ``texts``, replacing empty results with a placeholder."""
        return [
            doc if doc.strip() else _EMPTY_DOC_PLACEHOLDER
            for doc in self.preprocess_batch(texts)
        ]

    def fit_tfidf(self, texts: list):
        """Fit TF-IDF on a corpus.

        Fits both ``tfidf_vectorizer`` and ``count_vectorizer`` on the
        preprocessed texts and marks the instance as fitted.

        Args:
            texts: Raw training texts.

        Returns:
            ``self``, so calls can be chained.
        """
        documents = self._prepare_documents(texts)
        self.tfidf_vectorizer.fit(documents)
        self.count_vectorizer.fit(documents)
        self.is_fitted = True
        return self

    def transform_tfidf(self, texts: list) -> np.ndarray:
        """Transform texts to TF-IDF matrix.

        When :meth:`fit_tfidf` has not been called, the TF-IDF vectorizer is
        fitted on ``texts`` themselves. ``is_fitted`` is left ``False`` in
        that case, so every such call refits and the column layout can
        differ between calls; call :meth:`fit_tfidf` first for a stable
        feature space.

        NOTE(review): with ``max_df=0.95`` and ``min_df=1`` scikit-learn
        appears to reject a single-document corpus when fitting -- confirm
        before relying on the unfitted path for one text.

        Args:
            texts: Raw texts.

        Returns:
            Dense array with one row per input text.
        """
        documents = self._prepare_documents(texts)
        if not self.is_fitted:
            return self.tfidf_vectorizer.fit_transform(documents).toarray()
        return self.tfidf_vectorizer.transform(documents).toarray()

    def get_vocabulary(self) -> list:
        """Return TF-IDF vocabulary.

        Returns:
            The terms of the fitted TF-IDF vocabulary, or ``[]`` when
            :meth:`fit_tfidf` has not been called.
        """
        if self.is_fitted:
            return list(self.tfidf_vectorizer.vocabulary_)
        return []

    def get_count_matrix(self, texts: list) -> Optional[Any]:
        """Return raw count matrix for LDA.

        ``count_vectorizer`` is refitted on ``texts`` on every call, which
        replaces any vocabulary it learned in :meth:`fit_tfidf`.

        Args:
            texts: Raw texts.

        Returns:
            The document-term matrix exactly as returned by
            ``CountVectorizer.fit_transform`` (scikit-learn documents this as
            a sparse matrix, not a dense ``ndarray``), or ``None`` when
            vectorization fails, e.g. for an empty corpus.
        """
        documents = self._prepare_documents(texts)
        try:
            return self.count_vectorizer.fit_transform(documents)
        except Exception:
            # Best effort for callers that treat None as "no topics".
            _logger.warning("Count vectorization failed", exc_info=True)
            return None

    def extract_features(self, tweet: dict) -> dict:
        """Extract numeric features from a tweet dict.

        Args:
            tweet: Mapping whose ``'text'`` entry holds the raw tweet text.
                A missing or non-string value is treated as empty text,
                matching :meth:`clean_text`.

        Returns:
            Dict with ``text_length``, ``word_count``, ``exclamation_count``,
            ``question_count``, ``caps_ratio``, ``hashtag_count``,
            ``mention_count`` and ``url_count``.
        """
        text = tweet.get('text', '')
        if not isinstance(text, str):
            text = ''
        uppercase_count = sum(1 for c in text if c.isupper())
        return {
            'text_length': len(text),
            'word_count': len(text.split()),
            'exclamation_count': text.count('!'),
            'question_count': text.count('?'),
            # max(..., 1) avoids dividing by zero on empty text.
            'caps_ratio': uppercase_count / max(len(text), 1),
            'hashtag_count': text.count('#'),
            'mention_count': text.count('@'),
            # Counts only http(s) links; "www..." links are not included.
            'url_count': len(_HTTP_URL_RE.findall(text)),
        }