Spaces:

elsayedelmandoh
/

sentiment-sleuth

Sleeping

File size: 15,167 Bytes

from __future__ import annotations
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import html
from typing import Union
from sklearn.feature_extraction.text import CountVectorizer
import unicodedata
import joblib
import scipy.sparse as sp
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.utils import check_random_state
import contractions
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import os

# data acquisition notebook
def save(df_base: str = "data/processed", df: Union[pd.DataFrame, pd.Series] = None, df_name: str = "dataset.csv",
		 vectorizer_base: str = "data/vectorizers", vectorizer=None, vectorizer_name: str = "vectorizer.joblib",
		 vectors_base: str = "data/vectorizers", vectors=None, vectors_name: str = "vectors.npz",
		 model_base: str = "data/models", model=None, model_name: str = "model.joblib",
		 verbose: bool = True):
	"""
	Save a dataframe (CSV), a vectorizer (joblib), a model (joblib) and/or sparse vectors (npz) to disk.

	Only the artifacts that are not None are written. Each artifact type has its
	own base directory, which is created (including parents) when it is missing.

	Args:
		df_base, df, df_name: directory, object and file name of the CSV
			(written with `to_csv(index=False)`).
		vectorizer_base, vectorizer, vectorizer_name: same for the vectorizer (`joblib.dump`).
		vectors_base, vectors, vectors_name: same for the sparse matrix
			(`scipy.sparse.save_npz`). If the name does not end in ".npz" the
			suffix is appended, and the returned path reflects that.
		model_base, model, model_name: same for the model (`joblib.dump`).
		verbose: print one line per artifact written.

	Returns:
		dict: maps 'csv', 'vectorizer', 'vectors' and/or 'model' (only the
		artifacts actually saved) to the `Path` that was written.
	"""
	saved = {}

	# Create the target directory on demand and return the full file path.
	def get_full_path(base, name):
		base_dir = Path(base)
		base_dir.mkdir(parents=True, exist_ok=True)
		return base_dir / name

	# Record the written path and optionally report it.
	def record(key, label, name, path):
		saved[key] = path
		if verbose:
			print(f"Saved {label} {name} to {path}")

	# save CSV (dataframe)
	if df is not None:
		path = get_full_path(df_base, df_name)
		df.to_csv(path, index=False)
		record('csv', 'dataframe', df_name, path)

	# save joblib (vectorizer)
	if vectorizer is not None:
		path = get_full_path(vectorizer_base, vectorizer_name)
		joblib.dump(vectorizer, path)
		record('vectorizer', 'vectorizer', vectorizer_name, path)

	# save npz for sparse matrices
	if vectors is not None:
		path = get_full_path(vectors_base, vectors_name)
		# save_npz (via numpy) silently appends ".npz" to names that lack it.
		# Apply the same rule up front so the path we report is the file on disk.
		if not path.name.endswith(".npz"):
			path = path.with_name(path.name + ".npz")
		sp.save_npz(path, vectors)
		record('vectors', 'vectors', vectors_name, path)

	# save joblib (ML model)
	if model is not None:
		path = get_full_path(model_base, model_name)
		joblib.dump(model, path)
		record('model', 'model', model_name, path)

	return saved


# eda notebook
def apply_balance(df: pd.DataFrame, target_col: str = "target", random_state: int = 42) -> pd.DataFrame:
	"""Return a balanced dataframe by undersampling majority classes to the minority count.

	If the dataframe is already balanced (all observed classes have the same
	count), or contains no labelled rows at all, it is returned unchanged apart
	from a reset index.

	Args:
		df (pd.DataFrame): The input dataframe to balance.
		target_col (str, optional): The name of the target column. Defaults to "target".
		random_state (int, optional): Seed used for the per-class sampling and the
			final shuffle. Defaults to 42.
	Returns:
		pd.DataFrame: A shuffled dataframe with a fresh 0..n-1 index in which every
		observed class has as many rows as the smallest observed class.
	"""
	counts = df[target_col].value_counts()
	# A categorical target reports unused categories with a count of 0; they
	# must not drive the minority size down to zero.
	counts = counts[counts > 0]

	# "<= 1" also covers an empty frame (no classes), where sampling and
	# pd.concat would otherwise fail on an empty list.
	if counts.nunique() <= 1:
		return df.reset_index(drop=True)

	target_n = int(counts.min())
	# observed=True keeps unused categorical levels from producing empty groups.
	parts = [
		grp.sample(n=target_n, replace=False, random_state=random_state)
		for _, grp in df.groupby(target_col, observed=True)
	]
	# Shuffle so the classes are interleaved rather than stacked group by group.
	balanced = pd.concat(parts, axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True)
	return balanced


def plot_top_ngrams(corpus, n=1, top_k=20, stop_words='english', max_features=20000, figsize=(10,6), title=None):
	"""
	Compute and plot the top n-grams from a text corpus.

	The chart is saved to 'docs/02_results/top_{top_k}_{n}grams.png' (the
	directory is created when missing) and then shown.

	Parameters
	----------
	corpus : iterable-like
		Iterable of text documents (e.g., pandas Series).
	n : int, optional
		The n in n-grams (uses ngram_range=(n,n)). Default is 1 (unigrams).
	top_k : int, optional
		Number of top n-grams to show. Default is 20.
	stop_words : str or list, optional
		Stop words parameter forwarded to CountVectorizer. Default 'english'.
	max_features : int, optional
		Max features for the vectorizer. Default 20000.
	figsize : tuple, optional
		Figure size for the plot.
	title : str, optional
		Custom title for the plot. If None, a default title is used.

	Returns
	-------
	list of (term, count)
		The top n-grams and their counts (sorted descending). An empty list
		when the corpus yields no terms; nothing is plotted in that case.
	"""

	vec = CountVectorizer(ngram_range=(n, n), stop_words=stop_words, max_features=max_features)
	try:
		X = vec.fit_transform(corpus)
	except ValueError as exc:
		# CountVectorizer raises instead of returning an empty matrix when no
		# term survives tokenisation/stop-word removal. Any other ValueError
		# (e.g. bad parameters) is still propagated.
		if "empty vocabulary" not in str(exc):
			raise
		print("No terms found for the given corpus/parameters.")
		return []

	sums = np.array(X.sum(axis=0)).ravel()
	terms = np.array(vec.get_feature_names_out())

	if terms.size == 0:
		print("No terms found for the given corpus/parameters.")
		return []

	top_idx = sums.argsort()[::-1][:top_k]
	top_terms = terms[top_idx]
	top_counts = sums[top_idx]

	# Plot horizontal bar chart with largest on top
	plt.figure(figsize=figsize)
	plt.barh(top_terms[::-1], top_counts[::-1], color='steelblue')
	plt.xlabel("Count")
	if title is None:
		title = f"Top {min(top_k, len(top_terms))} {n}-grams"
	plt.title(title)
	# Lay out after the title is set so it is taken into account.
	plt.tight_layout()
	# savefig does not create missing directories.
	Path('docs/02_results').mkdir(parents=True, exist_ok=True)
	plt.savefig(f'docs/02_results/top_{top_k}_{n}grams.png', dpi=300, bbox_inches='tight')
	plt.show()

	return list(zip(top_terms, top_counts))

# preprocessing notebook
def clean_text(s):
	"""
	Normalise review text for sentiment analysis.

	Accepts a str, pandas.Series or pandas.DataFrame (with 'review_content').
	Returns a cleaned str, or a pandas.Series of cleaned strs for Series /
	DataFrame input. Any other input type yields an empty string.

	Pipeline for a single string: HTML-unescape, emojis to their text names,
	ASCII folding, lowercase, contraction expansion, URL and tag removal,
	squeezing of 3+ repeated characters to 2, removal of everything except
	[a-z0-9_] and whitespace, stop-word removal (negations and a few modifiers
	are kept), dropping of 1-character tokens, and lemmatisation.

	Raises:
		ValueError: if a DataFrame without a 'review_content' column is passed.
	"""
	# Initialize NLTK resources, lemmatizer and stop-word set once; they are
	# cached as function attributes so per-row calls stay cheap.
	if not hasattr(clean_text, "_nltk_initialized"):
		nltk.download('stopwords', quiet=True)
		nltk.download('wordnet', quiet=True)
		clean_text._lemmatizer = WordNetLemmatizer()
		clean_text._stopwords = set(stopwords.words('english'))
		# Keep negations and some modifiers: they carry sentiment.
		sentiment_exceptions = {'not', 'no', 'nor', 'against', 'but', 'however', 'very', 'too'}
		clean_text._custom_stopwords = clean_text._stopwords - sentiment_exceptions
		clean_text._nltk_initialized = True

	lemmatizer = clean_text._lemmatizer
	custom_stopwords = clean_text._custom_stopwords

	# DataFrame: apply on 'review_content' column
	if isinstance(s, pd.DataFrame):
		if 'review_content' not in s.columns:
			raise ValueError("DataFrame must contain 'review_content' column")
		return s['review_content'].apply(clean_text)

	# Series: apply element-wise
	if isinstance(s, pd.Series):
		return s.apply(clean_text)

	# Non-string inputs -> return empty string
	if not isinstance(s, str):
		return ''

	# 1. Decode HTML entities
	s = html.unescape(s)

	# 2. Emojis to text. This must run BEFORE ASCII folding: emojis are
	#    non-ASCII, so folding first would delete them and leave nothing to
	#    convert.
	s = emoji.demojize(s, delimiters=(" ", " "))

	# 3. Fold remaining Unicode to ASCII (accents stripped, the rest dropped)
	s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii', errors='ignore')

	# 4. Lowercase
	s = s.lower()

	# 5. Expand contractions
	s = contractions.fix(s)

	# 6. Remove URLs and HTML tags
	s = re.sub(r'https?://\S+|www\.\S+', ' ', s)
	s = re.sub(r'<[^>]+>', ' ', s)

	# 7. Limit repeated characters (e.g., "loooove" -> "loove")
	s = re.sub(r'(.)\1{2,}', r'\1\1', s)

	# 8. Keep only letters, digits, whitespace and underscores (emoji text)
	s = re.sub(r'[^a-z0-9\s_]', ' ', s)

	# 9. Tokenize, remove stopwords and 1-character tokens, lemmatize
	cleaned_words = [
		lemmatizer.lemmatize(word)
		for word in s.split()
		if word not in custom_stopwords and len(word) > 1
	]

	# 10. Rejoin and collapse extra whitespace
	s = ' '.join(cleaned_words)
	s = re.sub(r'\s+', ' ', s).strip()

	return s


# feature engineering notebook
def top_n_grams(corpus, ngram_range=(1,1), top_k=20, stop_words='english', max_features=20000):
	"""
	Return the `top_k` most frequent n-grams of `corpus` with their counts.

	Parameters
	----------
	corpus : iterable-like
		Iterable of text documents.
	ngram_range : tuple, optional
		Forwarded to CountVectorizer. Default (1, 1).
	top_k : int, optional
		Maximum number of n-grams returned. Default 20.
	stop_words : str or list, optional
		Forwarded to CountVectorizer. Default 'english'.
	max_features : int, optional
		Forwarded to CountVectorizer. Default 20000.

	Returns
	-------
	list of (term, count)
		Sorted by descending count; an empty list when the corpus yields no terms.
	"""
	vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words, max_features=max_features)
	try:
		X = vec.fit_transform(corpus)
	except ValueError as exc:
		# CountVectorizer raises instead of returning an empty matrix when the
		# documents contain no usable terms; report that as "no n-grams".
		# Any other ValueError is still propagated.
		if "empty vocabulary" not in str(exc):
			raise
		return []
	sums = np.array(X.sum(axis=0)).ravel()
	terms = np.array(vec.get_feature_names_out())
	if terms.size == 0:
		return []
	top_idx = sums.argsort()[::-1][:top_k]
	return list(zip(terms[top_idx], sums[top_idx]))


def show_top_ngrams_by_class(df, target_col='review_target', text_col='review_cleaned',
							 ngram_ranges=((1,1),(2,2)), top_k=(15,12),
							 stop_words='english', max_features=20000,
							 plot=True, figsize=(8,5)):
	"""
	For each class in df[target_col], print and (optionally) plot top n-grams per ngram_range.

	Parameters
	----------
	df : pandas.DataFrame
		Data holding the target and text columns.
	target_col, text_col : str
		Names of the class column and of the text column.
	ngram_ranges : tuple or list
		Either a single range such as (1, 1) or a sequence of ranges.
	top_k : int, list or tuple
		Number of n-grams per range. A sequence is matched to `ngram_ranges`
		by position; ranges without a matching entry use 20.
	stop_words, max_features :
		Forwarded to `top_n_grams`.
	plot : bool
		When True, each non-empty result is drawn, saved under
		'docs/02_results/' (created when missing) and shown.
	figsize : tuple
		Figure size of each plot.

	Returns a nested dict: {class: {ngram_range: [(term, count), ...]}}

	Raises
	------
	KeyError
		If `target_col` is not a column of `df`.
	"""
	if target_col not in df.columns:
		raise KeyError(f"Target column '{target_col}' not found")

	# Normalize ngram_ranges: allow a single range like (1,1) to be passed and wrap it
	if isinstance(ngram_ranges, tuple) and len(ngram_ranges) == 2 and all(isinstance(x, int) for x in ngram_ranges):
		ngram_ranges = (ngram_ranges,)
	elif isinstance(ngram_ranges, list):
		ngram_ranges = tuple(ngram_ranges)

	target = df[target_col]
	# Categorical targets use their declared categories; anything else is
	# enumerated through its string representation.
	classes = (target.cat.categories
			   if hasattr(target, 'cat') else np.unique(target.astype(str)))

	if plot:
		# savefig does not create missing directories.
		Path('docs/02_results').mkdir(parents=True, exist_ok=True)

	results = {}
	for cls in classes:
		# Match on the raw value when possible, otherwise on the string form
		# (needed when the classes were enumerated as strings).
		cls_mask = target == cls if cls in target.values else target.astype(str) == str(cls)
		subset = df.loc[cls_mask, text_col].fillna("").astype(str)
		results.setdefault(cls, {})
		for i, rg in enumerate(ngram_ranges):
			k = top_k[i] if (isinstance(top_k, (list,tuple)) and i < len(top_k)) else (top_k if isinstance(top_k, int) else 20)
			top = top_n_grams(subset, ngram_range=rg, top_k=k, stop_words=stop_words, max_features=max_features)
			results[cls][rg] = top

			# Print
			nname = ("unigrams" if rg==(1,1) else "bigrams" if rg==(2,2) else f"{rg[0]}-{rg[1]}grams")
			print(f'--- Top {nname} for class {cls} ---')
			print(top)
			print()

			# Plot
			if plot and top:
				terms, counts = zip(*top)
				plt.figure(figsize=figsize)
				plt.barh(terms[::-1], counts[::-1], color='steelblue')
				plt.title(f"Top {len(terms)} {nname} for class {cls}")
				plt.xlabel("Count")
				plt.tight_layout()
				plt.savefig(f'docs/02_results/top_{nname}_for_class_{cls}.png', dpi=300, bbox_inches='tight')
				plt.show()

	return results


def add_basic_meta_features(df: pd.DataFrame, text_col: str = 'review_content') -> pd.DataFrame:
	"""
	Return a copy of `df` extended with simple surface statistics of `text_col`.

	Missing texts are treated as empty strings. The new columns are named
	`<prefix>_<feature>`, where the prefix is `text_col` lowercased with every
	run of non-word characters replaced by "_" (falling back to "text"):

	- exclamation_count, question_count, punctuation_count
	- word_count, avg_word_length (whitespace-separated tokens)
	- uppercase_count, uppercase_ratio (uppercase characters / text length,
	  with empty texts counted as length 1 to avoid dividing by zero)

	Raises KeyError when `text_col` is not a column of `df`. The input frame
	is not modified.
	"""
	if text_col not in df.columns:
		raise KeyError(f"Text column '{text_col}' not found in dataframe")

	def _token_count(tokens):
		return len(tokens) if isinstance(tokens, list) else 0

	def _mean_token_length(tokens):
		if not isinstance(tokens, list) or not len(tokens):
			return 0.0
		return float(np.mean([len(tok) for tok in tokens]))

	def _upper_chars(text):
		return sum(1 for ch in text if ch.isupper())

	# Sanitised column name used as the feature prefix.
	prefix = re.sub(r'\W+', '_', text_col).strip('_').lower() or 'text'

	texts = df[text_col].fillna("").astype(str)
	tokens = texts.str.split()
	result = df.copy()

	# Regex-based character counts.
	for feature, pattern in (
		('exclamation_count', r'!'),
		('question_count', r'\?'),
		('punctuation_count', r"[^\w\s]"),
	):
		result[f'{prefix}_{feature}'] = texts.str.count(pattern)

	# Token-based statistics.
	result[f'{prefix}_word_count'] = tokens.apply(_token_count)
	result[f'{prefix}_avg_word_length'] = tokens.apply(_mean_token_length)

	# Uppercase statistics; zero lengths become 1 so the ratio stays defined.
	upper_col = f'{prefix}_uppercase_count'
	result[upper_col] = texts.apply(_upper_chars)
	safe_lengths = texts.str.len().replace(0, 1)
	result[f'{prefix}_uppercase_ratio'] = result[upper_col] / safe_lengths

	return result


def plot_dimensionality_reduction(X, labels, method='PCA', sample=1000, random_state: int = 42, figsize=(8,6), data_name: str = None):
	"""
	Reduce `X` to 2D and plot colored by `labels`.

	- `method` can be 'PCA' or 'TSNE' (case-insensitive).
	- 'PCA': sparse `X` is reduced with TruncatedSVD, dense `X` with PCA.
	- 'TSNE': sparse `X` is first reduced with TruncatedSVD to at most 50
	  components; dense `X` is passed to TSNE as is.
	- `sample` controls maximum number of points to plot (random sampling
	  without replacement, seeded by `random_state`); None disables sampling.
	- `data_name` optional string used in the saved filename (e.g. 'train','valid','test').
	  If None a timestamp is used to avoid overwriting files.

	The figure is saved under 'docs/02_results/' (created when missing) and shown.

	Returns the (n_samples,2) embedding array of the plotted (sampled) points.

	Raises ValueError for an unsupported `method`.
	"""
	method_key = method.upper()
	# Fail fast, before any sampling or fitting work is done.
	if method_key not in ('PCA', 'TSNE'):
		raise ValueError("Unsupported method. Choose 'PCA' or 'TSNE'.")

	# Handle sparse matrices
	is_sparse = hasattr(X, 'tocsr') or hasattr(X, 'tocsc')

	n_samples = X.shape[0]
	rng = check_random_state(random_state)
	if sample is not None and n_samples > sample:
		idx = rng.choice(n_samples, size=sample, replace=False)
		if is_sparse:
			X_sample = X[idx]
		else:
			X_sample = X[idx, :]
		y_sample = np.asarray(labels)[idx]
	else:
		X_sample = X
		y_sample = np.asarray(labels)

	# Produce 2D embedding
	if method_key == 'PCA':
		if is_sparse:
			svd = TruncatedSVD(n_components=2, random_state=random_state)
			emb = svd.fit_transform(X_sample)
		else:
			pca = PCA(n_components=2, random_state=random_state)
			emb = pca.fit_transform(X_sample)
	else:
		# For TSNE, pre-reduce sparse input
		if is_sparse:
			pre_n = min(50, X.shape[1])
			pre = TruncatedSVD(n_components=pre_n, random_state=random_state)
			X_pre = pre.fit_transform(X_sample)
		else:
			X_pre = X_sample
		tsne = TSNE(n_components=2, random_state=random_state)
		emb = tsne.fit_transform(X_pre)

	# Plot
	plt.figure(figsize=figsize)
	unique_labels, label_idx = np.unique(y_sample, return_inverse=True)
	cmap = plt.get_cmap('tab10')
	for i, ul in enumerate(unique_labels):
		mask = label_idx == i
		plt.scatter(emb[mask, 0], emb[mask, 1], s=10, alpha=0.8, label=str(ul), color=cmap(i % 10))
	plt.legend(title='label', bbox_to_anchor=(1.05, 1), loc='upper left')
	plt.xlabel('dim1')
	plt.ylabel('dim2')
	plt.title(f'{method} projection')
	# Lay out after labels and title are set so they are taken into account.
	plt.tight_layout()

	# Without a data_name fall back to a timestamp so repeated calls do not
	# overwrite each other (and the file is not literally named "None").
	name_tag = data_name if data_name is not None else datetime.now().strftime('%Y%m%d_%H%M%S')
	# The trailing number used to be the leaked loop variable (index of the
	# last label); it is computed explicitly to keep existing filenames stable.
	last_label_idx = len(unique_labels) - 1
	# savefig does not create missing directories.
	Path('docs/02_results').mkdir(parents=True, exist_ok=True)
	plt.savefig(f'docs/02_results/{method}_projection_{name_tag}_{last_label_idx}.png', dpi=300, bbox_inches='tight')
	plt.show()

	return emb


# main.py helpers
def ensure_nltk_resources(verbose: bool = False):
	"""Ensure common NLTK resources (punkt, wordnet, omw-1.4, stopwords) are available.

	Each resource is looked up locally and downloaded only when the lookup
	fails. Downloads are quiet unless `verbose` is True. Download failures are
	tolerated (best effort) and reported only in verbose mode.

	Returns:
		bool: True once every resource has been checked; False when nltk could
		not be imported or the check failed unexpectedly.
	"""
	# nltk.data.find expects "<category>/<name>". The category is not the same
	# for every resource: punkt is a tokenizer model, not a corpus, so looking
	# it up under "corpora/" would never succeed and would trigger a download
	# attempt on every call.
	resource_paths = {
		"punkt": "tokenizers/punkt",
		"wordnet": "corpora/wordnet",
		"omw-1.4": "corpora/omw-1.4",
		"stopwords": "corpora/stopwords",
	}
	try:
		import nltk
		for name, data_path in resource_paths.items():
			try:
				nltk.data.find(data_path)
			except LookupError:
				try:
					nltk.download(name, quiet=not verbose)
				except Exception as exc:
					# best-effort; let downstream code handle missing resources
					if verbose:
						print(f"Could not download NLTK resource '{name}': {exc}")
	except Exception:
		# If nltk is not available, caller will handle the ImportError when calling clean_text
		return False
	return True


def load_assets():
	"""Load the application's assets, preferring the HF-aware loader.

	First tries `load_assets_hf` from the sibling `hf_loader` module (imported
	lazily inside the function) and returns its result. If importing or
	running it fails, the error is printed and every path in
	`settings.ASSET_PATHS` is loaded locally with joblib instead.

	Returns:
		The HF loader's result, or in the fallback a tuple with one entry per
		asset path: the loaded object, or None when loading that path failed.
	"""
	try:
		# Lazy import of the HF-aware loader.
		from .hf_loader import load_assets_hf
		return load_assets_hf()
	except Exception as err:
		print(f"HF loader unavailable or failed to import: {err}")

	# Fallback: simple local loads driven by the central ASSET_PATHS setting.
	from src.config import settings

	def _load_or_none(asset_path):
		# Any failure for a single asset is reported as None in the result.
		if joblib is None:
			return None
		try:
			loaded = joblib.load(asset_path)
			print(f"Loaded local asset: {asset_path}")
		except Exception:
			return None
		return loaded

	asset_paths = list(settings.ASSET_PATHS)
	return tuple(_load_or_none(asset_path) for asset_path in asset_paths)