# ads505-app/utils/topically.py
# Author: Taylor Kirk — "fixing import paths" (commit 3001d07)
from __future__ import annotations
from pathlib import Path
from typing import List, Optional, Tuple
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, Pipeline
from utils.build_plotly import _build_topic_figure
from utils.load_data import get_data_directory
import plotly.graph_objects as go # type: ignore
import streamlit as st
from nltk.corpus import stopwords # type: ignore
from utils.remove_html import remove_html_tags
# --------- Defaults / Paths ---------
DEFAULT_DATA_DIR = get_data_directory()

# Fixed bar color per product category so charts stay visually consistent
# across reruns; see make_topics() for the fallback color.
COLOR_WHEEL = {
    "All_Beauty": "#d946ef",            # magenta-ish
    "Appliances": "#800000",            # maroon
    "Baby_Products": "#87ceeb",         # skyblue
    "Electronics": "#ffd700",           # gold
    "Health_and_Household": "#3cb371",  # mediumseagreen
    "Movies_and_TV": "#663399",         # rebeccapurple
}

# Build stopword list once at import time (don't mutate across calls).
BASE_STOPWORDS = set(stopwords.words("english"))
# Negation / intensity words deliberately KEPT (removed from the stoplist)
# because they carry sentiment that matters for review topics.
CUSTOM_KEEP = {
    'not','no','but','ain','don',"don't",'aren',"aren't",'couldn',"couldn't",
    'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',
    "haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',
    "needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',
    "weren't",'won',"won't",'wouldn',"wouldn't",'very','too'
}
# sorted() accepts any iterable, so the intermediate list() was redundant.
DEFAULT_STOPWORDS = sorted(BASE_STOPWORDS - CUSTOM_KEEP)
# --------- Data loading / modeling ---------
def _load_category_df(
    data_dir: Path | str,
    category: str,
    lemmatize: bool,
    nrows: int
) -> pd.DataFrame:
    """Load the parquet file for *category* and return at most *nrows* rows.

    Args:
        data_dir: Directory containing ``<category>.parquet`` and a
            ``lemma_data/`` subdirectory with pre-lemmatized variants.
        category: Category name, used as the parquet file stem.
        lemmatize: When True, read the lemmatized parquet instead of the raw one.
        nrows: Maximum number of rows to keep.

    Returns:
        A copy of the truncated frame; when a ``text`` column exists it is
        stringified, stripped, and has HTML tags removed.
    """
    data_dir = Path(data_dir)
    base = data_dir / "lemma_data" if lemmatize else data_dir
    df = pd.read_parquet(base / f"{category}.parquet")
    # Truncate BEFORE cleaning so remove_html_tags only runs on rows we keep
    # (the original cleaned the full frame and then discarded most of it).
    df = df.iloc[:nrows].copy()
    if "text" in df.columns:
        df["text"] = df["text"].astype(str).str.strip().apply(remove_html_tags)
    return df
# Caching disabled for now; re-enable once inputs are stable/hashable.
#@st.cache_data(show_spinner="One moment please!", show_time=True)
def make_topics(
    category: str,
    topic_columns: str,
    lemmatize: bool,
    n1: int,
    n2: int,
    n_components: int,
    rating: Optional[List[int]] = None,
    helpful_vote: Optional[int] = None,
    new_words: Optional[List[str]] = None,
    n_top_words: int = 5,
    data_dir: Optional[str | Path] = None,
    nrows: int = 10_000
) -> Tuple[ColumnTransformer | Pipeline, go.Figure]:
    """
    Fit a TF-IDF + NMF topic model for one category and build its Plotly figure.

    Args:
        category: Category name; selects the parquet file and the bar color.
        topic_columns: Which column(s) to model: "both", "text", or anything
            else (falls back to "title"). Case/whitespace-insensitive.
        lemmatize: Use the pre-lemmatized parquet when True.
        n1: Lower bound of the TF-IDF n-gram range.
        n2: Upper bound of the TF-IDF n-gram range.
        n_components: Number of NMF topics to extract.
        rating: Keep only rows whose "rating" is in this list (when the
            column exists).
        helpful_vote: Keep only rows with strictly more helpful votes than
            this value (when the column exists).
        new_words: Extra stopwords to add for this call only.
        n_top_words: Words shown per topic in the figure.
        data_dir: Override for the data directory (defaults to DEFAULT_DATA_DIR).
        nrows: Maximum rows loaded for the category.

    Returns:
        (topic_pipeline, fig): the fitted sklearn pipeline and the figure.
    """
    data_dir = data_dir or DEFAULT_DATA_DIR
    df = _load_category_df(data_dir, category, lemmatize, nrows=nrows)

    # Optional row filters — applied only when the column actually exists.
    if rating is not None and "rating" in df.columns:
        df = df[df["rating"].isin(rating)]
    if helpful_vote is not None and "helpful_vote" in df.columns:
        df = df[df["helpful_vote"] > helpful_vote]

    # Normalize the column selector so "Both ", "TEXT", etc. all work.
    topic_columns = (topic_columns or "").strip().lower()

    # Fresh stopword list each call to avoid mutating the module-level default.
    stop_list = list(DEFAULT_STOPWORDS)
    if new_words:
        # TfidfVectorizer lowercases tokens before stopword filtering, so
        # mixed-case user entries would silently never match; normalize them.
        stop_list.extend(w.strip().lower() for w in new_words if w and w.strip())

    tfidf_text = TfidfVectorizer(stop_words=stop_list, ngram_range=(n1, n2))
    tfidf_title = TfidfVectorizer(stop_words=stop_list, ngram_range=(n1, n2))
    if topic_columns == "both":
        preprocessor = ColumnTransformer([
            ("title", tfidf_title, "title"),
            ("text", tfidf_text, "text")
        ])
    elif topic_columns == "text":
        preprocessor = ColumnTransformer([("text", tfidf_text, "text")])
    else:
        # Default to the title column for any unrecognized selector.
        preprocessor = ColumnTransformer([("title", tfidf_title, "title")])

    # beta_loss=1 is the Kullback-Leibler divergence; the "mu" solver is the
    # one that supports it. Fixed random_state keeps topics reproducible.
    nmf = NMF(
        n_components=n_components,
        init="nndsvda",
        solver="mu",
        beta_loss=1,
        random_state=10
    )
    topic_pipeline = make_pipeline(preprocessor, nmf)

    # Fit on only the columns the preprocessor expects.
    fit_cols = [c for c in ["title", "text"] if c in df.columns]
    topic_pipeline.fit(df[fit_cols])

    feature_names = topic_pipeline[0].get_feature_names_out()
    nmf_model: NMF = topic_pipeline[1]

    # Fall back to a neutral blue when the category has no assigned color.
    bar_color = COLOR_WHEEL.get(category, "#184A90")
    fig = _build_topic_figure(
        model=nmf_model,
        feature_names=feature_names,
        n_top_words=n_top_words,
        title=category,
        n_components=n_components,
        bar_color=bar_color
    )
    return topic_pipeline, fig