# ads505-app/utils/prepare_user_dataframe.py
# Author: Taylor Kirk
# Fresh deployment after moving datasets to HF datahub (commit 5d4981c)
import streamlit as st
import pandas as pd
import re
import nltk # type: ignore
from nltk.stem import WordNetLemmatizer # type: ignore
from nltk.corpus import wordnet # type: ignore
from nltk.tokenize import word_tokenize # type: ignore
from typing import Optional
# -------------------------------------------------------------------
# HTML cleaning and lemmatization helpers
# -------------------------------------------------------------------
def remove_user_html_tags(text: str) -> str:
    """Remove basic HTML entities/tags and lowercase the text.

    This preserves the original behavior used when training the model.

    Args:
        text: Raw review text, possibly containing HTML entities and tags.
            ``None`` is tolerated and treated as an empty string.

    Returns:
        Cleaned, lowercased text: common entities decoded, ``<br>`` variants
        replaced by spaces, and all remaining tags stripped.
    """
    if text is None:
        return ""
    # Replace common HTML entities with their corresponding characters.
    # BUGFIX: these replacements had degraded into no-ops (the entity names
    # themselves were HTML-decoded at some point, e.g. replace('"', '"')).
    # Restored to the entities the surrounding comments describe.
    text = text.replace('&#34;', '"')    # numeric entity for "
    text = text.replace('&quot;', '"')   # named entity for "
    text = text.replace('&#39;', "'")    # numeric entity for '
    text = text.replace('&apos;', "'")   # named entity for '
    text = text.replace('&amp;', '&')    # ampersand, decoded after the quote entities
    text = text.replace('<br />', ' ')   # replace line breaks with a space
    text = text.replace('<br>', ' ')     # also handle the unclosed form
    # Use regex to remove any remaining HTML tags (e.g., <p>, <div>, <span>)
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text.lower()
def get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to its WordNet equivalent.

    Only the first letter of the tag matters (J/V/N/R); anything else
    falls back to noun, which is WordNet's conventional default.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # treebank_tag[:1] is '' for an empty tag, which also hits the default.
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)
def lemmatize_user_text(text: str) -> str:
    """Tokenizes, POS-tags, and lemmatizes a string of text."""
    # Coerce non-string input (including None) to a safe string first.
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    lemmatizer = WordNetLemmatizer()
    # Tokenize, then attach a part-of-speech tag to every token.
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    # Lemmatize each token using its WordNet-mapped POS tag.
    lemmas = (
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    )
    return " ".join(lemmas)
def prep_text_column(
    df: pd.DataFrame,
    text_col: str,
    lemma_col: str,
    overwrite: bool = False,
) -> pd.DataFrame:
    """
    Column-agnostic helper to clean HTML and create a lemma column.

    - If lemma_col already exists and overwrite=False, we return df unchanged.
    - Otherwise we copy df and do the expensive cleaning + lemmatization.
    """
    # Fast path: lemma column already present and recomputation not requested.
    if lemma_col in df.columns and not overwrite:
        return df

    # Guard: the source text column must exist before any heavy work.
    if text_col not in df.columns:
        raise KeyError(f"Column '{text_col}' not found in dataframe.")

    # Copy only when we actually need to mutate.
    result = df.copy()
    cleaned = (
        result[text_col]
        .fillna("")
        .apply(remove_user_html_tags)
        .astype(str)
        .str.strip()
    )
    result[text_col] = cleaned
    result[lemma_col] = cleaned.apply(lemmatize_user_text)
    return result
# -------------------------------------------------------------------
# Internal text prep for prediction
# -------------------------------------------------------------------
@st.cache_data(show_spinner='Prepping data!')
def _prep_user_text(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare core text columns for the prediction model.

    This function:
    - Ensures HTML cleaning + lemmatization for:
        * 'text' -> 'lemma_text'
        * 'review_title' -> 'lemma_title'
    - Ensures the length features:
        * 'Review Length'
        * 'Title Length'

    It is safe to call even if some of these columns already exist; in that
    case, lemmatization is skipped and only length features are added if
    needed.
    """
    out = df.copy()

    # Lemmatize each (lemma column, source column) pair only when missing.
    for lemma_col, source_col in (
        ('lemma_text', 'text'),
        ('lemma_title', 'review_title'),
    ):
        if lemma_col not in out.columns:
            out = prep_text_column(out, text_col=source_col, lemma_col=lemma_col)

    # Add character-length features only when missing.
    for length_col, source_col in (
        ('Review Length', 'text'),
        ('Title Length', 'review_title'),
    ):
        if length_col not in out.columns:
            out[length_col] = out[source_col].fillna('').apply(len)

    return out
# -------------------------------------------------------------------
# Public entry point used by the Streamlit app
# -------------------------------------------------------------------
def prep_text_and_features(model, df: Optional[pd.DataFrame] = None) -> None:
    """Run text prep and feature assembly, storing results in session_state.

    Behavior:
    - If `df` is None, uses `st.session_state.raw_df` (current app behavior).
    - Checks that required columns are present for the predictive model.
    - Ensures HTML+lemma for title and text, and creates:
        * 'Review Length'
        * 'Title Length'
        * 'vote' (binary target: 1 if helpful_vote > 0 else 0)
    - Builds the feature matrix X based on `model.feature_names_in_`:
        ['lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length']
    - Stores in session_state:
        * prepped_df
        * X
        * true_y
        * prep_done flag
        * resets downstream prediction state (probs, model_run)
    """
    if df is None:
        df = st.session_state.get('raw_df')
        if df is None:
            st.warning("Upload a dataframe first.")
            return

    # Make sure the core columns are present
    required_cols = {'helpful_vote', 'review_title', 'text', 'images'}
    missing = required_cols - set(df.columns)
    if missing:
        st.error(
            "The uploaded dataframe is missing required columns: "
            + ", ".join(sorted(missing))
        )
        return

    # Core text prep (HTML + lemma + length features)
    prepped = _prep_user_text(df)

    # Create binary target. Vectorized comparison instead of a row-wise
    # lambda; NaN > 0 evaluates to False, so NaN maps to 0 exactly like
    # the previous apply() did.
    prepped["vote"] = (prepped["helpful_vote"] > 0).astype(int)

    # Assemble features expected by the predictive model. We respect
    # model.feature_names_in_ when available, falling back to the known
    # training-time feature list otherwise.
    feature_cols = list(getattr(model, "feature_names_in_", [])) or [
        "lemma_title",
        "lemma_text",
        "images",
        "Review Length",
        "Title Length",
    ]

    # Keep only columns that actually exist
    feature_cols = [c for c in feature_cols if c in prepped.columns]
    if not feature_cols:
        st.error(
            "No valid feature columns found for the model. Expected something like: "
            "lemma_title, lemma_text, images, Review Length, Title Length."
        )
        return

    X = prepped[feature_cols]
    true_y = prepped["vote"]

    # Store in session_state for downstream use
    st.session_state.prepped_df = prepped
    st.session_state.X = X
    st.session_state.true_y = true_y
    st.session_state.prep_done = True

    # Reset downstream state if re-prepping
    st.session_state.probs = None
    st.session_state.model_run = False