# ads505-app/utils/prepare_user_dataframe.py
# Author: Taylor Kirk
# Fresh deployment after moving datasets to HF datahub (commit 5d4981c)
import streamlit as st
import pandas as pd
import re
import nltk # type: ignore
from nltk.stem import WordNetLemmatizer # type: ignore
from nltk.corpus import wordnet # type: ignore
from nltk.tokenize import word_tokenize # type: ignore
from typing import Optional
# -------------------------------------------------------------------
# HTML cleaning and lemmatization helpers
# -------------------------------------------------------------------
def remove_user_html_tags(text: str) -> str:
    """Remove basic HTML entities/tags and lowercase the text.

    This preserves the original behavior used when training the model.

    Args:
        text: Raw review text, possibly containing HTML entities and tags.
            ``None`` is tolerated and treated as an empty string.

    Returns:
        Cleaned, lowercased text: common entities decoded, ``<br>`` variants
        replaced by spaces, and all remaining tags stripped.
    """
    if text is None:
        return ""
    # Replace common HTML entities with their corresponding characters.
    # BUGFIX: these replacements had degraded into no-ops (the entity names
    # themselves were HTML-decoded at some point, e.g. replace('"', '"')).
    # Restored to the entities the surrounding comments describe.
    text = text.replace('&#34;', '"')    # numeric entity for "
    text = text.replace('&quot;', '"')   # named entity for "
    text = text.replace('&#39;', "'")    # numeric entity for '
    text = text.replace('&apos;', "'")   # named entity for '
    text = text.replace('&amp;', '&')    # ampersand, decoded after the quote entities
    text = text.replace('<br />', ' ')   # replace line breaks with a space
    text = text.replace('<br>', ' ')     # also handle the unclosed form
    # Use regex to remove any remaining HTML tags (e.g., <p>, <div>, <span>)
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text.lower()
def get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to its WordNet equivalent.

    Only the first letter of the tag matters (J/V/N/R); anything else
    falls back to noun, which is WordNet's conventional default.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # treebank_tag[:1] is '' for an empty tag, which also hits the default.
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)
def lemmatize_user_text(text: str) -> str:
    """Tokenizes, POS-tags, and lemmatizes a string of text."""
    # Coerce non-string input (including None) to a safe string first.
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    lemmatizer = WordNetLemmatizer()
    # Tokenize, then attach a part-of-speech tag to every token.
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    # Lemmatize each token using its WordNet-mapped POS tag.
    lemmas = (
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    )
    return " ".join(lemmas)
def prep_text_column(
    df: pd.DataFrame,
    text_col: str,
    lemma_col: str,
    overwrite: bool = False,
) -> pd.DataFrame:
    """
    Column-agnostic helper to clean HTML and create a lemma column.

    - If lemma_col already exists and overwrite=False, we return df unchanged.
    - Otherwise we copy df and do the expensive cleaning + lemmatization.
    """
    # Fast path: lemma column already present and recomputation not requested.
    if lemma_col in df.columns and not overwrite:
        return df

    # Guard: the source text column must exist before any heavy work.
    if text_col not in df.columns:
        raise KeyError(f"Column '{text_col}' not found in dataframe.")

    # Copy only when we actually need to mutate.
    result = df.copy()
    cleaned = (
        result[text_col]
        .fillna("")
        .apply(remove_user_html_tags)
        .astype(str)
        .str.strip()
    )
    result[text_col] = cleaned
    result[lemma_col] = cleaned.apply(lemmatize_user_text)
    return result
# -------------------------------------------------------------------
# Internal text prep for prediction
# -------------------------------------------------------------------
@st.cache_data(show_spinner='Prepping data!')
def _prep_user_text(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare core text columns for the prediction model.

    This function:
    - Ensures HTML cleaning + lemmatization for:
        * 'text' -> 'lemma_text'
        * 'review_title' -> 'lemma_title'
    - Ensures the length features:
        * 'Review Length'
        * 'Title Length'

    It is safe to call even if some of these columns already exist; in that
    case, lemmatization is skipped and only length features are added if
    needed.
    """
    out = df.copy()

    # Lemmatize each (lemma column, source column) pair only when missing.
    for lemma_col, source_col in (
        ('lemma_text', 'text'),
        ('lemma_title', 'review_title'),
    ):
        if lemma_col not in out.columns:
            out = prep_text_column(out, text_col=source_col, lemma_col=lemma_col)

    # Add character-length features only when missing.
    for length_col, source_col in (
        ('Review Length', 'text'),
        ('Title Length', 'review_title'),
    ):
        if length_col not in out.columns:
            out[length_col] = out[source_col].fillna('').apply(len)

    return out
# -------------------------------------------------------------------
# Public entry point used by the Streamlit app
# -------------------------------------------------------------------
def prep_text_and_features(model, df: Optional[pd.DataFrame] = None) -> None:
    """Run text prep and feature assembly, storing results in session_state.

    Behavior:
    - If `df` is None, uses `st.session_state.raw_df` (current app behavior).
    - Checks that required columns are present for the predictive model.
    - Ensures HTML+lemma for title and text, and creates:
        * 'Review Length'
        * 'Title Length'
        * 'vote' (binary target: 1 if helpful_vote > 0 else 0)
    - Builds the feature matrix X based on `model.feature_names_in_`:
        ['lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length']
    - Stores in session_state:
        * prepped_df
        * X
        * true_y
        * prep_done flag
        * resets downstream prediction state (probs, model_run)
    """
    if df is None:
        df = st.session_state.get('raw_df')
        if df is None:
            st.warning("Upload a dataframe first.")
            return

    # Make sure the core columns are present
    required_cols = {'helpful_vote', 'review_title', 'text', 'images'}
    missing = required_cols - set(df.columns)
    if missing:
        st.error(
            "The uploaded dataframe is missing required columns: "
            + ", ".join(sorted(missing))
        )
        return

    # Core text prep (HTML + lemma + length features)
    prepped = _prep_user_text(df)

    # Create binary target. Vectorized comparison instead of a row-wise
    # lambda; NaN > 0 evaluates to False, so NaN maps to 0 exactly like
    # the previous apply() did.
    prepped["vote"] = (prepped["helpful_vote"] > 0).astype(int)

    # Assemble features expected by the predictive model. We respect
    # model.feature_names_in_ when available, falling back to the known
    # training-time feature list otherwise.
    feature_cols = list(getattr(model, "feature_names_in_", [])) or [
        "lemma_title",
        "lemma_text",
        "images",
        "Review Length",
        "Title Length",
    ]

    # Keep only columns that actually exist
    feature_cols = [c for c in feature_cols if c in prepped.columns]
    if not feature_cols:
        st.error(
            "No valid feature columns found for the model. Expected something like: "
            "lemma_title, lemma_text, images, Review Length, Title Length."
        )
        return

    X = prepped[feature_cols]
    true_y = prepped["vote"]

    # Store in session_state for downstream use
    st.session_state.prepped_df = prepped
    st.session_state.X = X
    st.session_state.true_y = true_y
    st.session_state.prep_done = True

    # Reset downstream state if re-prepping
    st.session_state.probs = None
    st.session_state.model_run = False