import streamlit as st
import pandas as pd
import re

import nltk  # type: ignore
from nltk.stem import WordNetLemmatizer  # type: ignore
from nltk.corpus import wordnet  # type: ignore
from nltk.tokenize import word_tokenize  # type: ignore

from typing import Optional


# -------------------------------------------------------------------
# HTML cleaning and lemmatization helpers
# -------------------------------------------------------------------

def remove_user_html_tags(text: str) -> str:
    """Remove basic HTML entities/tags and lowercase the text.

    This preserves the original behavior used when training the model.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML entities and tags.

    Returns
    -------
    str
        Lowercased text with common entities decoded and tags stripped.
    """
    if text is None:
        return ""

    # Replace common HTML entities with their corresponding characters.
    # NOTE(review): the source file was mangled by an HTML renderer which
    # decoded these entity literals in place (making the replacements
    # no-ops). The literals below were reconstructed from the surviving
    # comments — confirm they match the training-time preprocessing.
    text = text.replace('&#34;', '"')    # numeric entity for "
    text = text.replace('&quot;', '"')   # named entity for "
    text = text.replace('&#x27;', "'")   # hex numeric entity for '
    text = text.replace('&#39;', "'")    # decimal numeric entity for '
    text = text.replace('&amp;', '&')    # named entity for &
    text = text.replace('<br>', ' ')     # replace line breaks with a space
    text = text.replace('<br/>', ' ')    # also handle self-closing <br/>

    # Use regex to remove any remaining HTML tags (e.g., <p>, </p>, <div>)
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text.lower()


def get_wordnet_pos(treebank_tag: str) -> str:
    """Converts treebank POS tags to WordNet POS tags."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # Default to noun if the tag is not recognized
        return wordnet.NOUN


def lemmatize_user_text(text: str) -> str:
    """Tokenizes, POS-tags, and lemmatizes a string of text.

    Parameters
    ----------
    text : str
        Cleaned review text. Non-string input is coerced to str
        (``None`` becomes the empty string).

    Returns
    -------
    str
        Space-joined lemmas of the input tokens.
    """
    if not isinstance(text, str):
        text = "" if text is None else str(text)

    lemmatizer = WordNetLemmatizer()

    # 1. Tokenize the text into words
    tokens = word_tokenize(text)

    # 2. Get the part-of-speech tag for each token
    tagged_tokens = nltk.pos_tag(tokens)

    # 3. Lemmatize each word with its corresponding POS tag
    lemmatized_output = []
    for word, tag in tagged_tokens:
        pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos=pos)
        lemmatized_output.append(lemma)

    return " ".join(lemmatized_output)


def prep_text_column(
    df: pd.DataFrame,
    text_col: str,
    lemma_col: str,
    overwrite: bool = False,
) -> pd.DataFrame:
    """
    Column-agnostic helper to clean HTML and create a lemma column.

    - If lemma_col already exists and overwrite=False, we return df unchanged.
    - Otherwise we copy df and do the expensive cleaning + lemmatization.

    Raises
    ------
    KeyError
        If `text_col` is not present in the dataframe.
    """
    # Fast path: if lemma already exists and we don't want to recompute,
    # just reuse it (the input dataframe is returned as-is, not copied).
    if lemma_col in df.columns and not overwrite:
        return df

    # Only now do we copy and do heavy work
    df_out = df.copy()

    if text_col not in df_out.columns:
        raise KeyError(f"Column '{text_col}' not found in dataframe.")

    df_out[text_col] = (
        df_out[text_col]
        .fillna("")
        .apply(remove_user_html_tags)
        .astype(str)
        .str.strip()
    )
    df_out[lemma_col] = df_out[text_col].apply(lemmatize_user_text)
    return df_out


# -------------------------------------------------------------------
# Internal text prep for prediction
# -------------------------------------------------------------------

@st.cache_data(show_spinner='Prepping data!')
def _prep_user_text(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare core text columns for the prediction model.

    This function:
    - Ensures HTML cleaning + lemmatization for:
        * 'text'         -> 'lemma_text'
        * 'review_title' -> 'lemma_title'
    - Ensures the length features:
        * 'Review Length'
        * 'Title Length'

    It is safe to call even if some of these columns already exist;
    in that case, lemmatization is skipped and only length features
    are added if needed.
    """
    work_df = df.copy()

    # Only lemmatize if the lemma columns are missing
    if 'lemma_text' not in work_df.columns:
        work_df = prep_text_column(work_df, text_col='text', lemma_col='lemma_text')
    if 'lemma_title' not in work_df.columns:
        work_df = prep_text_column(work_df, text_col='review_title', lemma_col='lemma_title')

    # Ensure length features (only create if missing)
    if 'Review Length' not in work_df.columns:
        work_df['Review Length'] = work_df['text'].fillna('').apply(len)
    if 'Title Length' not in work_df.columns:
        work_df['Title Length'] = work_df['review_title'].fillna('').apply(len)

    return work_df


# -------------------------------------------------------------------
# Public entry point used by the Streamlit app
# -------------------------------------------------------------------

def prep_text_and_features(model, df: Optional[pd.DataFrame] = None) -> None:
    """Run text prep and feature assembly, storing results in session_state.

    Behavior:
    - If `df` is None, uses `st.session_state.raw_df` (current app behavior).
    - Checks that required columns are present for the predictive model.
    - Ensures HTML+lemma for title and text, and creates:
        * 'Review Length'
        * 'Title Length'
        * 'vote' (binary target: 1 if helpful_vote > 0 else 0)
    - Builds the feature matrix X based on `model.feature_names_in_`:
        ['lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length']
    - Stores:
        * prepped_df
        * X
        * true_y
        * prep_done flag
        * resets downstream prediction state
    """
    if df is None:
        df = st.session_state.get('raw_df')

    if df is None:
        st.warning("Upload a dataframe first.")
        return

    # Make sure the core columns are present
    required_cols = {'helpful_vote', 'review_title', 'text', 'images'}
    missing = required_cols - set(df.columns)
    if missing:
        st.error(
            "The uploaded dataframe is missing required columns: "
            + ", ".join(sorted(missing))
        )
        return

    # Core text prep (HTML + lemma + length features)
    prepped = _prep_user_text(df)

    # Create binary target
    prepped["vote"] = prepped["helpful_vote"].apply(lambda x: 1 if x > 0 else 0)

    # Assemble features expected by the predictive model
    # Your model expects:
    #   'lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length'
    # We still respect model.feature_names_in_ for robustness.
    feature_cols = list(getattr(model, "feature_names_in_", [])) or [
        "lemma_title",
        "lemma_text",
        "images",
        "Review Length",
        "Title Length",
    ]

    # Keep only columns that actually exist
    feature_cols = [c for c in feature_cols if c in prepped.columns]

    if not feature_cols:
        st.error(
            "No valid feature columns found for the model. Expected something like: "
            "lemma_title, lemma_text, images, Review Length, Title Length."
        )
        return

    X = prepped[feature_cols]
    true_y = prepped["vote"]

    # Store in session_state for downstream use
    st.session_state.prepped_df = prepped
    st.session_state.X = X
    st.session_state.true_y = true_y
    st.session_state.prep_done = True

    # Reset downstream state if re-prepping
    st.session_state.probs = None
    st.session_state.model_run = False