Spaces:
Sleeping
Sleeping
import streamlit as st
import pandas as pd
import re
import nltk  # type: ignore
from nltk.stem import WordNetLemmatizer  # type: ignore
from nltk.corpus import wordnet  # type: ignore
from nltk.tokenize import word_tokenize  # type: ignore
from typing import Optional

# -------------------------------------------------------------------
# HTML cleaning and lemmatization helpers
# -------------------------------------------------------------------
def remove_user_html_tags(text: str) -> str:
    """Decode common HTML entities, strip tags, and lowercase the text.

    This preserves the original behavior used when training the model.

    Args:
        text: Raw review/title text, possibly containing HTML. May be None.

    Returns:
        Lowercased text with entities decoded and all HTML tags removed.
        Returns "" for None input.
    """
    if text is None:
        return ""
    # Decode the common HTML entities into their literal characters.
    # NOTE(review): the previous version had already-decoded characters on
    # both sides of each replace (no-ops); the entity strings are restored
    # here to match the intent stated in the original comments.
    text = text.replace('&quot;', '"')   # named entity for "
    text = text.replace('&#34;', '"')    # numeric entity for "
    text = text.replace('&#39;', "'")    # numeric entity for '
    text = text.replace('&apos;', "'")   # named entity for '
    # &amp; is decoded after the quote entities so the '&' it produces is
    # not re-interpreted as the start of another entity.
    text = text.replace('&amp;', '&')
    # Replace explicit line breaks with a space so adjacent words don't merge.
    text = text.replace('<br />', ' ')
    text = text.replace('<br>', ' ')
    # Remove any remaining HTML tags (e.g., <p>, <div>, <span>).
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text.lower()
def get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to the matching WordNet POS constant."""
    # Treebank tags encode the coarse part of speech in their first letter.
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Unrecognized (or empty) tags fall back to noun, WordNet's default.
    return pos_by_prefix.get(treebank_tag[:1], wordnet.NOUN)
def lemmatize_user_text(text: str) -> str:
    """Tokenize, POS-tag, and lemmatize *text*, returning the joined lemmas."""
    # Coerce non-string input (including None) to a safe string first.
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    lemmatizer = WordNetLemmatizer()
    # Tag each token so the lemmatizer knows its part of speech.
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    lemmas = (
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in tagged_tokens
    )
    return " ".join(lemmas)
def prep_text_column(
    df: pd.DataFrame,
    text_col: str,
    lemma_col: str,
    overwrite: bool = False,
) -> pd.DataFrame:
    """Clean HTML in *text_col* and derive a lemmatized column *lemma_col*.

    Column-agnostic helper. If *lemma_col* already exists and *overwrite*
    is False, *df* is returned unchanged, skipping the expensive work.

    Raises:
        KeyError: if *text_col* is not present in the dataframe.
    """
    # Fast path: reuse an existing lemma column unless a recompute is forced.
    if lemma_col in df.columns and not overwrite:
        return df

    if text_col not in df.columns:
        raise KeyError(f"Column '{text_col}' not found in dataframe.")

    # Copy only when we actually have heavy work to do.
    result = df.copy()
    cleaned = result[text_col].fillna("").apply(remove_user_html_tags)
    result[text_col] = cleaned.astype(str).str.strip()
    result[lemma_col] = result[text_col].apply(lemmatize_user_text)
    return result
# -------------------------------------------------------------------
# Internal text prep for prediction
# -------------------------------------------------------------------
def _prep_user_text(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare core text columns for the prediction model.

    Ensures HTML cleaning + lemmatization for:
        * 'text'         -> 'lemma_text'
        * 'review_title' -> 'lemma_title'
    and the length features 'Review Length' / 'Title Length'.

    Safe to call when some of these columns already exist: lemmatization is
    skipped and only the missing length features are added.
    """
    work_df = df.copy()
    # Lemmatize each source column only when its lemma column is absent.
    for source_col, lemma_col in (('text', 'lemma_text'),
                                  ('review_title', 'lemma_title')):
        if lemma_col not in work_df.columns:
            work_df = prep_text_column(work_df, text_col=source_col,
                                       lemma_col=lemma_col)
    # Length features are likewise created only if missing.
    if 'Review Length' not in work_df.columns:
        work_df['Review Length'] = work_df['text'].fillna('').apply(len)
    if 'Title Length' not in work_df.columns:
        work_df['Title Length'] = work_df['review_title'].fillna('').apply(len)
    return work_df
# -------------------------------------------------------------------
# Public entry point used by the Streamlit app
# -------------------------------------------------------------------
def prep_text_and_features(model, df: Optional[pd.DataFrame] = None) -> None:
    """Run text prep and feature assembly, storing results in session_state.

    Behavior:
    - If *df* is None, uses ``st.session_state.raw_df``.
    - Validates the raw columns required by the predictive model.
    - Ensures HTML cleaning + lemmatization for title and text, plus
      'Review Length', 'Title Length', and the binary target 'vote'
      (1 if helpful_vote > 0 else 0).
    - Builds the feature matrix X from ``model.feature_names_in_`` when
      available, falling back to the columns this pipeline produces.
    - Stores ``prepped_df``, ``X``, ``true_y`` and the ``prep_done`` flag,
      and resets downstream prediction state.
    """
    if df is None:
        df = st.session_state.get('raw_df')
        if df is None:
            st.warning("Upload a dataframe first.")
            return

    # Validate the raw columns the rest of the pipeline depends on.
    required = {'helpful_vote', 'review_title', 'text', 'images'}
    absent = required.difference(df.columns)
    if absent:
        st.error(
            "The uploaded dataframe is missing required columns: "
            + ", ".join(sorted(absent))
        )
        return

    # Core text prep (HTML + lemma + length features).
    prepped = _prep_user_text(df)

    # Binary helpfulness target: 1 when the review received any helpful vote.
    prepped["vote"] = prepped["helpful_vote"].apply(lambda v: 1 if v > 0 else 0)

    # Prefer the exact feature order the fitted model recorded; otherwise
    # fall back to the columns this pipeline is known to produce.
    fallback_features = [
        "lemma_title",
        "lemma_text",
        "images",
        "Review Length",
        "Title Length",
    ]
    wanted = list(getattr(model, "feature_names_in_", [])) or fallback_features
    # Keep only columns that actually exist in the prepped frame.
    feature_cols = [col for col in wanted if col in prepped.columns]
    if not feature_cols:
        st.error(
            "No valid feature columns found for the model. Expected something like: "
            "lemma_title, lemma_text, images, Review Length, Title Length."
        )
        return

    # Persist everything downstream steps need.
    st.session_state.prepped_df = prepped
    st.session_state.X = prepped[feature_cols]
    st.session_state.true_y = prepped["vote"]
    st.session_state.prep_done = True
    # Invalidate any previous prediction results when re-prepping.
    st.session_state.probs = None
    st.session_state.model_run = False