Spaces:
Sleeping
Sleeping
File size: 7,723 Bytes
5d4981c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 |
import streamlit as st
import pandas as pd
import re
import nltk # type: ignore
from nltk.stem import WordNetLemmatizer # type: ignore
from nltk.corpus import wordnet # type: ignore
from nltk.tokenize import word_tokenize # type: ignore
from typing import Optional
# -------------------------------------------------------------------
# HTML cleaning and lemmatization helpers
# -------------------------------------------------------------------
def remove_user_html_tags(text: str) -> str:
    """Decode common HTML entities, strip tags, and lowercase the text.

    This mirrors the cleaning applied when the model was trained, so the
    same transformation must be used at prediction time.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML entities and markup.
        ``None`` is tolerated and treated as empty.

    Returns
    -------
    str
        Lowercased plain text with entities decoded and tags removed.
    """
    if text is None:
        return ""
    # Decode the HTML entities that appear in the review corpus.
    # BUGFIX: the published copy of this file had the entity strings
    # themselves already decoded (e.g. text.replace('"', '"')), which made
    # every call below a no-op. The intended entity strings are restored.
    text = text.replace('&#34;', '"')    # numeric entity for "
    text = text.replace('&quot;', '"')   # named entity for "
    text = text.replace('&apos;', "'")   # named entity for '
    text = text.replace('&#39;', "'")    # numeric entity for '
    text = text.replace('&amp;', '&')    # ampersand
    text = text.replace('<br />', ' ')   # line breaks become spaces
    text = text.replace('<br>', ' ')     # also handle the unspaced form
    # NOTE(review): html.unescape() would decode all entities in one pass,
    # but explicit replacements are kept to match the training-time prep.
    # Strip any remaining tags such as <p>, <div>, <span>.
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text.lower()
def get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the tag's first letter matters (J/V/N/R); any other tag —
    including an empty string — falls back to noun, WordNet's default.
    """
    tag_map = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # `[:1]` is safe on an empty string, unlike `[0]`.
    return tag_map.get(treebank_tag[:1], wordnet.NOUN)
def lemmatize_user_text(text: str) -> str:
    """Tokenize, POS-tag, and lemmatize *text*; return space-joined lemmas.

    Non-string input is coerced to ``str`` (``None`` becomes ``""``) so the
    tokenizer never sees a non-string value.
    """
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    lemmatizer = WordNetLemmatizer()
    # POS-tag the tokens so each word is lemmatized with the correct
    # part of speech (e.g. "running"/VBG -> "run", not "running"/NN).
    tagged = nltk.pos_tag(word_tokenize(text))
    lemmas = [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged
    ]
    return " ".join(lemmas)
def prep_text_column(
    df: pd.DataFrame,
    text_col: str,
    lemma_col: str,
    overwrite: bool = False,
) -> pd.DataFrame:
    """Clean the HTML in *text_col* and derive a lemmatized column.

    Column-agnostic helper. If *lemma_col* already exists and *overwrite*
    is False, *df* is returned untouched (fast path: no copy, no
    recomputation). Otherwise a copy of *df* gets the expensive cleaning
    and lemmatization.

    Raises
    ------
    KeyError
        If *text_col* is not present in the dataframe.
    """
    # Fast path: reuse an existing lemma column unless asked to recompute.
    if lemma_col in df.columns and not overwrite:
        return df
    if text_col not in df.columns:
        raise KeyError(f"Column '{text_col}' not found in dataframe.")
    # Copy only now, when heavy work is actually required.
    result = df.copy()
    cleaned = (
        result[text_col]
        .fillna("")
        .apply(remove_user_html_tags)
        .astype(str)
        .str.strip()
    )
    result[text_col] = cleaned
    result[lemma_col] = cleaned.apply(lemmatize_user_text)
    return result
# -------------------------------------------------------------------
# Internal text prep for prediction
# -------------------------------------------------------------------
@st.cache_data(show_spinner='Prepping data!')
def _prep_user_text(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure lemma columns and length features exist for the model.

    Creates, only when missing:
      * 'lemma_text'   — cleaned + lemmatized 'text'
      * 'lemma_title'  — cleaned + lemmatized 'review_title'
      * 'Review Length' / 'Title Length' — character counts of the raw text

    Safe to call on a dataframe that already carries some of these
    columns: existing ones are reused rather than recomputed.
    """
    out = df.copy()
    # (lemma column, source column) pairs — lemmatize only what's missing.
    lemma_specs = (
        ('lemma_text', 'text'),
        ('lemma_title', 'review_title'),
    )
    for lemma_col, source_col in lemma_specs:
        if lemma_col not in out.columns:
            out = prep_text_column(out, text_col=source_col, lemma_col=lemma_col)
    # (feature column, source column) pairs — length features if absent.
    length_specs = (
        ('Review Length', 'text'),
        ('Title Length', 'review_title'),
    )
    for feature_col, source_col in length_specs:
        if feature_col not in out.columns:
            out[feature_col] = out[source_col].fillna('').apply(len)
    return out
# -------------------------------------------------------------------
# Public entry point used by the Streamlit app
# -------------------------------------------------------------------
def prep_text_and_features(model, df: Optional[pd.DataFrame] = None) -> None:
    """Run text prep and feature assembly, storing results in session_state.

    Behavior:
    - If `df` is None, uses `st.session_state['raw_df']` (current app behavior).
    - Validates that the columns the predictive model needs are present.
    - Ensures HTML cleaning + lemmatization for title and text, and creates:
        * 'Review Length'
        * 'Title Length'
        * 'vote' (binary target: 1 if helpful_vote > 0 else 0)
    - Builds the feature matrix X based on `model.feature_names_in_`,
      falling back to the known training features:
        ['lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length']
    - Stores in session_state: prepped_df, X, true_y, prep_done flag,
      and resets downstream prediction state (probs, model_run).

    Parameters
    ----------
    model : object
        Fitted estimator; only its optional `feature_names_in_` is read here.
    df : pd.DataFrame, optional
        Raw reviews; defaults to the uploaded dataframe in session_state.
    """
    if df is None:
        df = st.session_state.get('raw_df')
    if df is None:
        st.warning("Upload a dataframe first.")
        return
    # Make sure the core columns are present before any heavy work.
    required_cols = {'helpful_vote', 'review_title', 'text', 'images'}
    missing = required_cols - set(df.columns)
    if missing:
        st.error(
            "The uploaded dataframe is missing required columns: "
            + ", ".join(sorted(missing))
        )
        return
    # Core text prep (HTML + lemma + length features); cached by Streamlit.
    prepped = _prep_user_text(df)
    # Binary target, vectorized instead of a per-row apply/lambda.
    # (NaN > 0 evaluates to False, matching the old lambda's 0 for missing votes.)
    prepped["vote"] = (prepped["helpful_vote"] > 0).astype(int)
    # Assemble features expected by the predictive model. We respect
    # model.feature_names_in_ when available, else fall back to the
    # feature set the model was trained with.
    feature_cols = list(getattr(model, "feature_names_in_", [])) or [
        "lemma_title",
        "lemma_text",
        "images",
        "Review Length",
        "Title Length",
    ]
    # Keep only columns that actually exist in the prepped frame.
    feature_cols = [c for c in feature_cols if c in prepped.columns]
    if not feature_cols:
        st.error(
            "No valid feature columns found for the model. Expected something like: "
            "lemma_title, lemma_text, images, Review Length, Title Length."
        )
        return
    X = prepped[feature_cols]
    true_y = prepped["vote"]
    # Store in session_state for downstream use.
    st.session_state.prepped_df = prepped
    st.session_state.X = X
    st.session_state.true_y = true_y
    st.session_state.prep_done = True
    # Reset downstream state so stale predictions are not shown after a re-prep.
    st.session_state.probs = None
    st.session_state.model_run = False
|