File size: 7,723 Bytes
5d4981c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import streamlit as st
import pandas as pd
import re
import nltk  # type: ignore
from nltk.stem import WordNetLemmatizer  # type: ignore
from nltk.corpus import wordnet  # type: ignore
from nltk.tokenize import word_tokenize  # type: ignore
from typing import Optional


# -------------------------------------------------------------------
# HTML cleaning and lemmatization helpers
# -------------------------------------------------------------------


def remove_user_html_tags(text: str) -> str:
    """Decode common HTML entities, strip tags, and lowercase the text.

    This preserves the original behavior used when training the model.

    Args:
        text: Raw review text, possibly containing HTML entities and tags.
              ``None`` is treated as an empty string.

    Returns:
        The cleaned, lowercased text with all ``<...>`` tags removed.
    """
    if text is None:
        return ""

    # Decode common HTML entities back to their literal characters.
    # BUG FIX: the previous version compared already-decoded characters
    # (e.g. text.replace('"', '"')), which made every entity replacement
    # a no-op. The entity names/codes below are what actually appear in
    # raw review HTML, matching the intent stated in the old comments.
    text = text.replace('&quot;', '"')   # named entity for "
    text = text.replace('&#34;', '"')    # numeric entity for "
    text = text.replace('&#39;', "'")    # numeric entity for '
    text = text.replace('&apos;', "'")   # named entity for '
    text = text.replace('&amp;', '&')    # named entity for &
    text = text.replace('<br />', ' ')   # replace line breaks with a space
    text = text.replace('<br>', ' ')     # also handle <br>

    # Use regex to remove any remaining HTML tags (e.g., <p>, <div>, <span>)
    clean_text = re.sub(r'<[^>]+>', '', text)

    return clean_text.lower()


def get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to the corresponding WordNet POS tag.

    Only the first letter of the Treebank tag matters (J/V/N/R); anything
    unrecognized falls back to noun, which is the lemmatizer's default.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # An empty tag yields '' as the prefix, which also hits the noun fallback.
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)


def lemmatize_user_text(text: str) -> str:
    """Tokenize, POS-tag, and lemmatize *text*, returning a space-joined string.

    Non-string inputs are coerced to ``str`` (``None`` becomes the empty
    string) so the function is safe to use with messy dataframe columns.
    """
    if not isinstance(text, str):
        text = "" if text is None else str(text)

    lemmatizer = WordNetLemmatizer()

    # POS-tag the tokens first so the lemmatizer picks the right word form
    # (e.g. a verb tag turns "running" into "run" instead of leaving it).
    tagged = nltk.pos_tag(word_tokenize(text))

    lemmas = [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in tagged
    ]
    return " ".join(lemmas)


def prep_text_column(
    df: pd.DataFrame,
    text_col: str,
    lemma_col: str,
    overwrite: bool = False,
) -> pd.DataFrame:
    """Column-agnostic helper to clean HTML and create a lemma column.

    If ``lemma_col`` already exists and ``overwrite`` is False, ``df`` is
    returned unchanged (same object, no copy). Otherwise a copy is made and
    the expensive cleaning + lemmatization runs on ``text_col``.

    Raises:
        KeyError: if ``text_col`` is not a column of ``df``.
    """
    # Fast path: lemma column present and no recompute requested — reuse it.
    if lemma_col in df.columns and not overwrite:
        return df

    # Heavy path: copy first so the caller's dataframe is never mutated.
    result = df.copy()

    if text_col not in result.columns:
        raise KeyError(f"Column '{text_col}' not found in dataframe.")

    cleaned = (
        result[text_col]
        .fillna("")
        .apply(remove_user_html_tags)
        .astype(str)
        .str.strip()
    )
    result[text_col] = cleaned
    result[lemma_col] = cleaned.apply(lemmatize_user_text)

    return result



# -------------------------------------------------------------------
# Internal text prep for prediction
# -------------------------------------------------------------------


@st.cache_data(show_spinner='Prepping data!')
def _prep_user_text(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare core text columns for the prediction model.

    Ensures HTML cleaning + lemmatization for:
        * 'text'         -> 'lemma_text'
        * 'review_title' -> 'lemma_title'
    and the character-count features 'Review Length' / 'Title Length'.

    Safe to call when some columns already exist: lemmatization is skipped
    for any lemma column already present, and length features are only
    created when missing.
    """
    work_df = df.copy()

    # Lemmatize each (source, lemma) pair only when the lemma column is
    # absent — this is the expensive step, so we never redo it needlessly.
    for source_col, lemma_col in (('text', 'lemma_text'),
                                  ('review_title', 'lemma_title')):
        if lemma_col not in work_df.columns:
            work_df = prep_text_column(work_df, text_col=source_col,
                                       lemma_col=lemma_col)

    # Length features: raw character counts of the (possibly cleaned) text.
    for source_col, length_col in (('text', 'Review Length'),
                                   ('review_title', 'Title Length')):
        if length_col not in work_df.columns:
            work_df[length_col] = work_df[source_col].fillna('').apply(len)

    return work_df


# -------------------------------------------------------------------
# Public entry point used by the Streamlit app
# -------------------------------------------------------------------


def prep_text_and_features(model, df: Optional[pd.DataFrame] = None) -> None:
    """Run text prep and feature assembly, storing results in session_state.

    Behavior:
    - With ``df=None``, falls back to ``st.session_state.raw_df``.
    - Validates that the columns the pipeline needs are present.
    - Runs HTML cleaning + lemmatization and builds the length features,
      plus the binary target 'vote' (1 if helpful_vote > 0, else 0).
    - Selects feature columns from ``model.feature_names_in_`` when
      available, otherwise from the known training-time feature list.
    - Stores prepped_df / X / true_y / prep_done in session_state and
      resets downstream prediction state.
    """
    if df is None:
        df = st.session_state.get('raw_df')

    if df is None:
        st.warning("Upload a dataframe first.")
        return

    # Validate the columns the whole pipeline depends on.
    needed = {'helpful_vote', 'review_title', 'text', 'images'}
    absent = needed - set(df.columns)
    if absent:
        st.error(
            "The uploaded dataframe is missing required columns: "
            + ", ".join(sorted(absent))
        )
        return

    # HTML cleaning + lemmatization + length features (cached helper).
    prepped = _prep_user_text(df)

    # Binary target: a review counts as helpful if it got at least one vote.
    prepped["vote"] = prepped["helpful_vote"].apply(lambda x: 1 if x > 0 else 0)

    # Prefer the column order the fitted model recorded at training time;
    # fall back to the known training feature list when unavailable.
    fallback_features = [
        "lemma_title",
        "lemma_text",
        "images",
        "Review Length",
        "Title Length",
    ]
    candidates = list(getattr(model, "feature_names_in_", [])) or fallback_features

    # Keep only columns that actually exist in the prepped frame.
    usable = [col for col in candidates if col in prepped.columns]

    if not usable:
        st.error(
            "No valid feature columns found for the model. Expected something like: "
            "lemma_title, lemma_text, images, Review Length, Title Length."
        )
        return

    # Persist everything downstream steps need.
    st.session_state.prepped_df = prepped
    st.session_state.X = prepped[usable]
    st.session_state.true_y = prepped["vote"]
    st.session_state.prep_done = True

    # Re-prepping invalidates any earlier prediction results.
    st.session_state.probs = None
    st.session_state.model_run = False