File size: 7,723 Bytes
5d4981c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import streamlit as st
import pandas as pd
import re
import nltk  # type: ignore
from nltk.stem import WordNetLemmatizer  # type: ignore
from nltk.corpus import wordnet  # type: ignore
from nltk.tokenize import word_tokenize  # type: ignore
from typing import Optional


# -------------------------------------------------------------------
# HTML cleaning and lemmatization helpers
# -------------------------------------------------------------------


def remove_user_html_tags(text: str) -> str:
    """Decode common HTML entities, strip tags, and lowercase the text.

    This preserves the original behavior used when training the model.

    Args:
        text: Raw review text, possibly containing HTML entities and tags.
              ``None`` is treated as an empty string.

    Returns:
        The cleaned, lowercased text with all ``<...>`` tags removed.
    """
    if text is None:
        return ""

    # Decode common HTML entities back to their literal characters.
    # BUG FIX: the previous version compared already-decoded characters
    # (e.g. text.replace('"', '"')), which made every entity replacement
    # a no-op. The entity names/codes below are what actually appear in
    # raw review HTML, matching the intent stated in the old comments.
    text = text.replace('&quot;', '"')   # named entity for "
    text = text.replace('&#34;', '"')    # numeric entity for "
    text = text.replace('&#39;', "'")    # numeric entity for '
    text = text.replace('&apos;', "'")   # named entity for '
    text = text.replace('&amp;', '&')    # named entity for &
    text = text.replace('<br />', ' ')   # replace line breaks with a space
    text = text.replace('<br>', ' ')     # also handle <br>

    # Use regex to remove any remaining HTML tags (e.g., <p>, <div>, <span>)
    clean_text = re.sub(r'<[^>]+>', '', text)

    return clean_text.lower()


def get_wordnet_pos(treebank_tag: str) -> str:
    """Map a Penn Treebank POS tag to the corresponding WordNet POS tag.

    Only the first letter of the Treebank tag matters (J/V/N/R); anything
    unrecognized falls back to noun, which is the lemmatizer's default.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # An empty tag yields '' as the prefix, which also hits the noun fallback.
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)


def lemmatize_user_text(text: str) -> str:
    """Tokenize, POS-tag, and lemmatize *text*, returning a space-joined string.

    Non-string inputs are coerced to ``str`` (``None`` becomes the empty
    string) so the function is safe to use with messy dataframe columns.
    """
    if not isinstance(text, str):
        text = "" if text is None else str(text)

    lemmatizer = WordNetLemmatizer()

    # POS-tag the tokens first so the lemmatizer picks the right word form
    # (e.g. a verb tag turns "running" into "run" instead of leaving it).
    tagged = nltk.pos_tag(word_tokenize(text))

    lemmas = [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in tagged
    ]
    return " ".join(lemmas)


def prep_text_column(
    df: pd.DataFrame,
    text_col: str,
    lemma_col: str,
    overwrite: bool = False,
) -> pd.DataFrame:
    """Column-agnostic helper to clean HTML and create a lemma column.

    If ``lemma_col`` already exists and ``overwrite`` is False, ``df`` is
    returned unchanged (same object, no copy). Otherwise a copy is made and
    the expensive cleaning + lemmatization runs on ``text_col``.

    Raises:
        KeyError: if ``text_col`` is not a column of ``df``.
    """
    # Fast path: lemma column present and no recompute requested — reuse it.
    if lemma_col in df.columns and not overwrite:
        return df

    # Heavy path: copy first so the caller's dataframe is never mutated.
    result = df.copy()

    if text_col not in result.columns:
        raise KeyError(f"Column '{text_col}' not found in dataframe.")

    cleaned = (
        result[text_col]
        .fillna("")
        .apply(remove_user_html_tags)
        .astype(str)
        .str.strip()
    )
    result[text_col] = cleaned
    result[lemma_col] = cleaned.apply(lemmatize_user_text)

    return result



# -------------------------------------------------------------------
# Internal text prep for prediction
# -------------------------------------------------------------------


@st.cache_data(show_spinner='Prepping data!')
def _prep_user_text(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare core text columns for the prediction model.

    Ensures HTML cleaning + lemmatization for:
        * 'text'         -> 'lemma_text'
        * 'review_title' -> 'lemma_title'
    and the character-count features 'Review Length' / 'Title Length'.

    Safe to call when some columns already exist: lemmatization is skipped
    for any lemma column already present, and length features are only
    created when missing.
    """
    work_df = df.copy()

    # Lemmatize each (source, lemma) pair only when the lemma column is
    # absent — this is the expensive step, so we never redo it needlessly.
    for source_col, lemma_col in (('text', 'lemma_text'),
                                  ('review_title', 'lemma_title')):
        if lemma_col not in work_df.columns:
            work_df = prep_text_column(work_df, text_col=source_col,
                                       lemma_col=lemma_col)

    # Length features: raw character counts of the (possibly cleaned) text.
    for source_col, length_col in (('text', 'Review Length'),
                                   ('review_title', 'Title Length')):
        if length_col not in work_df.columns:
            work_df[length_col] = work_df[source_col].fillna('').apply(len)

    return work_df


# -------------------------------------------------------------------
# Public entry point used by the Streamlit app
# -------------------------------------------------------------------


def prep_text_and_features(model, df: Optional[pd.DataFrame] = None) -> None:
    """Run text prep and feature assembly, storing results in session_state.

    Behavior:
    - With ``df=None``, falls back to ``st.session_state.raw_df``.
    - Validates that the columns the pipeline needs are present.
    - Runs HTML cleaning + lemmatization and builds the length features,
      plus the binary target 'vote' (1 if helpful_vote > 0, else 0).
    - Selects feature columns from ``model.feature_names_in_`` when
      available, otherwise from the known training-time feature list.
    - Stores prepped_df / X / true_y / prep_done in session_state and
      resets downstream prediction state.
    """
    if df is None:
        df = st.session_state.get('raw_df')

    if df is None:
        st.warning("Upload a dataframe first.")
        return

    # Validate the columns the whole pipeline depends on.
    needed = {'helpful_vote', 'review_title', 'text', 'images'}
    absent = needed - set(df.columns)
    if absent:
        st.error(
            "The uploaded dataframe is missing required columns: "
            + ", ".join(sorted(absent))
        )
        return

    # HTML cleaning + lemmatization + length features (cached helper).
    prepped = _prep_user_text(df)

    # Binary target: a review counts as helpful if it got at least one vote.
    prepped["vote"] = prepped["helpful_vote"].apply(lambda x: 1 if x > 0 else 0)

    # Prefer the column order the fitted model recorded at training time;
    # fall back to the known training feature list when unavailable.
    fallback_features = [
        "lemma_title",
        "lemma_text",
        "images",
        "Review Length",
        "Title Length",
    ]
    candidates = list(getattr(model, "feature_names_in_", [])) or fallback_features

    # Keep only columns that actually exist in the prepped frame.
    usable = [col for col in candidates if col in prepped.columns]

    if not usable:
        st.error(
            "No valid feature columns found for the model. Expected something like: "
            "lemma_title, lemma_text, images, Review Length, Title Length."
        )
        return

    # Persist everything downstream steps need.
    st.session_state.prepped_df = prepped
    st.session_state.X = prepped[usable]
    st.session_state.true_y = prepped["vote"]
    st.session_state.prep_done = True

    # Re-prepping invalidates any earlier prediction results.
    st.session_state.probs = None
    st.session_state.model_run = False