File size: 7,331 Bytes
57b215d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# ============================================
# Import Libraries
# ============================================
import streamlit as st
import re
import pickle
import joblib
import nltk
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from huggingface_hub import hf_hub_download

# ============================================
# Setup NLTK
# ============================================
# Streamlit re-executes this whole script on every user interaction, so this
# top-level setup runs repeatedly in the same process. Guard both the search
# path (nltk.data.path persists across reruns and would grow with duplicates)
# and the downloads (only hit the network when a corpus is actually missing),
# and keep downloads quiet to avoid log spam.
nltk_data_path = os.path.join("/tmp", "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)
for _resource, _locator in (
    ("stopwords", "corpora/stopwords"),
    ("punkt", "tokenizers/punkt"),
):
    try:
        nltk.data.find(_locator)
    except LookupError:
        nltk.download(_resource, download_dir=nltk_data_path, quiet=True)

# ============================================
# Loading Info
# ============================================
# Shown immediately so users see feedback while the (large) models below are
# downloaded on first run; subsequent runs hit the st.cache_resource cache.
st.markdown(
    '<p style="color:gray; font-size:14px; font-style:italic;">'
    'Loading models (≈200 MB) and resources... this may take a while on first run. '
    'Please be patient and DO NOT refresh the page :)'
    '</p>',
    unsafe_allow_html=True
)

# ============================================
# Hugging Face Hub Repo
# ============================================
# All model artifacts (Keras model, tokenizer, params, FASTopic models) live
# in this single Hub repository.
repo_id = "BesottenJenny/acre-sentiment-models"

# ============================================
# Cached Loading Functions
# ============================================
@st.cache_resource
def load_sentiment_model():
    """Download the Keras sentiment classifier from the Hub and load it.

    Cached by Streamlit so the download and deserialization happen once
    per server process, not once per script rerun.
    """
    local_path = hf_hub_download(repo_id=repo_id, filename="best_model.keras")
    model = keras.models.load_model(local_path)
    return model

@st.cache_resource
def load_tokenizer_params():
    """Fetch and unpickle the fitted tokenizer and its preprocessing params.

    Returns a (tokenizer, params) tuple. Cached once per server process.
    NOTE(review): pickle.load executes arbitrary code if the Hub repo were
    compromised — acceptable here only because the repo is project-owned.
    """
    def _fetch_pickle(filename):
        # Download the artifact, then deserialize it.
        local = hf_hub_download(repo_id=repo_id, filename=filename)
        with open(local, "rb") as fh:
            return pickle.load(fh)

    return _fetch_pickle("tokenizer.pkl"), _fetch_pickle("params.pkl")

@st.cache_resource
def load_topic_models():
    """Fetch the FASTopic topic models, one per sentiment polarity.

    Returns (negative_model, positive_model). Cached once per server process.
    """
    negative_model, positive_model = (
        joblib.load(hf_hub_download(repo_id=repo_id, filename=name))
        for name in ("fastopic_negative_model.pkl", "fastopic_positive_model.pkl")
    )
    return negative_model, positive_model

# ============================================
# Load all resources once
# ============================================
# Module-level loads: st.cache_resource makes these cheap on every script
# rerun after the first successful download.
sentiment_model = load_sentiment_model()
tokenizer, params = load_tokenizer_params()
topic_model_neg, topic_model_pos = load_topic_models()

# Sequence length used when padding model input below — presumably the length
# the sentiment model was trained with (stored alongside the tokenizer).
max_len = params["max_len"]

# ============================================
# Preprocessing Function (NLTK)
# ============================================
# Negation words are removed from the stopword list: they can flip sentiment
# and must survive preprocessing.
negations = {"not", "no", "never"}
stpwrds_en = set(stopwords.words("english")) - negations
stemmer = PorterStemmer()

# Domain-specific normalizations applied token-by-token before stopword
# removal and stemming: airline jargon ("sia" -> "sq", "aircraft" -> "plane")
# and common inflections mapped to one canonical form.
replacements = {
    "sia": "sq",
    "flown": "fly",
    "flew": "fly",
    "alway": "always",
    "boarding": "board",
    "told": "tell",
    "said": "say",
    "booked": "book",
    "paid": "pay",
    "well": "good",
    "aircraft": "plane"
}

def text_preprocessing(text):
    """Normalize a raw review into the stemmed token string the tokenizer expects.

    Steps: lowercase, collapse newlines, drop non-alphanumeric characters,
    tokenize, apply domain-specific word replacements, remove
    (negation-preserving) stopwords, and Porter-stem.

    Returns the sentinel "emptytext" when no tokens survive, so downstream
    tokenization never sees an empty string.
    """
    text = text.lower()
    # BUG FIX: the original pattern r"\\n" matched a literal backslash
    # followed by 'n', not a newline character. Handle both the escaped
    # two-character sequence and real newline/CR characters.
    text = re.sub(r"(?:\\n|[\r\n])+", " ", text)
    text = text.strip()
    # Keep only lowercase letters, digits and whitespace.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    tokens = word_tokenize(text)
    tokens = [replacements.get(word, word) for word in tokens]
    tokens = [word for word in tokens if word not in stpwrds_en]
    tokens = [stemmer.stem(word) for word in tokens]
    if not tokens:
        return "emptytext"
    return ' '.join(tokens)

# ============================================
# Streamlit App
# ============================================
def run():
    """Render the ACRE demo page.

    Collects a customer review through a Streamlit form, then — on submit —
    echoes the input, predicts sentiment with the cached Keras model, and
    runs the polarity-matched FASTopic model on the raw review text.
    """
    st.title("ACRE - Automated Customer Review Analysis")
    st.subheader("Sentiment & Topic Prediction for SQ Customer Reviews")
    st.markdown(
        """

        This section will help you understand how the **ACRE** system works.  

        Simply fill in the form below with either a dummy or real customer review, and the system will:  



        1. **Preprocess** your review text (cleaning, tokenization, and stemming).  

        2. **Predict sentiment** (Positive or Negative) along with a confidence score.  

        3. **Identify the most relevant topic** associated with the review, based on the predicted sentiment.  



        Use this tool to simulate how Singapore Airlines can transform raw customer feedback into **structured, data-driven insights**.

        """
    )

    # Input form: nothing below executes until the user presses Predict.
    with st.form(key='SQ-sentiment-analysis'):
        review_date = st.date_input("Review Date")
        review_platform = st.selectbox('Review Platform', ('Mobile', 'Desktop'), index=0)
        review_rating = st.number_input('Rating', min_value=0, max_value=5, value=3, step=1)
        st.markdown('---')
        review_text = st.text_input('Customer Review', value='--customer review--')
        review_title = st.text_input('Review Title', value='--review title--')
        helpful_vote = st.slider('Helpful Vote', min_value=0, max_value=200, value=50, step=1)
        st.markdown('---')
        submitted = st.form_submit_button('Predict')

    # Guard clause: on a plain page render (no submit) there is nothing to do.
    if not submitted:
        return

    st.markdown("---")
    st.write("### Input Data")
    input_record = {
        'published_date': review_date,
        'published_platform': review_platform,
        'rating': review_rating,
        'type': 'Review',
        'text': review_text,
        'title': review_title,
        'helpful_votes': helpful_vote
    }
    st.dataframe(pd.DataFrame([input_record]))

    # Text -> cleaned token string -> integer sequence -> fixed-length input.
    cleaned = text_preprocessing(review_text)
    encoded = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(encoded, maxlen=max_len, padding="post", truncating="post")

    # Sentiment prediction: argmax class plus the winning probability as a
    # confidence score. Assumes a 2-class softmax output — TODO confirm.
    class_probs = sentiment_model.predict(model_input)
    predicted_idx = np.argmax(class_probs, axis=1)[0]
    confidence = float(np.max(class_probs))
    label_map = {0: "Negative", 1: "Positive"}
    sentiment_label = label_map[predicted_idx]

    st.write("### Sentiment Prediction")
    if sentiment_label == "Negative":
        st.markdown(f"<h3 style='color:red;'>Predicted Sentiment: {sentiment_label}</h3>", unsafe_allow_html=True)
    else:
        st.markdown(f"<h3 style='color:green;'>Predicted Sentiment: {sentiment_label}</h3>", unsafe_allow_html=True)
    st.write(f"**Confidence:** {confidence:.2f}")

    # Topic modeling: pick the FASTopic model matching the predicted polarity.
    # NOTE: the topic models receive the RAW review text, not the cleaned form.
    st.write("### Topic Modeling")
    if sentiment_label == "Negative":
        topics, probs = topic_model_neg.transform([review_text])
        st.write("**Using Negative Model**")
        st.markdown(f"<p style='color:red;'>Topic ID(s): {topics}</p>", unsafe_allow_html=True)
    else:
        topics, probs = topic_model_pos.transform([review_text])
        st.write("**Using Positive Model**")
        st.markdown(f"<p style='color:green;'>Topic ID(s): {topics}</p>", unsafe_allow_html=True)

    st.write(f"**Probabilities:** {probs.tolist()}")

# ============================================
# Run App
# ============================================
# Script entry point: Streamlit executes this module top-to-bottom, so this
# simply invokes the page renderer.
if __name__ == "__main__":
    run()