Sentiment_Analysis_And_Topic_Modelling

Sleeping

App Files Files Community

hanantonio committed on Sep 10, 2025

Commit

57b215d

verified ·

1 Parent(s): a84f008

Upload 21 files

Browse files

Files changed (22) hide show

.gitattributes +3 -0
Dockerfile +20 -20
README.md +19 -19
requirements.txt +17 -3
src/Negative - Top Words Distributions.png +0 -0
src/Negative - Topic Activities Over Time.png +3 -0
src/Negative - Topics Weights.png +0 -0
src/Positive - Top Words Distributions.png +0 -0
src/Positive - Topic Activities Over Time.png +3 -0
src/Positive - Topics Weights.png +0 -0
src/__pycache__/eda.cpython-39.pyc +0 -0
src/__pycache__/prediction_src.cpython-39.pyc +0 -0
src/app.py +60 -0
src/best_lstm_model.h5 +3 -0
src/best_model.keras +3 -0
src/eda.py +111 -0
src/fastopic_negative_model.pkl +3 -0
src/fastopic_positive_model.pkl +3 -0
src/params.pkl +3 -0
src/prediction_compile.py +194 -0
src/singapore_airlines_reviews.csv +0 -0
src/tokenizer.pkl +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+src/best_model.keras filter=lfs diff=lfs merge=lfs -text
+src/Negative[[:space:]]-[[:space:]]Topic[[:space:]]Activities[[:space:]]Over[[:space:]]Time.png filter=lfs diff=lfs merge=lfs -text
+src/Positive[[:space:]]-[[:space:]]Topic[[:space:]]Activities[[:space:]]Over[[:space:]]Time.png filter=lfs diff=lfs merge=lfs -text

Dockerfile CHANGED Viewed

@@ -1,20 +1,20 @@
-FROM python:3.13.5-slim
-WORKDIR /app
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt ./
-COPY src/ ./src/
-RUN pip3 install -r requirements.txt
-EXPOSE 8501
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+FROM python:3.13.5-slim
+WORKDIR /app
+# curl is needed by the HEALTHCHECK below.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+COPY requirements.txt ./
+COPY src/ ./src/
+RUN pip3 install -r requirements.txt
+EXPOSE 8501
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+# Launch the ACRE entry point added in this commit (src/app.py) instead of the
+# template script src/streamlit_app.py.
+ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]

README.md CHANGED Viewed

@@ -1,19 +1,19 @@
----
-title: SQ Sentiment Analysis
-emoji: 🚀
-colorFrom: red
-colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-- streamlit
-pinned: false
-short_description: Streamlit template space
----
-# Welcome to Streamlit!
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).

+---
+title: acre-system
+emoji: 🚀
+colorFrom: red
+colorTo: red
+sdk: docker
+app_port: 8501
+tags:
+- streamlit
+pinned: false
+short_description: Streamlit template space
+---
+# Welcome to Streamlit!
+Edit `/src/app.py` to customize this app to your heart's desire. :heart:
+If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
+forums](https://discuss.streamlit.io).

requirements.txt CHANGED Viewed

@@ -1,3 +1,17 @@
-altair
-pandas
-streamlit

+streamlit==1.44.0
+pandas==2.2.3
+seaborn
+matplotlib
+plotly
+pillow
+numpy
+wordcloud
+fastopic==1.0.1
+topmost==1.0.2
+# torchvision 0.21.0 is built against torch 2.6.0; keep the two pins in step.
+torchvision==0.21.0
+gensim==4.3.3
+torch==2.6.0
+joblib==1.2.0
+scikit-learn==1.6.1
+tensorflow==2.20.0
+nltk
+# Imported directly by src/prediction_compile.py (hf_hub_download).
+huggingface_hub

src/Negative - Top Words Distributions.png ADDED Viewed

src/Negative - Topic Activities Over Time.png ADDED Viewed

Git LFS Details

SHA256: c9021d82de6adf0b4ab744b9b701d86f4d608e0301c8303acc24966245e87782
Pointer size: 131 Bytes
Size of remote file: 163 kB

src/Negative - Topics Weights.png ADDED Viewed

src/Positive - Top Words Distributions.png ADDED Viewed

src/Positive - Topic Activities Over Time.png ADDED Viewed

Git LFS Details

SHA256: 54cf3b7587f84e87206dccbb9ead680bf47e9ca7b8ca422b6e84f51dcce7bed1
Pointer size: 131 Bytes
Size of remote file: 114 kB

src/Positive - Topics Weights.png ADDED Viewed

src/__pycache__/eda.cpython-39.pyc ADDED Viewed

Binary file (3.55 kB). View file

src/__pycache__/prediction_src.cpython-39.pyc ADDED Viewed

Binary file (6.22 kB). View file

src/app.py ADDED Viewed

	@@ -0,0 +1,60 @@

+# import streamlit as st
+# import eda
+# import prediction_src
+# ===============================
+# SQ_streamlit_app.py
+# ===============================
+# Entry script: configures the page, lets the user pick a page in the sidebar,
+# delegates rendering to eda.run() or prediction_compile.run(), then draws a footer.
+import streamlit as st
+# ===============================
+# Streamlit Config
+# ===============================
+st.set_page_config(
+    page_title='ACRE - Automated Customer Review Analysis',
+    layout='wide',
+    initial_sidebar_state='expanded'
+)
+# st.markdown(
+#     """
+#     **ACRE** (Automated Customer Reviews Analysis) is a system designed to classify customer sentiment towards
+#     their flight experience with Singapore Airlines (SQ). It transforms raw customer feedback into structured insights,
+#     empowering management to make data-driven decisions and continuously enhance SQ’s reputation for service excellence.
+#     """
+# )
+# Import custom pages (make sure these modules have no st.* calls at global scope)
+# NOTE(review): presumably imported after st.set_page_config because importing a
+# page module runs its top-level code, and set_page_config has to be the first
+# Streamlit command - confirm before moving these imports to the top of the file.
+import eda
+import prediction_compile
+# ===============================
+# Sidebar Navigation
+# ===============================
+page = st.sidebar.selectbox(
+    'Select Page:',
+    ('Exploratory Data Analysis (EDA)', 'Prediction')
+)
+# ===============================
+# Page Content
+# ===============================
+# st.title("ACRE - Automated Customer Review Analysis")
+# Any selection other than the EDA entry falls through to the prediction page.
+if page == 'Exploratory Data Analysis (EDA)':
+    eda.run()
+else:
+    prediction_compile.run()
+# ===============================
+# Footer
+# ===============================
+st.markdown(
+    """
+    <div style="text-align: center; color: gray; font-size: 12px; margin-top: 50px;">
+        © 2025 Hana Antonio, Muhammad Revi Gilang Pradana, Zhaky B. Triaji. All rights reserved. <br>
+        References: Dataset from <a href="https://www.kaggle.com" target="_blank" style="color: gray;">Kaggle</a>
+    </div>
+    """,
+    unsafe_allow_html=True
+)

src/best_lstm_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8f3aa3bdc5dbc925914ff1463382d8f05090cb1e7e9ece6b2a8e1546d7f7630
+size 8057368

src/best_model.keras ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd4d974bd880724b25a438f7de32d562951740369b972ff6ce80562dc86417ae
+size 8048001

src/eda.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import plotly.express as px
+from PIL import Image
+# =============================================
+# Cache the dataset so it is not re-read on every rerun
+# =============================================
+@st.cache_data
+def load_data():
+    """Read the Singapore Airlines reviews CSV and return it as a DataFrame."""
+    return pd.read_csv('./src/singapore_airlines_reviews.csv')
+# Module-level dataset used by run()
+df = load_data()
+# =============================================
+# Main app
+# =============================================
+def run():
+    """Render the Exploratory Data Analysis page.
+    Shows a preview of the module-level ``df``, a count plot of its ``rating``
+    column, a histogram of review lengths (in words) computed from its ``text``
+    column, and the topic-modelling images stored under ``./src``.
+    """
+    # Title and subtitle
+    st.title("ACRE - Automated Customer Review Analysis")
+    st.subheader("Exploratory Data Analysis (EDA)")
+    st.markdown(
+        """
+        This section provides an exploratory data analysis (EDA) of Singapore Airlines (SQ) customer reviews.
+        We aim to understand the distribution of ratings, textual review characteristics, and topic modeling results.
+        These insights serve as the foundation for building automated models that classify sentiment and uncover key themes
+        in customer feedback.
+        """
+    )
+    # ===============================
+    # Dataset Preview
+    # ===============================
+    st.write("### Dataset Preview")
+    st.dataframe(df.head())
+    # ===============================
+    # Rating Distribution
+    # ===============================
+    st.write("### Distribution of Ratings")
+    fig, ax = plt.subplots(figsize=(8, 5))
+    sns.countplot(x='rating', data=df, palette='viridis', ax=ax,
+                  order=sorted(df['rating'].unique()))
+    for p in ax.patches:
+        # get_height() returns a float; cast so the label reads "1,234" rather than "1,234.0"
+        height = int(p.get_height())
+        ax.annotate(f'{height:,}', (p.get_x() + p.get_width()/2, height),
+                    ha='center', va='bottom', fontsize=10, fontweight='bold')
+    st.pyplot(fig)
+    # Close the figure so repeated reruns do not accumulate open matplotlib figures
+    plt.close(fig)
+    st.markdown(
+        """
+        **Note:** Ratings are explored here only as descriptive information about passenger experiences.
+        In the inference page, actual sentiment will be predicted automatically from the review text using NLP techniques.
+        """
+    )
+    # ===============================
+    # Text Length Analysis
+    # ===============================
+    st.write("### Distribution of Review Length")
+    # Word counts go into their own frame: `df` is module-level state that persists
+    # between reruns, so adding a column to it would show up in the
+    # "Dataset Preview" table on every later run.
+    length_df = df['text'].apply(lambda x: len(str(x).split())).to_frame('text_length')
+    fig = px.histogram(length_df, x='text_length', nbins=50, title='Review Length Distribution')
+    st.plotly_chart(fig, use_container_width=True)
+    # ===============================
+    # Topic Modeling Results (Images)
+    # ===============================
+    st.write("## Topic Modeling Results")
+    # 1. Top Words Distributions
+    col1, col2 = st.columns(2)
+    with col1:
+        st.image("./src/Negative - Top Words Distributions.png", caption="Negative - Top Words Distributions")
+    with col2:
+        st.image("./src/Positive - Top Words Distributions.png", caption="Positive - Top Words Distributions")
+    st.write("Lorem ipsum explanation for Top Words Distributions.")
+    # 2. Topic Activities Over Time
+    col1, col2 = st.columns(2)
+    with col1:
+        st.image("./src/Negative - Topic Activities Over Time.png", caption="Negative - Topic Activities Over Time")
+    with col2:
+        st.image("./src/Positive - Topic Activities Over Time.png", caption="Positive - Topic Activities Over Time")
+    st.write("Lorem ipsum explanation for Topic Activities Over Time.")
+    # 3. Topics Hierarchy
+    # col1, col2 = st.columns(2)
+    # with col1:
+    #     st.image("./src/Negative - Topics Hierarchy.png", caption="Negative - Topics Hierarchy")
+    # with col2:
+    #     st.image("./src/Positive - Topics Hierarchy.png", caption="Positive - Topics Hierarchy")
+    # st.write("Lorem ipsum explanation for Topics Hierarchy.")
+    # 4. Topic Weights
+    col1, col2 = st.columns(2)
+    with col1:
+        st.image("./src/Negative - Topics Weights.png", caption="Negative - Topic Weights")
+    with col2:
+        st.image("./src/Positive - Topics Weights.png", caption="Positive - Topic Weights")
+    st.write("Lorem ipsum explanation for Topics Weights.")
+# =============================================
+# Run Script
+# =============================================
+if __name__ == '__main__':
+    run()

src/fastopic_negative_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fba351cbeb9a08a89a53957b6b6234cf637ebfff4dec49b5ff16174e2f69885f
+size 114269121

src/fastopic_positive_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a06d7de2a2d378f4e8fcb90846607e83fc655b649bbb4590415acab297bd881d
+size 124508274

src/params.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:166628b4f0cd37e23ad24105f70b940c084aef6b368714a92e305576357ded45
+size 43

src/prediction_compile.py ADDED Viewed

	@@ -0,0 +1,194 @@

+# ============================================
+# Import Libraries
+# ============================================
+import streamlit as st
+import re
+import pickle
+import joblib
+import nltk
+import os
+import numpy as np
+import pandas as pd
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow import keras
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from huggingface_hub import hf_hub_download
+# ============================================
+# Setup NLTK
+# ============================================
+# NLTK resources are downloaded into a directory under /tmp, which is also
+# appended to NLTK's search path.
+nltk_data_path = os.path.join("/tmp", "nltk_data")
+os.makedirs(nltk_data_path, exist_ok=True)
+nltk.data.path.append(nltk_data_path)
+nltk.download("stopwords", download_dir=nltk_data_path)
+nltk.download("punkt", download_dir=nltk_data_path)
+# Recent NLTK releases make word_tokenize look up "punkt_tab" instead of "punkt";
+# nltk is unpinned in requirements.txt, so fetch both.
+nltk.download("punkt_tab", download_dir=nltk_data_path)
+# ============================================
+# Loading Info
+# ============================================
+# This banner is emitted at module top level, so it runs when the module is
+# imported, before run() is ever called.
+st.markdown(
+    '<p style="color:gray; font-size:14px; font-style:italic;">'
+    'Loading models (≈200 MB) and resources... this may take a while on first run. '
+    'Please be patient and DO NOT refresh the page :)'
+    '</p>',
+    unsafe_allow_html=True
+)
+# ============================================
+# Hugging Face Hub Repo
+# ============================================
+# Hub repository that every hf_hub_download call in this module reads from.
+repo_id = "BesottenJenny/acre-sentiment-models"
+# ============================================
+# Cached Loading Functions
+# ============================================
+@st.cache_resource
+def load_sentiment_model():
+    """Download ``best_model.keras`` from the Hub repo and return the loaded Keras model."""
+    model_file = hf_hub_download(repo_id=repo_id, filename="best_model.keras")
+    sentiment_net = keras.models.load_model(model_file)
+    return sentiment_net
+@st.cache_resource
+def load_tokenizer_params():
+    """Download and unpickle ``tokenizer.pkl`` and ``params.pkl``; return them as (tokenizer, params)."""
+    # NOTE(review): pickle.load can execute code embedded in the file, so this is
+    # only safe while the Hub repo contents are trusted.
+    local_files = [
+        hf_hub_download(repo_id=repo_id, filename=artifact)
+        for artifact in ("tokenizer.pkl", "params.pkl")
+    ]
+    unpickled = []
+    for local_file in local_files:
+        with open(local_file, "rb") as handle:
+            unpickled.append(pickle.load(handle))
+    return tuple(unpickled)
+@st.cache_resource
+def load_topic_models():
+    """Download both topic-model pickles and return them as (negative_model, positive_model)."""
+    artifact_names = ("fastopic_negative_model.pkl", "fastopic_positive_model.pkl")
+    local_files = [hf_hub_download(repo_id=repo_id, filename=name) for name in artifact_names]
+    # Loaded with joblib in the same order as downloaded: negative first, then positive.
+    negative_model, positive_model = [joblib.load(local_file) for local_file in local_files]
+    return negative_model, positive_model
+# ============================================
+# Load all resources once
+# ============================================
+# These calls run at import time; the loaders are wrapped in @st.cache_resource.
+sentiment_model = load_sentiment_model()
+tokenizer, params = load_tokenizer_params()
+topic_model_neg, topic_model_pos = load_topic_models()
+# params is indexed by key here; max_len is later passed to pad_sequences as maxlen.
+max_len = params["max_len"]
+# ============================================
+# Preprocessing Function (NLTK)
+# ============================================
+# Negation words are removed from the stopword set, so they survive filtering.
+negations = {"not", "no", "never"}
+stpwrds_en = set(stopwords.words("english")).difference(negations)
+stemmer = PorterStemmer()
+# Token-level substitutions applied before stopword removal and stemming.
+replacements = dict(
+    sia="sq",
+    flown="fly",
+    flew="fly",
+    alway="always",
+    boarding="board",
+    told="tell",
+    said="say",
+    booked="book",
+    paid="pay",
+    well="good",
+    aircraft="plane",
+)
+def text_preprocessing(text):
+    """Normalise a review string into space-joined stemmed tokens.
+    Lower-cases the text, blanks out literal backslash-n sequences and every
+    character outside ``a-z``, ``0-9`` and whitespace, tokenizes, applies
+    ``replacements``, drops stopwords and stems what is left. Returns
+    ``"emptytext"`` when no token survives.
+    """
+    # r"\\n" matches a backslash followed by the letter n, not a newline character
+    lowered = re.sub(r"\\n", " ", text.lower()).strip()
+    cleaned = re.sub(r'[^a-z0-9\s]', ' ', lowered)
+    stems = []
+    for token in word_tokenize(cleaned):
+        token = replacements.get(token, token)
+        if token in stpwrds_en:
+            continue
+        stems.append(stemmer.stem(token))
+    return ' '.join(stems) if stems else "emptytext"
+# ============================================
+# Streamlit App
+# ============================================
+def _infer_topics(topic_model, text):
+    """Return (topic_ids, probabilities) for one document, whatever shape transform() returns."""
+    result = topic_model.transform([text])
+    if isinstance(result, tuple):
+        # (topics, probs) pair, the form the original unpacking expected
+        topics, probs = result
+        return topics, np.asarray(probs)
+    # Single doc-topic array: take the highest-weighted topic of each row
+    probs = np.asarray(result)
+    return probs.argmax(axis=-1).tolist(), probs
+def run():
+    """Render the prediction page.
+    Shows an input form; on submit it echoes the inputs, preprocesses the review
+    text, predicts its sentiment with ``sentiment_model`` and then runs the topic
+    model matching the predicted sentiment on the raw review text.
+    """
+    st.title("ACRE - Automated Customer Review Analysis")
+    st.subheader("Sentiment & Topic Prediction for SQ Customer Reviews")
+    st.markdown(
+        """
+        This section will help you understand how the **ACRE** system works.
+        Simply fill in the form below with either a dummy or real customer review, and the system will:
+        1. **Preprocess** your review text (cleaning, tokenization, and stemming).
+        2. **Predict sentiment** (Positive or Negative) along with a confidence score.
+        3. **Identify the most relevant topic** associated with the review, based on the predicted sentiment.
+        Use this tool to simulate how Singapore Airlines can transform raw customer feedback into **structured, data-driven insights**.
+        """
+    )
+    with st.form(key='SQ-sentiment-analysis'):
+        date = st.date_input("Review Date")
+        platform = st.selectbox('Review Platform', ('Mobile', 'Desktop'), index=0)
+        rating = st.number_input('Rating', min_value=0, max_value=5, value=3, step=1)
+        st.markdown('---')
+        text = st.text_input('Customer Review', value='--customer review--')
+        title = st.text_input('Review Title', value='--review title--')
+        vote = st.slider('Helpful Vote', min_value=0, max_value=200, value=50, step=1)
+        st.markdown('---')
+        submitted = st.form_submit_button('Predict')
+    if not submitted:
+        return
+    st.markdown("---")
+    st.write("### Input Data")
+    data_inf = {
+        'published_date': date,
+        'published_platform': platform,
+        'rating': rating,
+        'type': 'Review',
+        'text': text,
+        'title': title,
+        'helpful_votes': vote
+    }
+    st.dataframe(pd.DataFrame([data_inf]))
+    # Preprocess
+    processed = text_preprocessing(text)
+    seq = tokenizer.texts_to_sequences([processed])
+    padded = pad_sequences(seq, maxlen=max_len, padding="post", truncating="post")
+    # Sentiment Prediction
+    # NOTE(review): argmax over axis 1 assumes the model emits one score per class
+    # (index 0 = Negative, 1 = Positive) - confirm against the trained model.
+    pred_probs = sentiment_model.predict(padded)
+    pred_class = np.argmax(pred_probs, axis=1)[0]
+    confidence = float(np.max(pred_probs))
+    label_map = {0: "Negative", 1: "Positive"}
+    sentiment_label = label_map[pred_class]
+    is_negative = sentiment_label == "Negative"
+    color = "red" if is_negative else "green"
+    st.write("### Sentiment Prediction")
+    st.markdown(f"<h3 style='color:{color};'>Predicted Sentiment: {sentiment_label}</h3>", unsafe_allow_html=True)
+    st.write(f"**Confidence:** {confidence:.2f}")
+    # Topic Prediction: the model is chosen by the predicted sentiment
+    st.write("### Topic Modeling")
+    topic_model = topic_model_neg if is_negative else topic_model_pos
+    topics, probs = _infer_topics(topic_model, text)
+    st.write("**Using Negative Model**" if is_negative else "**Using Positive Model**")
+    st.markdown(f"<p style='color:{color};'>Topic ID(s): {topics}</p>", unsafe_allow_html=True)
+    st.write(f"**Probabilities:** {probs.tolist()}")
+# ============================================
+# Run App
+# ============================================
+if __name__ == "__main__":
+    run()

src/singapore_airlines_reviews.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

src/tokenizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:814c12639a83d0262d298a91e1aeb404f281dbebd1224b620deea0b36eeb5ad3
+size 453750