Sentiment_Analysis_And_Topic_Modelling

Sleeping

App Files Community

hanantonio committed on Sep 10, 2025

Commit

791da3d

verified ·

1 Parent(s): de96184

Upload 21 files

Browse files

Files changed (5) hide show

src/Dockerfile +2 -6
src/requirements.txt +4 -8
src/src/app.py +46 -52
src/src/eda.py +31 -61
src/src/prediction_compile.py +26 -91

src/Dockerfile CHANGED Viewed

@@ -8,17 +8,13 @@ RUN apt-get update && apt-get install -y \
     git \
     && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt ./
 COPY src/ ./src/
-RUN pip install --upgrade pip
 RUN pip install --upgrade pip
 RUN pip install -r requirements.txt
 RUN pip install fastopic==1.0.1 --no-deps
 RUN pip install topmost==1.0.2 --no-deps
 EXPOSE 8501
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]

     git \
     && rm -rf /var/lib/apt/lists/*
+COPY requirements.txt ./
 COPY src/ ./src/
 RUN pip install --upgrade pip
 RUN pip install -r requirements.txt
 RUN pip install fastopic==1.0.1 --no-deps
 RUN pip install topmost==1.0.2 --no-deps
 EXPOSE 8501
+ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]

src/requirements.txt CHANGED Viewed

@@ -1,22 +1,18 @@
-pip>=25.2
 torch==2.5.0
 torchvision==0.20.0
 streamlit==1.44.0
 pandas==2.2.3
-seaborn
 matplotlib
 plotly
 pillow
-numpy
 wordcloud
-# Install fastopic without pulling torch==1.11.0
-fastopic==1.0.1 --no-deps
-# Install topmost without pulling old torch deps
-topmost==1.0.2 --no-deps
 gensim==4.3.3
 joblib==1.2.0
 scikit-learn==1.5.2
-tensorflow==2.17.1
 nltk

 torch==2.5.0
 torchvision==0.20.0
+tensorflow==2.17.1
 streamlit==1.44.0
 pandas==2.2.3
+numpy
 matplotlib
+seaborn
 plotly
 pillow
 wordcloud
 gensim==4.3.3
 joblib==1.2.0
 scikit-learn==1.5.2
 nltk
+# fastopic and topmost are intentionally not listed here; the Dockerfile installs them separately with --no-deps

src/src/app.py CHANGED Viewed

@@ -1,60 +1,54 @@
-# import streamlit as st
-# import eda
-# import prediction_src
-# ===============================
-# SQ_streamlit_app.py
-# ===============================
 import streamlit as st
-# ===============================
-# Streamlit Config
-# ===============================
 st.set_page_config(
     page_title='ACRE - Automated Customer Review Analysis',
     layout='wide',
     initial_sidebar_state='expanded'
 )
-# st.markdown(
-#     """
-#     **ACRE** (Automated Customer Reviews Analysis) is a system designed to classify customer sentiment towards
-#     their flight experience with Singapore Airlines (SQ). It transforms raw customer feedback into structured insights,
-#     empowering management to make data-driven decisions and continuously enhance SQ’s reputation for service excellence.
-#     """
-# )
-# Import custom pages (pastikan tidak ada st.* di global scope modul ini)
-import eda
-import prediction_compile
-# ===============================
-# Sidebar Navigation
-# ===============================
-page = st.sidebar.selectbox(
-    'Select Page:',
-    ('Exploratory Data Analysis (EDA)', 'Prediction')
-)
-# ===============================
-# Page Content
-# ===============================
-# st.title("ACRE - Automated Customer Review Analysis")
-if page == 'Exploratory Data Analysis (EDA)':
-    eda.run()
-else:
-    prediction_compile.run()
-# ===============================
-# Footer
-# ===============================
-st.markdown(
-    """
-    <div style="text-align: center; color: gray; font-size: 12px; margin-top: 50px;">
-        © 2025 Hana Antonio, Muhammad Revi Gilang Pradana, Zhaky B. Triaji. All rights reserved. <br>
-        References: Dataset from <a href="https://www.kaggle.com" target="_blank" style="color: gray;">Kaggle</a>
-    </div>
-    """,
-    unsafe_allow_html=True
-)

 import streamlit as st
+import os
+# =============================================
+# Base directory for relative paths
+# =============================================
+BASE_DIR = os.path.dirname(__file__)
+# =============================================
+# Streamlit Page Config
+# =============================================
 st.set_page_config(
     page_title='ACRE - Automated Customer Review Analysis',
     layout='wide',
     initial_sidebar_state='expanded'
 )
+# =============================================
+# Import custom modules inside run() to avoid
+# global loading issues
+# =============================================
+def run():
+    # Import pages
+    from eda import run as eda_run
+    from prediction_compile import run as prediction_run
+    # Sidebar navigation
+    page = st.sidebar.selectbox(
+        'Select Page:',
+        ('Exploratory Data Analysis (EDA)', 'Prediction')
+    )
+    # Page content
+    if page == 'Exploratory Data Analysis (EDA)':
+        eda_run()
+    else:
+        prediction_run()
+    # Footer
+    st.markdown(
+        """
+        <div style="text-align: center; color: gray; font-size: 12px; margin-top: 50px;">
+            © 2025 Hana Antonio, Muhammad Revi Gilang Pradana, Zhaky B. Triaji. All rights reserved. <br>
+            References: Dataset from <a href="https://www.kaggle.com" target="_blank" style="color: gray;">Kaggle</a>
+        </div>
+        """,
+        unsafe_allow_html=True
+    )
+# =============================================
+# Run app
+# =============================================
+if __name__ == "__main__":
+    run()

src/src/eda.py CHANGED Viewed

@@ -4,118 +4,88 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 import plotly.express as px
 from PIL import Image
 # =============================================
-# Cache dataset agar tidak reload setiap kali
 # =============================================
 @st.cache_data
 def load_data():
-    df = pd.read_csv('./src/singapore_airlines_reviews.csv')
     return df
 # Load dataset
 df = load_data()
 # =============================================
-# Main app
 # =============================================
 def run():
-    # Judul dan Subjudul
     st.title("ACRE - Automated Customer Review Analysis")
     st.subheader("Exploratory Data Analysis (EDA)")
     st.markdown(
         """
         This section provides an exploratory data analysis (EDA) of Singapore Airlines (SQ) customer reviews.
-        We aim to understand the distribution of ratings, textual review characteristics, and topic modeling results.
-        These insights serve as the foundation for building automated models that classify sentiment and uncover key themes
-        in customer feedback.
         """
     )
-    # ===============================
-    # Dataset Preview
-    # ===============================
     st.write("### Dataset Preview")
     st.dataframe(df.head())
-    # ===============================
-    # Distribusi Rating
-    # ===============================
     st.write("### Distribution of Ratings")
     fig, ax = plt.subplots(figsize=(8, 5))
-    sns.countplot(x='rating', data=df, palette='viridis', ax=ax,
-                  order=sorted(df['rating'].unique()))
     for p in ax.patches:
         height = p.get_height()
         ax.annotate(f'{height:,}', (p.get_x() + p.get_width()/2, height),
                     ha='center', va='bottom', fontsize=10, fontweight='bold')
     st.pyplot(fig)
-    st.markdown(
-        """
-        **Note:** Ratings are explored here only as descriptive information about passenger experiences.
-        In the inference page, actual sentiment will be predicted automatically from the review text using NLP techniques.
-        """
-    )
-    # ===============================
-    # Analisis Panjang Teks
-    # ===============================
     st.write("### Distribution of Review Length")
     df['text_length'] = df['text'].apply(lambda x: len(str(x).split()))
     fig = px.histogram(df, x='text_length', nbins=50, title='Review Length Distribution')
     st.plotly_chart(fig, use_container_width=True)
-    # Wordcloud
     col1, col2 = st.columns(2)
     with col1:
-        st.image("./src/Negative - Wordcloud.png", caption="Negative - Wordcloud")
     with col2:
-        st.image("./src/Positive - Wordcloud.png", caption="Positive - Wordcloud")
-    st.write("Lorem ipsum explanation for Wordcloud.")
-    # ===============================
-    # Topic Modeling Results (Images)
-    # ===============================
     st.write("## Topic Modeling Results")
-    # 1. Top Words Distributions
     col1, col2 = st.columns(2)
     with col1:
-        st.image("./src/Negative - Top Words Distributions.png", caption="Negative - Top Words Distributions")
     with col2:
-        st.image("./src/Positive - Top Words Distributions.png", caption="Positive - Top Words Distributions")
-    # st.write("Lorem ipsum explanation for Top Words Distributions.")
-    # 2. Topic Activities Over Time
     col1, col2 = st.columns(2)
     with col1:
-        st.image("./src/Negative - Topic Activities Over Time.png", caption="Negative - Topic Activities Over Time")
     with col2:
-        st.image("./src/Positive - Topic Activities Over Time.png", caption="Positive - Topic Activities Over Time")
-    # st.write("Lorem ipsum explanation for Topic Activities Over Time.")
-    # 3. Topics Hierarchy
-    # col1, col2 = st.columns(2)
-    # with col1:
-    #     st.image("./src/Negative - Topics Hierarchy.png", caption="Negative - Topics Hierarchy")
-    # with col2:
-    #     st.image("./src/Positive - Topics Hierarchy.png", caption="Positive - Topics Hierarchy")
-    # st.write("Lorem ipsum explanation for Topics Hierarchy.")
-    # 4. Topic Weights
     col1, col2 = st.columns(2)
     with col1:
-        st.image("./src/Negative - Topics Weights.png", caption="Negative - Topic Weights")
     with col2:
-        st.image("./src/Positive - Topics Weights.png", caption="Positive - Topic Weights")
-    # st.write("Lorem ipsum explanation for Topics Weights.")
-# =============================================
-# Run Script
-# =============================================
-if __name__ == '__main__':
-    run()

 import seaborn as sns
 import plotly.express as px
 from PIL import Image
+import os
 # =============================================
+# Base directory (works in container)
+# =============================================
+BASE_DIR = os.path.dirname(__file__)
+# =============================================
+# Cache dataset to avoid reload every time
 # =============================================
 @st.cache_data
 def load_data():
+    csv_path = os.path.join(BASE_DIR, 'singapore_airlines_reviews.csv')
+    df = pd.read_csv(csv_path)
     return df
 # Load dataset
 df = load_data()
 # =============================================
+# Main EDA function
 # =============================================
 def run():
     st.title("ACRE - Automated Customer Review Analysis")
     st.subheader("Exploratory Data Analysis (EDA)")
     st.markdown(
         """
         This section provides an exploratory data analysis (EDA) of Singapore Airlines (SQ) customer reviews.
+        We aim to understand the distribution of ratings, textual review characteristics, and topic modeling results.
         """
     )
+    # Dataset preview
     st.write("### Dataset Preview")
     st.dataframe(df.head())
+    # Distribution of ratings
     st.write("### Distribution of Ratings")
     fig, ax = plt.subplots(figsize=(8, 5))
+    sns.countplot(
+        x='rating',
+        data=df,
+        palette='viridis',
+        ax=ax,
+        order=sorted(df['rating'].unique())
+    )
     for p in ax.patches:
         height = p.get_height()
         ax.annotate(f'{height:,}', (p.get_x() + p.get_width()/2, height),
                     ha='center', va='bottom', fontsize=10, fontweight='bold')
     st.pyplot(fig)
+    # Distribution of review length
     st.write("### Distribution of Review Length")
     df['text_length'] = df['text'].apply(lambda x: len(str(x).split()))
     fig = px.histogram(df, x='text_length', nbins=50, title='Review Length Distribution')
     st.plotly_chart(fig, use_container_width=True)
+    # Wordclouds
     col1, col2 = st.columns(2)
     with col1:
+        st.image(os.path.join(BASE_DIR, "Negative - Wordcloud.png"), caption="Negative - Wordcloud")
     with col2:
+        st.image(os.path.join(BASE_DIR, "Positive - Wordcloud.png"), caption="Positive - Wordcloud")
+    # Topic Modeling Results
     st.write("## Topic Modeling Results")
     col1, col2 = st.columns(2)
     with col1:
+        st.image(os.path.join(BASE_DIR, "Negative - Top Words Distributions.png"), caption="Negative - Top Words Distributions")
     with col2:
+        st.image(os.path.join(BASE_DIR, "Positive - Top Words Distributions.png"), caption="Positive - Top Words Distributions")
     col1, col2 = st.columns(2)
     with col1:
+        st.image(os.path.join(BASE_DIR, "Negative - Topic Activities Over Time.png"), caption="Negative - Topic Activities Over Time")
     with col2:
+        st.image(os.path.join(BASE_DIR, "Positive - Topic Activities Over Time.png"), caption="Positive - Topic Activities Over Time")
     col1, col2 = st.columns(2)
     with col1:
+        st.image(os.path.join(BASE_DIR, "Negative - Topics Weights.png"), caption="Negative - Topics Weights")
     with col2:
+        st.image(os.path.join(BASE_DIR, "Positive - Topics Weights.png"), caption="Positive - Topics Weights")

src/src/prediction_compile.py CHANGED Viewed

@@ -1,12 +1,9 @@
-# ============================================
-# Import Libraries
-# ============================================
 import streamlit as st
 import re
 import pickle
 import joblib
 import nltk
-import os
 import numpy as np
 import pandas as pd
 from tensorflow.keras.preprocessing.sequence import pad_sequences
@@ -16,34 +13,23 @@ from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer
 from huggingface_hub import hf_hub_download
-# ============================================
-# Setup NLTK
-# ============================================
 nltk_data_path = os.path.join("/tmp", "nltk_data")
 os.makedirs(nltk_data_path, exist_ok=True)
 nltk.data.path.append(nltk_data_path)
 nltk.download("stopwords", download_dir=nltk_data_path)
 nltk.download("punkt", download_dir=nltk_data_path)
-# ============================================
-# Loading Info
-# ============================================
-st.markdown(
-    '<p style="color:gray; font-size:14px; font-style:italic;">'
-    'Loading models (≈200 MB) and resources... this may take a while on first run. '
-    'Please be patient and DO NOT refresh the page :)'
-    '</p>',
-    unsafe_allow_html=True
-)
-# ============================================
-# Hugging Face Hub Repo
-# ============================================
 repo_id = "BesottenJenny/acre-sentiment-models"
-# ============================================
-# Cached Loading Functions
-# ============================================
 @st.cache_resource
 def load_sentiment_model():
     path = hf_hub_download(repo_id=repo_id, filename="best_model.keras")
@@ -67,18 +53,15 @@ def load_topic_models():
     pos_model = joblib.load(pos_path)
     return neg_model, pos_model
-# ============================================
-# Load all resources once
-# ============================================
 sentiment_model = load_sentiment_model()
 tokenizer, params = load_tokenizer_params()
 topic_model_neg, topic_model_pos = load_topic_models()
 max_len = params["max_len"]
-# ============================================
-# Preprocessing Function (NLTK)
-# ============================================
 negations = {"not", "no", "never"}
 stpwrds_en = set(stopwords.words("english")) - negations
 stemmer = PorterStemmer()
@@ -99,96 +82,48 @@ replacements = {
 def text_preprocessing(text):
     text = text.lower()
-    text = re.sub(r"\\n", " ", text)
     text = text.strip()
     text = re.sub(r'[^a-z0-9\s]', ' ', text)
     tokens = word_tokenize(text)
     tokens = [replacements.get(word, word) for word in tokens]
     tokens = [word for word in tokens if word not in stpwrds_en]
     tokens = [stemmer.stem(word) for word in tokens]
-    if len(tokens) == 0:
-        return "emptytext"
-    return ' '.join(tokens)
-# ============================================
 # Streamlit App
-# ============================================
 def run():
     st.title("ACRE - Automated Customer Review Analysis")
     st.subheader("Sentiment & Topic Prediction for SQ Customer Reviews")
     st.markdown(
-        """
-        This section will help you understand how the **ACRE** system works.
-        Simply fill in the form below with either a dummy or real customer review, and the system will:
-        1. **Preprocess** your review text (cleaning, tokenization, and stemming).
-        2. **Predict sentiment** (Positive or Negative) along with a confidence score.
-        3. **Identify the most relevant topic** associated with the review, based on the predicted sentiment.
-        Use this tool to simulate how Singapore Airlines can transform raw customer feedback into **structured, data-driven insights**.
-        """
     )
-    with st.form(key='SQ-sentiment-analysis'):
-        date = st.date_input("Review Date")
-        platform = st.selectbox('Review Platform', ('Mobile', 'Desktop'), index=0)
-        rating = st.number_input('Rating', min_value=0, max_value=5, value=3, step=1)
-        st.markdown('---')
-        text = st.text_input('Customer Review', value='--customer review--')
-        title = st.text_input('Review Title', value='--review title--')
-        vote = st.slider('Helpful Vote', min_value=0, max_value=200, value=50, step=1)
-        st.markdown('---')
-        submitted = st.form_submit_button('Predict')
     if submitted:
-        st.markdown("---")
-        st.write("### Input Data")
-        data_inf = {
-            'published_date': date,
-            'published_platform': platform,
-            'rating': rating,
-            'type': 'Review',
-            'text': text,
-            'title': title,
-            'helpful_votes': vote
-        }
-        st.dataframe(pd.DataFrame([data_inf]))
-        # Preprocess
         processed = text_preprocessing(text)
         seq = tokenizer.texts_to_sequences([processed])
         padded = pad_sequences(seq, maxlen=max_len, padding="post", truncating="post")
-        # Sentiment Prediction
         pred_probs = sentiment_model.predict(padded)
         pred_class = np.argmax(pred_probs, axis=1)[0]
         confidence = float(np.max(pred_probs))
         label_map = {0: "Negative", 1: "Positive"}
         sentiment_label = label_map[pred_class]
-        st.write("### Sentiment Prediction")
-        if sentiment_label == "Negative":
-            st.markdown(f"<h3 style='color:red;'>Predicted Sentiment: {sentiment_label}</h3>", unsafe_allow_html=True)
-        else:
-            st.markdown(f"<h3 style='color:green;'>Predicted Sentiment: {sentiment_label}</h3>", unsafe_allow_html=True)
-        st.write(f"**Confidence:** {confidence:.2f}")
-        # Topic Prediction
-        st.write("### Topic Modeling")
         if sentiment_label == "Negative":
             topics, probs = topic_model_neg.transform([text])
-            st.write("**Using Negative Model**")
-            st.markdown(f"<p style='color:red;'>Topic ID(s): {topics}</p>", unsafe_allow_html=True)
         else:
             topics, probs = topic_model_pos.transform([text])
-            st.write("**Using Positive Model**")
-            st.markdown(f"<p style='color:green;'>Topic ID(s): {topics}</p>", unsafe_allow_html=True)
         st.write(f"**Probabilities:** {probs.tolist()}")
-# ============================================
-# Run App
-# ============================================
-if __name__ == "__main__":
-    run()

 import streamlit as st
+import os
 import re
 import pickle
 import joblib
 import nltk
 import numpy as np
 import pandas as pd
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 from nltk.stem import PorterStemmer
 from huggingface_hub import hf_hub_download
+# =============================================
+# Setup NLTK with container-safe path
+# =============================================
 nltk_data_path = os.path.join("/tmp", "nltk_data")
 os.makedirs(nltk_data_path, exist_ok=True)
 nltk.data.path.append(nltk_data_path)
 nltk.download("stopwords", download_dir=nltk_data_path)
 nltk.download("punkt", download_dir=nltk_data_path)
+# =============================================
+# HF Hub repo
+# =============================================
 repo_id = "BesottenJenny/acre-sentiment-models"
+# =============================================
+# Cached loading functions
+# =============================================
 @st.cache_resource
 def load_sentiment_model():
     path = hf_hub_download(repo_id=repo_id, filename="best_model.keras")
     pos_model = joblib.load(pos_path)
     return neg_model, pos_model
+# Load models
 sentiment_model = load_sentiment_model()
 tokenizer, params = load_tokenizer_params()
 topic_model_neg, topic_model_pos = load_topic_models()
 max_len = params["max_len"]
+# =============================================
+# Text preprocessing
+# =============================================
 negations = {"not", "no", "never"}
 stpwrds_en = set(stopwords.words("english")) - negations
 stemmer = PorterStemmer()
 def text_preprocessing(text):
     text = text.lower()
+    text = re.sub(r"\n", " ", text)
     text = text.strip()
     text = re.sub(r'[^a-z0-9\s]', ' ', text)
     tokens = word_tokenize(text)
     tokens = [replacements.get(word, word) for word in tokens]
     tokens = [word for word in tokens if word not in stpwrds_en]
     tokens = [stemmer.stem(word) for word in tokens]
+    return "emptytext" if len(tokens) == 0 else ' '.join(tokens)
+# =============================================
 # Streamlit App
+# =============================================
 def run():
     st.title("ACRE - Automated Customer Review Analysis")
     st.subheader("Sentiment & Topic Prediction for SQ Customer Reviews")
     st.markdown(
+        "Enter a customer review below to predict sentiment and topic."
     )
+    with st.form(key='review_form'):
+        text = st.text_area("Customer Review", value="--customer review--")
+        submitted = st.form_submit_button("Predict")
     if submitted:
         processed = text_preprocessing(text)
         seq = tokenizer.texts_to_sequences([processed])
         padded = pad_sequences(seq, maxlen=max_len, padding="post", truncating="post")
+        # Sentiment
         pred_probs = sentiment_model.predict(padded)
         pred_class = np.argmax(pred_probs, axis=1)[0]
         confidence = float(np.max(pred_probs))
         label_map = {0: "Negative", 1: "Positive"}
         sentiment_label = label_map[pred_class]
+        st.write(f"**Sentiment:** {sentiment_label} (Confidence: {confidence:.2f})")
+        # Topic Modeling
         if sentiment_label == "Negative":
             topics, probs = topic_model_neg.transform([text])
         else:
             topics, probs = topic_model_pos.transform([text])
+        st.write(f"**Topic ID(s):** {topics}")
         st.write(f"**Probabilities:** {probs.tolist()}")