hanantonio committed on
Commit
60be2ab
·
verified ·
1 Parent(s): 8384ac8

Upload 18 files

Browse files
Files changed (3) hide show
  1. src/app.py +30 -58
  2. src/eda.py +35 -55
  3. src/prediction_compile.py +29 -93
src/app.py CHANGED
@@ -1,60 +1,32 @@
1
- # import streamlit as st
2
- # import eda
3
- # import prediction_src
4
-
5
- # ===============================
6
- # SQ_streamlit_app.py
7
- # ===============================
8
-
9
  import streamlit as st
10
 
11
- # ===============================
12
- # Streamlit Config
13
- # ===============================
14
- st.set_page_config(
15
- page_title='ACRE - Automated Customer Review Analysis',
16
- layout='wide',
17
- initial_sidebar_state='expanded'
18
- )
19
- # st.markdown(
20
- # """
21
- # **ACRE** (Automated Customer Reviews Analysis) is a system designed to classify customer sentiment towards
22
- # their flight experience with Singapore Airlines (SQ). It transforms raw customer feedback into structured insights,
23
- # empowering management to make data-driven decisions and continuously enhance SQ’s reputation for service excellence.
24
- # """
25
- # )
26
-
27
- # Import custom pages (pastikan tidak ada st.* di global scope modul ini)
28
- import eda
29
- import prediction_compile
30
-
31
- # ===============================
32
- # Sidebar Navigation
33
- # ===============================
34
- page = st.sidebar.selectbox(
35
- 'Select Page:',
36
- ('Exploratory Data Analysis (EDA)', 'Prediction')
37
- )
38
-
39
- # ===============================
40
- # Page Content
41
- # ===============================
42
- # st.title("ACRE - Automated Customer Review Analysis")
43
-
44
- if page == 'Exploratory Data Analysis (EDA)':
45
- eda.run()
46
- else:
47
- prediction_compile.run()
48
-
49
- # ===============================
50
- # Footer
51
- # ===============================
52
- st.markdown(
53
- """
54
- <div style="text-align: center; color: gray; font-size: 12px; margin-top: 50px;">
55
- © 2025 Hana Antonio, Muhammad Revi Gilang Pradana, Zhaky B. Triaji. All rights reserved. <br>
56
- References: Dataset from <a href="https://www.kaggle.com" target="_blank" style="color: gray;">Kaggle</a>
57
- </div>
58
- """,
59
- unsafe_allow_html=True
60
- )
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
 
3
+ def run():
4
+ # Import pages inside function to avoid global loading issues
5
+ from eda import run as eda_run
6
+ from prediction_compile import run as prediction_run
7
+
8
+ # Sidebar navigation
9
+ page = st.sidebar.selectbox(
10
+ 'Select Page:',
11
+ ('Exploratory Data Analysis (EDA)', 'Prediction')
12
+ )
13
+
14
+ # Show page
15
+ if page == 'Exploratory Data Analysis (EDA)':
16
+ eda_run()
17
+ else:
18
+ prediction_run()
19
+
20
+ # Footer
21
+ st.markdown(
22
+ """
23
+ <div style="text-align: center; color: gray; font-size: 12px; margin-top: 50px;">
24
+ © 2025 Hana Antonio, Muhammad Revi Gilang Pradana, Zhaky B. Triaji. All rights reserved. <br>
25
+ References: Dataset from <a href="https://www.kaggle.com" target="_blank" style="color: gray;">Kaggle</a>
26
+ </div>
27
+ """,
28
+ unsafe_allow_html=True
29
+ )
30
+
31
+ if __name__ == "__main__":
32
+ run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/eda.py CHANGED
@@ -4,108 +4,88 @@ import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  import plotly.express as px
6
  from PIL import Image
 
7
 
8
  # =============================================
9
- # Cache dataset agar tidak reload setiap kali
 
 
 
 
 
10
  # =============================================
11
  @st.cache_data
12
  def load_data():
13
- df = pd.read_csv('./src/singapore_airlines_reviews.csv')
 
14
  return df
15
 
16
  # Load dataset
17
  df = load_data()
18
 
19
  # =============================================
20
- # Main app
21
  # =============================================
22
  def run():
23
- # Judul dan Subjudul
24
  st.title("ACRE - Automated Customer Review Analysis")
25
  st.subheader("Exploratory Data Analysis (EDA)")
26
 
27
  st.markdown(
28
  """
29
  This section provides an exploratory data analysis (EDA) of Singapore Airlines (SQ) customer reviews.
30
- We aim to understand the distribution of ratings, textual review characteristics, and topic modeling results.
31
- These insights serve as the foundation for building automated models that classify sentiment and uncover key themes
32
- in customer feedback.
33
  """
34
  )
35
 
36
- # ===============================
37
- # Dataset Preview
38
- # ===============================
39
  st.write("### Dataset Preview")
40
  st.dataframe(df.head())
41
 
42
- # ===============================
43
- # Distribusi Rating
44
- # ===============================
45
  st.write("### Distribution of Ratings")
46
  fig, ax = plt.subplots(figsize=(8, 5))
47
- sns.countplot(x='rating', data=df, palette='viridis', ax=ax,
48
- order=sorted(df['rating'].unique()))
 
 
 
 
 
49
  for p in ax.patches:
50
  height = p.get_height()
51
  ax.annotate(f'{height:,}', (p.get_x() + p.get_width()/2, height),
52
  ha='center', va='bottom', fontsize=10, fontweight='bold')
53
  st.pyplot(fig)
54
 
55
- st.markdown(
56
- """
57
- **Note:** Ratings are explored here only as descriptive information about passenger experiences.
58
- In the inference page, actual sentiment will be predicted automatically from the review text using NLP techniques.
59
- """
60
- )
61
-
62
- # ===============================
63
- # Analisis Panjang Teks
64
- # ===============================
65
  st.write("### Distribution of Review Length")
66
  df['text_length'] = df['text'].apply(lambda x: len(str(x).split()))
67
  fig = px.histogram(df, x='text_length', nbins=50, title='Review Length Distribution')
68
  st.plotly_chart(fig, use_container_width=True)
69
 
70
- # ===============================
71
- # Topic Modeling Results (Images)
72
- # ===============================
73
- st.write("## Topic Modeling Results")
74
-
75
- # 1. Top Words Distributions
76
  col1, col2 = st.columns(2)
77
  with col1:
78
- st.image("./src/Negative - Top Words Distributions.png", caption="Negative - Top Words Distributions")
79
  with col2:
80
- st.image("./src/Positive - Top Words Distributions.png", caption="Positive - Top Words Distributions")
81
- st.write("Lorem ipsum explanation for Top Words Distributions.")
82
 
83
- # 2. Topic Activities Over Time
 
84
  col1, col2 = st.columns(2)
85
  with col1:
86
- st.image("./src/Negative - Topic Activities Over Time.png", caption="Negative - Topic Activities Over Time")
87
  with col2:
88
- st.image("./src/Positive - Topic Activities Over Time.png", caption="Positive - Topic Activities Over Time")
89
- st.write("Lorem ipsum explanation for Topic Activities Over Time.")
90
 
91
- # 3. Topics Hierarchy
92
- # col1, col2 = st.columns(2)
93
- # with col1:
94
- # st.image("./src/Negative - Topics Hierarchy.png", caption="Negative - Topics Hierarchy")
95
- # with col2:
96
- # st.image("./src/Positive - Topics Hierarchy.png", caption="Positive - Topics Hierarchy")
97
- # st.write("Lorem ipsum explanation for Topics Hierarchy.")
98
-
99
- # 4. Topic Weights
100
  col1, col2 = st.columns(2)
101
  with col1:
102
- st.image("./src/Negative - Topics Weights.png", caption="Negative - Topic Weights")
103
  with col2:
104
- st.image("./src/Positive - Topics Weights.png", caption="Positive - Topic Weights")
105
- st.write("Lorem ipsum explanation for Topics Weights.")
106
 
107
- # =============================================
108
- # Run Script
109
- # =============================================
110
- if __name__ == '__main__':
111
- run()
 
4
  import seaborn as sns
5
  import plotly.express as px
6
  from PIL import Image
7
+ import os
8
 
9
  # =============================================
10
+ # Base directory (works in container)
11
+ # =============================================
12
+ BASE_DIR = os.path.dirname(__file__)
13
+
14
+ # =============================================
15
+ # Cache dataset to avoid reload every time
16
  # =============================================
17
  @st.cache_data
18
  def load_data():
19
+ csv_path = os.path.join(BASE_DIR, 'singapore_airlines_reviews.csv')
20
+ df = pd.read_csv(csv_path)
21
  return df
22
 
23
  # Load dataset
24
  df = load_data()
25
 
26
  # =============================================
27
+ # Main EDA function
28
  # =============================================
29
  def run():
 
30
  st.title("ACRE - Automated Customer Review Analysis")
31
  st.subheader("Exploratory Data Analysis (EDA)")
32
 
33
  st.markdown(
34
  """
35
  This section provides an exploratory data analysis (EDA) of Singapore Airlines (SQ) customer reviews.
36
+ We aim to understand the distribution of ratings, textual review characteristics, and topic modeling results.
 
 
37
  """
38
  )
39
 
40
+ # Dataset preview
 
 
41
  st.write("### Dataset Preview")
42
  st.dataframe(df.head())
43
 
44
+ # Distribution of ratings
 
 
45
  st.write("### Distribution of Ratings")
46
  fig, ax = plt.subplots(figsize=(8, 5))
47
+ sns.countplot(
48
+ x='rating',
49
+ data=df,
50
+ palette='viridis',
51
+ ax=ax,
52
+ order=sorted(df['rating'].unique())
53
+ )
54
  for p in ax.patches:
55
  height = p.get_height()
56
  ax.annotate(f'{height:,}', (p.get_x() + p.get_width()/2, height),
57
  ha='center', va='bottom', fontsize=10, fontweight='bold')
58
  st.pyplot(fig)
59
 
60
+ # Distribution of review length
 
 
 
 
 
 
 
 
 
61
  st.write("### Distribution of Review Length")
62
  df['text_length'] = df['text'].apply(lambda x: len(str(x).split()))
63
  fig = px.histogram(df, x='text_length', nbins=50, title='Review Length Distribution')
64
  st.plotly_chart(fig, use_container_width=True)
65
 
66
+ # Wordclouds
 
 
 
 
 
67
  col1, col2 = st.columns(2)
68
  with col1:
69
+ st.image(os.path.join(BASE_DIR, "Negative - Wordcloud.png"), caption="Negative - Wordcloud")
70
  with col2:
71
+ st.image(os.path.join(BASE_DIR, "Positive - Wordcloud.png"), caption="Positive - Wordcloud")
 
72
 
73
+ # Topic Modeling Results
74
+ st.write("## Topic Modeling Results")
75
  col1, col2 = st.columns(2)
76
  with col1:
77
+ st.image(os.path.join(BASE_DIR, "Negative - Top Words Distributions.png"), caption="Negative - Top Words Distributions")
78
  with col2:
79
+ st.image(os.path.join(BASE_DIR, "Positive - Top Words Distributions.png"), caption="Positive - Top Words Distributions")
 
80
 
 
 
 
 
 
 
 
 
 
81
  col1, col2 = st.columns(2)
82
  with col1:
83
+ st.image(os.path.join(BASE_DIR, "Negative - Topic Activities Over Time.png"), caption="Negative - Topic Activities Over Time")
84
  with col2:
85
+ st.image(os.path.join(BASE_DIR, "Positive - Topic Activities Over Time.png"), caption="Positive - Topic Activities Over Time")
 
86
 
87
+ col1, col2 = st.columns(2)
88
+ with col1:
89
+ st.image(os.path.join(BASE_DIR, "Negative - Topics Weights.png"), caption="Negative - Topics Weights")
90
+ with col2:
91
+ st.image(os.path.join(BASE_DIR, "Positive - Topics Weights.png"), caption="Positive - Topics Weights")
src/prediction_compile.py CHANGED
@@ -1,12 +1,9 @@
1
- # ============================================
2
- # Import Libraries
3
- # ============================================
4
  import streamlit as st
 
5
  import re
6
  import pickle
7
  import joblib
8
  import nltk
9
- import os
10
  import numpy as np
11
  import pandas as pd
12
  from tensorflow.keras.preprocessing.sequence import pad_sequences
@@ -16,34 +13,23 @@ from nltk.tokenize import word_tokenize
16
  from nltk.stem import PorterStemmer
17
  from huggingface_hub import hf_hub_download
18
 
19
- # ============================================
20
- # Setup NLTK
21
- # ============================================
22
  nltk_data_path = os.path.join("/tmp", "nltk_data")
23
  os.makedirs(nltk_data_path, exist_ok=True)
24
  nltk.data.path.append(nltk_data_path)
25
  nltk.download("stopwords", download_dir=nltk_data_path)
26
  nltk.download("punkt", download_dir=nltk_data_path)
27
 
28
- # ============================================
29
- # Loading Info
30
- # ============================================
31
- st.markdown(
32
- '<p style="color:gray; font-size:14px; font-style:italic;">'
33
- 'Loading models (≈200 MB) and resources... this may take a while on first run. '
34
- 'Please be patient and DO NOT refresh the page :)'
35
- '</p>',
36
- unsafe_allow_html=True
37
- )
38
-
39
- # ============================================
40
- # Hugging Face Hub Repo
41
- # ============================================
42
  repo_id = "BesottenJenny/acre-sentiment-models"
43
 
44
- # ============================================
45
- # Cached Loading Functions
46
- # ============================================
47
  @st.cache_resource
48
  def load_sentiment_model():
49
  path = hf_hub_download(repo_id=repo_id, filename="best_model.keras")
@@ -67,18 +53,15 @@ def load_topic_models():
67
  pos_model = joblib.load(pos_path)
68
  return neg_model, pos_model
69
 
70
- # ============================================
71
- # Load all resources once
72
- # ============================================
73
  sentiment_model = load_sentiment_model()
74
  tokenizer, params = load_tokenizer_params()
75
  topic_model_neg, topic_model_pos = load_topic_models()
76
-
77
  max_len = params["max_len"]
78
 
79
- # ============================================
80
- # Preprocessing Function (NLTK)
81
- # ============================================
82
  negations = {"not", "no", "never"}
83
  stpwrds_en = set(stopwords.words("english")) - negations
84
  stemmer = PorterStemmer()
@@ -99,96 +82,49 @@ replacements = {
99
 
100
  def text_preprocessing(text):
101
  text = text.lower()
102
- text = re.sub(r"\\n", " ", text)
103
  text = text.strip()
104
  text = re.sub(r'[^a-z0-9\s]', ' ', text)
105
  tokens = word_tokenize(text)
106
  tokens = [replacements.get(word, word) for word in tokens]
107
  tokens = [word for word in tokens if word not in stpwrds_en]
108
  tokens = [stemmer.stem(word) for word in tokens]
109
- if len(tokens) == 0:
110
- return "emptytext"
111
- return ' '.join(tokens)
112
 
113
- # ============================================
114
  # Streamlit App
115
- # ============================================
116
  def run():
117
  st.title("ACRE - Automated Customer Review Analysis")
118
  st.subheader("Sentiment & Topic Prediction for SQ Customer Reviews")
119
  st.markdown(
120
- """
121
- This section will help you understand how the **ACRE** system works.
122
- Simply fill in the form below with either a dummy or real customer review, and the system will:
123
-
124
- 1. **Preprocess** your review text (cleaning, tokenization, and stemming).
125
- 2. **Predict sentiment** (Positive or Negative) along with a confidence score.
126
- 3. **Identify the most relevant topic** associated with the review, based on the predicted sentiment.
127
-
128
- Use this tool to simulate how Singapore Airlines can transform raw customer feedback into **structured, data-driven insights**.
129
- """
130
  )
131
 
132
- with st.form(key='SQ-sentiment-analysis'):
133
- date = st.date_input("Review Date")
134
- platform = st.selectbox('Review Platform', ('Mobile', 'Desktop'), index=0)
135
- rating = st.number_input('Rating', min_value=0, max_value=5, value=3, step=1)
136
- st.markdown('---')
137
- text = st.text_input('Customer Review', value='--customer review--')
138
- title = st.text_input('Review Title', value='--review title--')
139
- vote = st.slider('Helpful Vote', min_value=0, max_value=200, value=50, step=1)
140
- st.markdown('---')
141
- submitted = st.form_submit_button('Predict')
142
 
143
  if submitted:
144
- st.markdown("---")
145
- st.write("### Input Data")
146
- data_inf = {
147
- 'published_date': date,
148
- 'published_platform': platform,
149
- 'rating': rating,
150
- 'type': 'Review',
151
- 'text': text,
152
- 'title': title,
153
- 'helpful_votes': vote
154
- }
155
- st.dataframe(pd.DataFrame([data_inf]))
156
-
157
- # Preprocess
158
  processed = text_preprocessing(text)
159
  seq = tokenizer.texts_to_sequences([processed])
160
  padded = pad_sequences(seq, maxlen=max_len, padding="post", truncating="post")
161
 
162
- # Sentiment Prediction
163
  pred_probs = sentiment_model.predict(padded)
164
  pred_class = np.argmax(pred_probs, axis=1)[0]
165
  confidence = float(np.max(pred_probs))
166
-
167
  label_map = {0: "Negative", 1: "Positive"}
168
  sentiment_label = label_map[pred_class]
169
 
170
- st.write("### Sentiment Prediction")
171
- if sentiment_label == "Negative":
172
- st.markdown(f"<h3 style='color:red;'>Predicted Sentiment: {sentiment_label}</h3>", unsafe_allow_html=True)
173
- else:
174
- st.markdown(f"<h3 style='color:green;'>Predicted Sentiment: {sentiment_label}</h3>", unsafe_allow_html=True)
175
- st.write(f"**Confidence:** {confidence:.2f}")
176
 
177
- # Topic Prediction
178
- st.write("### Topic Modeling")
179
  if sentiment_label == "Negative":
180
- topics, probs = topic_model_neg.transform([text])
181
- st.write("**Using Negative Model**")
182
- st.markdown(f"<p style='color:red;'>Topic ID(s): {topics}</p>", unsafe_allow_html=True)
183
  else:
184
- topics, probs = topic_model_pos.transform([text])
185
- st.write("**Using Positive Model**")
186
- st.markdown(f"<p style='color:green;'>Topic ID(s): {topics}</p>", unsafe_allow_html=True)
187
 
 
 
188
  st.write(f"**Probabilities:** {probs.tolist()}")
189
-
190
- # ============================================
191
- # Run App
192
- # ============================================
193
- if __name__ == "__main__":
194
- run()
 
 
 
 
1
  import streamlit as st
2
+ import os
3
  import re
4
  import pickle
5
  import joblib
6
  import nltk
 
7
  import numpy as np
8
  import pandas as pd
9
  from tensorflow.keras.preprocessing.sequence import pad_sequences
 
13
  from nltk.stem import PorterStemmer
14
  from huggingface_hub import hf_hub_download
15
 
16
+ # =============================================
17
+ # Setup NLTK with container-safe path
18
+ # =============================================
19
  nltk_data_path = os.path.join("/tmp", "nltk_data")
20
  os.makedirs(nltk_data_path, exist_ok=True)
21
  nltk.data.path.append(nltk_data_path)
22
  nltk.download("stopwords", download_dir=nltk_data_path)
23
  nltk.download("punkt", download_dir=nltk_data_path)
24
 
25
+ # =============================================
26
+ # HF Hub repo
27
+ # =============================================
 
 
 
 
 
 
 
 
 
 
 
28
  repo_id = "BesottenJenny/acre-sentiment-models"
29
 
30
+ # =============================================
31
+ # Cached loading functions
32
+ # =============================================
33
  @st.cache_resource
34
  def load_sentiment_model():
35
  path = hf_hub_download(repo_id=repo_id, filename="best_model.keras")
 
53
  pos_model = joblib.load(pos_path)
54
  return neg_model, pos_model
55
 
56
+ # Load models
 
 
57
  sentiment_model = load_sentiment_model()
58
  tokenizer, params = load_tokenizer_params()
59
  topic_model_neg, topic_model_pos = load_topic_models()
 
60
  max_len = params["max_len"]
61
 
62
+ # =============================================
63
+ # Text preprocessing
64
+ # =============================================
65
  negations = {"not", "no", "never"}
66
  stpwrds_en = set(stopwords.words("english")) - negations
67
  stemmer = PorterStemmer()
 
82
 
83
  def text_preprocessing(text):
84
  text = text.lower()
85
+ text = re.sub(r"\n", " ", text)
86
  text = text.strip()
87
  text = re.sub(r'[^a-z0-9\s]', ' ', text)
88
  tokens = word_tokenize(text)
89
  tokens = [replacements.get(word, word) for word in tokens]
90
  tokens = [word for word in tokens if word not in stpwrds_en]
91
  tokens = [stemmer.stem(word) for word in tokens]
92
+ return "emptytext" if len(tokens) == 0 else ' '.join(tokens)
 
 
93
 
94
+ # =============================================
95
  # Streamlit App
96
+ # =============================================
97
  def run():
98
  st.title("ACRE - Automated Customer Review Analysis")
99
  st.subheader("Sentiment & Topic Prediction for SQ Customer Reviews")
100
  st.markdown(
101
+ "Enter a customer review below to predict sentiment and topic."
 
 
 
 
 
 
 
 
 
102
  )
103
 
104
+ with st.form(key='review_form'):
105
+ text = st.text_area("Customer Review", value="--customer review--")
106
+ submitted = st.form_submit_button("Predict")
 
 
 
 
 
 
 
107
 
108
  if submitted:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  processed = text_preprocessing(text)
110
  seq = tokenizer.texts_to_sequences([processed])
111
  padded = pad_sequences(seq, maxlen=max_len, padding="post", truncating="post")
112
 
113
+ # Sentiment
114
  pred_probs = sentiment_model.predict(padded)
115
  pred_class = np.argmax(pred_probs, axis=1)[0]
116
  confidence = float(np.max(pred_probs))
 
117
  label_map = {0: "Negative", 1: "Positive"}
118
  sentiment_label = label_map[pred_class]
119
 
120
+ st.write(f"**Sentiment:** {sentiment_label} (Confidence: {confidence:.2f})")
 
 
 
 
 
121
 
122
+ # Topic Modeling
 
123
  if sentiment_label == "Negative":
124
+ result = topic_model_neg.transform([text])
 
 
125
  else:
126
+ result = topic_model_pos.transform([text])
 
 
127
 
128
+ topics, probs = result
129
+ st.write(f"**Topic ID(s):** {topics}")
130
  st.write(f"**Probabilities:** {probs.tolist()}")