hanantonio committed on
Commit
1dffecd
·
verified ·
1 Parent(s): 6a2eee4

Upload 3 files

Browse files
.gitattributes CHANGED
@@ -39,3 +39,5 @@ src/Positive[[:space:]]-[[:space:]]Topic[[:space:]]Activities[[:space:]]Over[[:s
39
  src/src/best_model.keras filter=lfs diff=lfs merge=lfs -text
40
  src/src/Negative[[:space:]]-[[:space:]]Topic[[:space:]]Activities[[:space:]]Over[[:space:]]Time.png filter=lfs diff=lfs merge=lfs -text
41
  src/src/Positive[[:space:]]-[[:space:]]Topic[[:space:]]Activities[[:space:]]Over[[:space:]]Time.png filter=lfs diff=lfs merge=lfs -text
 
 
 
39
  src/src/best_model.keras filter=lfs diff=lfs merge=lfs -text
40
  src/src/Negative[[:space:]]-[[:space:]]Topic[[:space:]]Activities[[:space:]]Over[[:space:]]Time.png filter=lfs diff=lfs merge=lfs -text
41
  src/src/Positive[[:space:]]-[[:space:]]Topic[[:space:]]Activities[[:space:]]Over[[:space:]]Time.png filter=lfs diff=lfs merge=lfs -text
42
+ src/Negative[[:space:]]-[[:space:]]Wordcloud.png filter=lfs diff=lfs merge=lfs -text
43
+ src/Positive[[:space:]]-[[:space:]]Wordcloud.png filter=lfs diff=lfs merge=lfs -text
src/Negative - Wordcloud.png ADDED

Git LFS Details

  • SHA256: 7a899012e1cee65f0d4f7a137f8b5e25c3d3cc8fc09bb814a768bfc6c3bf68ea
  • Pointer size: 131 Bytes
  • Size of remote file: 282 kB
src/Positive - Wordcloud.png ADDED

Git LFS Details

  • SHA256: a0e0d0c36a41c744070fe168978381c744e40e717efa1d69926b48f1e89548b0
  • Pointer size: 131 Bytes
  • Size of remote file: 308 kB
src/prediction_compile.py CHANGED
@@ -1,9 +1,11 @@
 
 
1
  import streamlit as st
2
- import os
3
  import re
4
  import pickle
5
  import joblib
6
  import nltk
 
7
  import numpy as np
8
  import pandas as pd
9
  from tensorflow.keras.preprocessing.sequence import pad_sequences
@@ -11,34 +13,33 @@ from tensorflow import keras
11
  from nltk.corpus import stopwords
12
  from nltk.tokenize import word_tokenize
13
  from nltk.stem import PorterStemmer
14
- from huggingface_hub import hf_hub_download
15
 
16
- # =============================================
17
- # Setup NLTK with container-safe path
18
- # =============================================
19
  nltk_data_path = os.path.join("/tmp", "nltk_data")
20
  os.makedirs(nltk_data_path, exist_ok=True)
21
  nltk.data.path.append(nltk_data_path)
22
  nltk.download("stopwords", download_dir=nltk_data_path)
23
  nltk.download("punkt", download_dir=nltk_data_path)
24
 
25
- # =============================================
26
- # HF Hub repo
27
- # =============================================
28
- repo_id = "BesottenJenny/acre-sentiment-models"
 
 
 
 
29
 
30
- # =============================================
31
- # Cached loading functions
32
- # =============================================
33
  @st.cache_resource
34
  def load_sentiment_model():
35
- path = hf_hub_download(repo_id=repo_id, filename="best_model.keras")
36
  return keras.models.load_model(path)
37
 
38
  @st.cache_resource
39
  def load_tokenizer_params():
40
- tokenizer_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.pkl")
41
- params_path = hf_hub_download(repo_id=repo_id, filename="params.pkl")
42
  with open(tokenizer_path, "rb") as f:
43
  tokenizer = pickle.load(f)
44
  with open(params_path, "rb") as f:
@@ -47,21 +48,20 @@ def load_tokenizer_params():
47
 
48
  @st.cache_resource
49
  def load_topic_models():
50
- neg_path = hf_hub_download(repo_id=repo_id, filename="fastopic_negative_model.pkl")
51
- pos_path = hf_hub_download(repo_id=repo_id, filename="fastopic_positive_model.pkl")
52
  neg_model = joblib.load(neg_path)
53
  pos_model = joblib.load(pos_path)
54
  return neg_model, pos_model
55
 
56
- # Load models
57
  sentiment_model = load_sentiment_model()
58
  tokenizer, params = load_tokenizer_params()
59
  topic_model_neg, topic_model_pos = load_topic_models()
 
60
  max_len = params["max_len"]
61
 
62
- # =============================================
63
- # Text preprocessing
64
- # =============================================
65
  negations = {"not", "no", "never"}
66
  stpwrds_en = set(stopwords.words("english")) - negations
67
  stemmer = PorterStemmer()
@@ -82,7 +82,7 @@ replacements = {
82
 
83
  def text_preprocessing(text):
84
  text = text.lower()
85
- text = re.sub(r"\n", " ", text)
86
  text = text.strip()
87
  text = re.sub(r'[^a-z0-9\s]', ' ', text)
88
  tokens = word_tokenize(text)
@@ -91,45 +91,86 @@ def text_preprocessing(text):
91
  tokens = [stemmer.stem(word) for word in tokens]
92
  return "emptytext" if len(tokens) == 0 else ' '.join(tokens)
93
 
94
- # =============================================
95
- # Streamlit App
96
- # =============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  def run():
98
- st.title("ACRE - Automated Customer Review Analysis")
99
  st.subheader("Sentiment & Topic Prediction for SQ Customer Reviews")
 
100
  st.markdown(
101
- "Enter a customer review below to predict sentiment and topic."
 
 
102
  )
103
 
104
- with st.form(key='review_form'):
105
- text = st.text_area("Customer Review", value="--customer review--")
106
- submitted = st.form_submit_button("Predict")
107
 
108
  if submitted:
 
109
  processed = text_preprocessing(text)
110
  seq = tokenizer.texts_to_sequences([processed])
111
  padded = pad_sequences(seq, maxlen=max_len, padding="post", truncating="post")
112
 
113
- # Sentiment
114
  pred_probs = sentiment_model.predict(padded)
115
- pred_class = np.argmax(pred_probs, axis=1)[0]
116
- confidence = float(np.max(pred_probs))
117
- label_map = {0: "Negative", 1: "Positive"}
118
- sentiment_label = label_map[pred_class]
119
-
120
- st.write(f"**Sentiment:** {sentiment_label} (Confidence: {confidence:.2f})")
121
-
122
- # Topic Modeling
123
- result = topic_model_neg.transform([text]) if sentiment_label == "Negative" else topic_model_pos.transform([text])
124
-
125
- if isinstance(result, tuple) and len(result) == 2:
126
- topics, probs = result
127
- st.write(f"**Topic ID(s):** {topics}")
128
- st.write(f"**Probabilities:** {probs.tolist()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  else:
130
- topics = result
131
- st.write(f"**Topic ID(s):** {topics}")
132
- st.write("**Probabilities:** Not available")
 
 
 
 
 
 
 
 
 
133
 
134
  if __name__ == "__main__":
135
  run()
 
1
+ # prediction_compile.py
2
+ # Import Libraries
3
  import streamlit as st
 
4
  import re
5
  import pickle
6
  import joblib
7
  import nltk
8
+ import os
9
  import numpy as np
10
  import pandas as pd
11
  from tensorflow.keras.preprocessing.sequence import pad_sequences
 
13
  from nltk.corpus import stopwords
14
  from nltk.tokenize import word_tokenize
15
  from nltk.stem import PorterStemmer
 
16
 
17
# --- Setup NLTK (container-safe, writable download location) ---
# /tmp is writable inside the hosting container, unlike the default NLTK home.
nltk_data_path = os.path.join("/tmp", "nltk_data")
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)

# Fetch the corpora the preprocessing pipeline depends on.
for corpus in ("stopwords", "punkt"):
    nltk.download(corpus, download_dir=nltk_data_path)
23
 
24
# --- Loading Info banner (shown while the cached resources below warm up) ---
_loading_notice = (
    '<p style="color:gray; font-size:14px; font-style:italic;">'
    'Loading models and resources from local storage... '
    'Please be patient and DO NOT refresh the page :)'
    '</p>'
)
st.markdown(_loading_notice, unsafe_allow_html=True)
32
 
33
# --- Cached Loading Functions ---
@st.cache_resource
def load_sentiment_model():
    """Load the Keras sentiment classifier from local storage.

    st.cache_resource deserializes the model once per server session
    instead of on every Streamlit rerun.
    """
    model_path = "./src/best_model.keras"
    return keras.models.load_model(model_path)
38
 
39
  @st.cache_resource
40
  def load_tokenizer_params():
41
+ tokenizer_path = "./src/tokenizer.pkl"
42
+ params_path = "./src/params.pkl"
43
  with open(tokenizer_path, "rb") as f:
44
  tokenizer = pickle.load(f)
45
  with open(params_path, "rb") as f:
 
48
 
49
@st.cache_resource
def load_topic_models():
    """Load the negative- and positive-review topic models (cached once).

    Returns:
        tuple: (negative_model, positive_model), both joblib-deserialized.
    """
    model_paths = {
        "neg": "./src/fastopic_negative_model.pkl",
        "pos": "./src/fastopic_positive_model.pkl",
    }
    neg_model = joblib.load(model_paths["neg"])
    pos_model = joblib.load(model_paths["pos"])
    return neg_model, pos_model
56
 
57
# --- Load all resources once (the cache_resource wrappers make reruns cheap) ---
tokenizer, params = load_tokenizer_params()
sentiment_model = load_sentiment_model()
topic_model_neg, topic_model_pos = load_topic_models()

# Token-sequence length the sentiment model was trained with; used for padding.
max_len = params["max_len"]
63
 
64
# --- Preprocessing resources (shared by text_preprocessing) ---
# Negation words flip sentiment, so they must survive stopword removal.
negations = {"not", "no", "never"}
stpwrds_en = {w for w in stopwords.words("english") if w not in negations}
stemmer = PorterStemmer()
 
82
 
83
  def text_preprocessing(text):
84
  text = text.lower()
85
+ text = re.sub(r"\\n", " ", text)
86
  text = text.strip()
87
  text = re.sub(r'[^a-z0-9\s]', ' ', text)
88
  tokens = word_tokenize(text)
 
91
  tokens = [stemmer.stem(word) for word in tokens]
92
  return "emptytext" if len(tokens) == 0 else ' '.join(tokens)
93
 
94
# --- Topic Labels: 1-based topic index -> human-readable name, per sentiment ---
topic_labels_neg = dict(enumerate((
    "meal and entertainment service",
    "refund, cancellation, and booking tickets policy",
    "business class/premium facility",
    "baggage limits and price",
    "hidden charges",
), start=1))

topic_labels_pos = dict(enumerate((
    "good food and crew service",
    "excellent economy seat",
    "refund and cancellation policy",
    "meals quality",
    "accommodation and assistance",
), start=1))
110
+
111
# --- Streamlit App ---
def run():
    """Render the review form; on submit, show sentiment then topic predictions."""
    st.subheader("Sentiment & Topic Prediction for SQ Customer Reviews")

    st.markdown(
        """
        Enter a customer review below to predict sentiment and topic.
        """
    )

    # The form groups input + button so the script reruns only on submit.
    with st.form(key='SQ-sentiment-analysis'):
        review = st.text_input('Customer Review', value='--customer review--')
        submitted = st.form_submit_button('Predict')

    if not submitted:
        return

    # Preprocess
    cleaned = text_preprocessing(review)
    token_seq = tokenizer.texts_to_sequences([cleaned])
    padded_seq = pad_sequences(token_seq, maxlen=max_len, padding="post", truncating="post")

    # Sentiment Prediction — support both possible output-layer shapes.
    scores = sentiment_model.predict(padded_seq)
    if scores.shape[1] == 1:
        # Single sigmoid unit: the lone score is P(positive).
        positive_prob = float(scores[0][0])
        sentiment_label = "Positive" if positive_prob >= 0.5 else "Negative"
        confidence = max(positive_prob, 1 - positive_prob)
    else:
        # Softmax head: argmax selects the class, its score is the confidence.
        class_idx = np.argmax(scores, axis=1)[0]
        sentiment_label = {0: "Negative", 1: "Positive"}[class_idx]
        confidence = float(scores[0][class_idx])

    color = "green" if sentiment_label == "Positive" else "red"
    st.markdown(
        f"<p style='font-size:22px; font-weight:bold; color:{color};'>"
        f"Predicted Sentiment: {sentiment_label} "
        f"(Confidence: {confidence:.2f})</p>",
        unsafe_allow_html=True
    )

    # Topic Prediction — route to the model/labels matching the sentiment.
    # NOTE(review): the topic models receive the RAW review text while the
    # sentiment model gets the preprocessed text — presumably the topic models
    # do their own preprocessing; confirm this is intentional.
    st.write("### Topic Modeling")
    if sentiment_label == "Negative":
        topic_model, labels, note = topic_model_neg, topic_labels_neg, "**Using Negative Model**"
    else:
        topic_model, labels, note = topic_model_pos, topic_labels_pos, "**Using Positive Model**"

    probs = topic_model.transform([review])[0]
    topic_id = int(np.argmax(probs)) + 1
    topic_name = labels.get(topic_id, "Unknown Topic")
    st.write(note)

    # Output
    st.markdown(
        f"<p style='font-size:20px; font-weight:bold; color:{color};'>"
        f"Topic {topic_id}: {topic_name}</p>",
        unsafe_allow_html=True
    )
    st.write("**Probabilities:**", probs.tolist())


if __name__ == "__main__":
    run()