MienOlle committed on
Commit
1258ed0
·
verified ·
1 Parent(s): 44143a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +260 -260
app.py CHANGED
@@ -1,261 +1,261 @@
1
- import numpy as np
2
- import pandas as pd
3
- import torch
4
- import re
5
- import emoji
6
- import contractions
7
- from collections import defaultdict
8
- import joblib
9
- from transformers import BertTokenizer, BertModel
10
- import torch.nn as nn
11
- from torch.nn import functional as F
12
- import streamlit as st
13
-
14
- def load_lex(filepath):
15
- lexicon = defaultdict(dict)
16
- with open(filepath, 'r') as file:
17
- for line in file:
18
- word, emotion, value = line.strip().split('\t')
19
- if int(value) == 1:
20
- lexicon[word][emotion] = 1
21
- return lexicon
22
-
23
- def load_nrc_vad(filepath):
24
- vad_lex = {}
25
- with open(filepath, 'r', encoding='utf-8') as f:
26
- next(f) # skip header
27
- for line in f:
28
- word, val, aro, dom = line.strip().split('\t')
29
- vad_lex[word] = {
30
- 'valence': float(val),
31
- 'arousal': float(aro),
32
- 'dominance': float(dom)
33
- }
34
- return vad_lex
35
-
36
- def load_nrc_hash_emo(filepath):
37
- lexicon = defaultdict(dict)
38
- with open(filepath, 'r', encoding='utf-8') as f:
39
- for line in f:
40
- emotion, word, score = line.strip().split('\t')
41
- lexicon[word][emotion] = float(score)
42
- return lexicon
43
-
44
- def convert_emojis(text):
45
- text = emoji.demojize(text, delimiters=(" ", " "))
46
- text = re.sub(r':([a-zA-Z_]+):', r'\1', text)
47
- text = re.sub(r'\s+', ' ', text).strip()
48
- return text
49
-
50
- def clean_text(text):
51
- text = text.lower()
52
- text = contractions.fix(text)
53
- text = convert_emojis(text)
54
- text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
55
- text = re.sub(r'@\w+', '', text)
56
- text = re.sub(r"[^a-zA-Z\s.,!?']", '', text)
57
- text = re.sub(r'\s+', ' ', text).strip()
58
- return text
59
-
60
- def extract_lex(text, lexicon):
61
- emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
62
- 'sadness', 'surprise', 'trust', 'positive', 'negative']
63
- counts = dict.fromkeys(emotions, 0)
64
-
65
- for word in text.split():
66
- if word in lexicon:
67
- for emo in lexicon[word]:
68
- counts[emo] += 1
69
- return [counts[emo] for emo in emotions]
70
-
71
- def extract_vad(text, lexicon):
72
- valence = []
73
- arousal = []
74
- dominance = []
75
-
76
- for word in text.split():
77
- if word in lexicon:
78
- valence.append(lexicon[word]['valence'])
79
- arousal.append(lexicon[word]['arousal'])
80
- dominance.append(lexicon[word]['dominance'])
81
-
82
- # If no word matched, return zeros
83
- if not valence:
84
- return [0.0, 0.0, 0.0]
85
-
86
- # Otherwise, return means
87
- return [
88
- np.mean(valence),
89
- np.mean(arousal),
90
- np.mean(dominance)
91
- ]
92
-
93
- def extract_hash_emo(text, lexicon):
94
- emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
95
- 'sadness', 'surprise', 'trust']
96
- scores = {emo: [] for emo in emotions}
97
-
98
- for word in text.split():
99
- if word in lexicon:
100
- for emo, value in lexicon[word].items():
101
- scores[emo].append(value)
102
-
103
- return [np.mean(scores[emo]) if scores[emo] else 0.0 for emo in emotions]
104
-
105
- class EmotionMultiTaskModel(nn.Module):
106
- def __init__(self, num_emotions=4, lex_dim=21):
107
- super(EmotionMultiTaskModel, self).__init__()
108
- self.bert = BertModel.from_pretrained('bert-base-uncased')
109
- self.dropout = nn.Dropout(0.3)
110
-
111
- # Shared representation
112
- hidden_size = self.bert.config.hidden_size
113
- self.shared_layer = nn.Linear(hidden_size + lex_dim, hidden_size)
114
-
115
- # Task-specific layers
116
- self.classifier = nn.Linear(hidden_size, num_emotions) # Multi-label classification
117
- self.regressor = nn.Linear(hidden_size, num_emotions) # Multi-output regression
118
-
119
- def forward(self, input_ids, attention_mask, lexicon_feats):
120
- # Get BERT embeddings
121
- outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
122
- pooled_output = outputs.pooler_output
123
-
124
- # Concatenate with lexicon features
125
- combined = torch.cat((pooled_output, lexicon_feats), dim=1)
126
-
127
- # Shared representation
128
- shared_repr = F.relu(self.shared_layer(combined))
129
- shared_repr = self.dropout(shared_repr)
130
-
131
- # Task-specific outputs
132
- cls_logits = self.classifier(shared_repr) # For binary classification of each emotion
133
- reg_output = self.regressor(shared_repr) # For regression of each emotion's intensity
134
-
135
- # Apply sigmoid to classification logits
136
- cls_probs = torch.sigmoid(cls_logits)
137
-
138
- # Scale regression outputs to [0,1]
139
- reg_output = (torch.tanh(reg_output) + 1) / 2
140
-
141
- return cls_probs, reg_output
142
-
143
- emotion_cols = ["joy", "sadness", "anger", "fear"]
144
- lex_dim = 21
145
-
146
- @st.cache_resource
147
- def load_model_tokenizer(num_emotions, lex_dim, device):
148
- model = EmotionMultiTaskModel(num_emotions=num_emotions, lex_dim=lex_dim).to(device)
149
- model.load_state_dict(torch.load("best_multitask_multilabel_model.pth", map_location=device))
150
- model.eval()
151
-
152
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
153
- return model, tokenizer
154
-
155
- @st.cache_resource
156
- def load_scalers():
157
- scaler_lex = joblib.load("lex_scaler.pkl")
158
- scaler_vad = joblib.load("vad_scaler.pkl")
159
- scaler_hash = joblib.load("hash_scaler.pkl")
160
- return scaler_lex, scaler_vad, scaler_hash
161
-
162
- def load_lexicon_data():
163
- nrc_lexicon = load_lex("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
164
- nrc_vad_lexicon = load_nrc_vad("NRC-VAD-Lexicon-v2.1.txt")
165
- hash_emo_lex = load_nrc_hash_emo("NRC-Hashtag-Emotion-Lexicon-v0.2.txt")
166
- return nrc_lexicon, nrc_vad_lexicon, hash_emo_lex
167
-
168
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
169
- num_emotions = len(emotion_cols)
170
- model, tokenizer = load_model_tokenizer(num_emotions, lex_dim, device)
171
- scaler_lex, scaler_vad, scaler_hash = load_scalers
172
- nrc_lexicon, nrc_vad_lexicon, hash_emo_lex = load_lexicon_data
173
-
174
- def extract_all_lexicons(text):
175
- vad_feats = extract_vad(text, nrc_vad_lexicon)
176
- vad_feats = scaler_vad.transform([vad_feats])
177
-
178
- lex_feats = extract_lex(text, nrc_lexicon)
179
- lex_feats = scaler_lex.transform([lex_feats])
180
-
181
- hash_feats = extract_hash_emo(text, hash_emo_lex)
182
- hash_feats = scaler_hash.transform([hash_feats])
183
-
184
- combined_feats = np.concatenate([vad_feats, lex_feats, hash_feats], axis = 1)
185
- return combined_feats
186
-
187
- def predict_emotions(text, model, tokenizer, device, threshold=0.3):
188
- model.eval()
189
-
190
- # Clean and tokenize the text
191
- clean = clean_text(text)
192
- tokens = tokenizer(
193
- clean,
194
- padding='max_length',
195
- truncation=True,
196
- max_length=128,
197
- return_tensors='pt'
198
- )
199
-
200
- # Create lexicon features
201
- lexicon_feats = torch.tensor(extract_all_lexicons(clean), dtype=torch.float).to(device)
202
-
203
- # Move inputs to device
204
- input_ids = tokens['input_ids'].to(device)
205
- attention_mask = tokens['attention_mask'].to(device)
206
-
207
- # Get predictions
208
- with torch.no_grad():
209
- cls_probs, intensities = model(
210
- input_ids=input_ids,
211
- attention_mask=attention_mask,
212
- lexicon_feats=lexicon_feats
213
- )
214
-
215
- # Convert to numpy
216
- cls_probs = cls_probs.cpu().numpy()[0]
217
- intensities = intensities.cpu().numpy()[0]
218
-
219
- detected_emotions = np.zeros_like(cls_probs, dtype=bool)
220
- detected_emotions[cls_probs.argmax()] = True
221
-
222
- # Prepare results
223
- results = {}
224
- for i, emotion in enumerate(emotion_cols):
225
- results[emotion] = {
226
- "probability": float(cls_probs[i]),
227
- "detected": bool(detected_emotions[i]),
228
- "intensity": float(intensities[i]) if detected_emotions[i] else 0.0
229
- }
230
-
231
- return results
232
-
233
- # STREAMLIT UI
234
- st.title("Emotion Intensity Prediction using Transformer Based Models")
235
- st.markdown("Enter text below to predict emotions and their intensities.")
236
-
237
- text_input = st.text_area("Input Text:", height=150, placeholder="Type your sentence here... eg.I am very happy")
238
-
239
- if st.button("Predict Emotions"):
240
- if text_input.strip() == "":
241
- st.warning("Please enter some text to get predictions.")
242
- else:
243
- with st.spinner("Analyzing emotions..."):
244
- results = predict_emotions(text_input, model, tokenizer, device)
245
-
246
- st.subheader("Prediction Results:")
247
-
248
- emotions_sorted = sorted(
249
- [(emotion, details) for emotion, details in results.items() if details["detected"]],
250
- key=lambda x: x[1]["intensity"],
251
- reverse=True
252
- )
253
-
254
- if emotions_sorted:
255
- st.write("---")
256
- for emotion, details in emotions_sorted:
257
- st.write(f"### {emotion.capitalize()}")
258
- st.progress(details['intensity'], text = f"Intensity: {details['intensity']:.2f}")
259
- st.progress(details['probability'], text = f"Confidence Score: {details['probability']:.2f}")
260
- else:
261
  st.info("No emotions detected")
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+ import re
5
+ import emoji
6
+ import contractions
7
+ from collections import defaultdict
8
+ import joblib
9
+ from transformers import BertTokenizer, BertModel
10
+ import torch.nn as nn
11
+ from torch.nn import functional as F
12
+ import streamlit as st
13
+
14
def load_lex(filepath):
    """Parse a word-level NRC emotion lexicon file.

    Each line is ``word<TAB>emotion<TAB>flag``; only entries whose flag is 1
    are kept.  Returns a defaultdict mapping word -> {emotion: 1}.
    """
    result = defaultdict(dict)
    with open(filepath, 'r') as fh:
        for row in fh:
            term, emo, flag = row.strip().split('\t')
            if int(flag) == 1:
                result[term][emo] = 1
    return result
22
+
23
def load_nrc_vad(filepath):
    """Parse the NRC-VAD lexicon (header line, then ``word<TAB>v<TAB>a<TAB>d``).

    Returns a dict mapping word -> {'valence', 'arousal', 'dominance'} floats.
    """
    table = {}
    with open(filepath, 'r', encoding='utf-8') as fh:
        rows = iter(fh)
        next(rows)  # the first line is a column header, not data
        for row in rows:
            term, v, a, d = row.strip().split('\t')
            table[term] = {
                'valence': float(v),
                'arousal': float(a),
                'dominance': float(d),
            }
    return table
35
+
36
def load_nrc_hash_emo(filepath):
    """Parse the NRC Hashtag-Emotion lexicon (``emotion<TAB>word<TAB>score``).

    Returns a defaultdict mapping word -> {emotion: score} floats.
    """
    table = defaultdict(dict)
    with open(filepath, 'r', encoding='utf-8') as fh:
        for row in fh:
            emo, term, weight = row.strip().split('\t')
            table[term][emo] = float(weight)
    return table
43
+
44
def convert_emojis(text):
    """Replace emojis in *text* with their textual names, collapsing whitespace."""
    demojized = emoji.demojize(text, delimiters=(" ", " "))
    # Strip any remaining :name: wrappers the demojizer may leave behind.
    demojized = re.sub(r':([a-zA-Z_]+):', r'\1', demojized)
    return re.sub(r'\s+', ' ', demojized).strip()
49
+
50
def clean_text(text):
    """Normalize raw input for the model.

    Lowercases, expands contractions, verbalizes emojis, then removes URLs,
    @-mentions and any character outside letters/whitespace/basic punctuation,
    finally collapsing runs of whitespace.
    """
    cleaned = contractions.fix(text.lower())
    cleaned = convert_emojis(cleaned)
    cleaned = re.sub(r"http\S+|www\S+|https\S+", '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'@\w+', '', cleaned)
    cleaned = re.sub(r"[^a-zA-Z\s.,!?']", '', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()
59
+
60
def extract_lex(text, lexicon):
    """Count NRC emotion/sentiment hits for each whitespace token of *text*.

    Returns a list of 10 integer counts in a fixed label order (8 emotions
    followed by positive/negative sentiment).
    """
    labels = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
              'sadness', 'surprise', 'trust', 'positive', 'negative']
    tally = {label: 0 for label in labels}
    for token in text.split():
        for label in lexicon.get(token, ()):
            tally[label] += 1
    return [tally[label] for label in labels]
70
+
71
def extract_vad(text, lexicon):
    """Average valence/arousal/dominance over the tokens of *text* found in *lexicon*.

    Returns ``[valence, arousal, dominance]`` means, or ``[0.0, 0.0, 0.0]``
    when no token has a lexicon entry.
    """
    hits = [lexicon[token] for token in text.split() if token in lexicon]
    if not hits:
        return [0.0, 0.0, 0.0]
    return [np.mean([entry[dim] for entry in hits])
            for dim in ('valence', 'arousal', 'dominance')]
92
+
93
def extract_hash_emo(text, lexicon):
    """Mean hashtag-emotion score per emotion over the tokens of *text*.

    Returns 8 means in a fixed label order; an emotion with no scored token
    contributes 0.0.
    """
    labels = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
              'sadness', 'surprise', 'trust']
    buckets = {label: [] for label in labels}
    for token in text.split():
        entry = lexicon.get(token)
        if entry is None:
            continue
        for label, weight in entry.items():
            buckets[label].append(weight)
    return [np.mean(buckets[label]) if buckets[label] else 0.0 for label in labels]
104
+
105
class EmotionMultiTaskModel(nn.Module):
    """BERT encoder fused with lexicon features, feeding two heads:
    multi-label emotion classification and per-emotion intensity regression.

    Attribute names (bert / shared_layer / classifier / regressor) are part of
    the saved state_dict and must not be renamed.
    """

    def __init__(self, num_emotions=4, lex_dim=21):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)

        bert_dim = self.bert.config.hidden_size
        # Shared representation over the pooled BERT vector + lexicon features.
        self.shared_layer = nn.Linear(bert_dim + lex_dim, bert_dim)

        # Task-specific heads.
        self.classifier = nn.Linear(bert_dim, num_emotions)  # emotion presence
        self.regressor = nn.Linear(bert_dim, num_emotions)   # emotion intensity

    def forward(self, input_ids, attention_mask, lexicon_feats):
        """Return ``(cls_probs, intensities)``, both of shape
        (batch, num_emotions) with values in [0, 1].
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        fused = torch.cat((encoded.pooler_output, lexicon_feats), dim=1)

        hidden = self.dropout(F.relu(self.shared_layer(fused)))

        cls_probs = torch.sigmoid(self.classifier(hidden))
        # tanh maps to [-1, 1]; shift/scale into [0, 1] for intensity scores.
        intensities = (torch.tanh(self.regressor(hidden)) + 1) / 2
        return cls_probs, intensities
142
+
143
# Emotions predicted by the model, in the order of its output heads.
emotion_cols = ["joy", "sadness", "anger", "fear"]
# Total lexicon feature width: 3 VAD means + 10 NRC counts + 8 hashtag-emotion means.
lex_dim = 21
145
+
146
@st.cache_resource
def load_model_tokenizer(num_emotions, lex_dim, device):
    """Build the multitask model, restore its trained weights, and load the tokenizer.

    Cached by Streamlit so the heavy objects are created once per process.
    """
    net = EmotionMultiTaskModel(num_emotions=num_emotions, lex_dim=lex_dim).to(device)
    state = torch.load("best_multitask_multilabel_model.pth", map_location=device)
    net.load_state_dict(state)
    net.eval()  # inference only — disable dropout

    return net, BertTokenizer.from_pretrained('bert-base-uncased')
154
+
155
@st.cache_resource
def load_scalers():
    """Load the three fitted feature scalers, cached for the process lifetime.

    Returns (lexicon-count scaler, VAD scaler, hashtag-emotion scaler).
    """
    return (
        joblib.load("lex_scaler.pkl"),
        joblib.load("vad_scaler.pkl"),
        joblib.load("hash_scaler.pkl"),
    )
161
+
162
@st.cache_resource
def load_lexicon_data():
    """Load the three NRC lexicon files from disk.

    Decorated with st.cache_resource — consistent with load_model_tokenizer and
    load_scalers — so the lexicon files are parsed once per process instead of
    being re-read on every Streamlit rerun.

    Returns (word-level emotion lexicon, VAD lexicon, hashtag-emotion lexicon).
    """
    nrc_lexicon = load_lex("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
    nrc_vad_lexicon = load_nrc_vad("NRC-VAD-Lexicon-v2.1.txt")
    hash_emo_lex = load_nrc_hash_emo("NRC-Hashtag-Emotion-Lexicon-v0.2.txt")
    return nrc_lexicon, nrc_vad_lexicon, hash_emo_lex
167
+
168
# One-time app setup: pick the device and materialize the cached resources
# (model + tokenizer, fitted scalers, NRC lexicons).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_emotions = len(emotion_cols)
model, tokenizer = load_model_tokenizer(num_emotions, lex_dim, device)
scaler_lex, scaler_vad, scaler_hash = load_scalers()
nrc_lexicon, nrc_vad_lexicon, hash_emo_lex = load_lexicon_data()
173
+
174
def extract_all_lexicons(text):
    """Build the scaled lexicon feature row for *text*, shape (1, 21).

    Feature order: 3 VAD means, 10 NRC emotion/sentiment counts, 8
    hashtag-emotion means — each scaled by its corresponding fitted scaler.
    """
    scaled_parts = [
        scaler_vad.transform([extract_vad(text, nrc_vad_lexicon)]),
        scaler_lex.transform([extract_lex(text, nrc_lexicon)]),
        scaler_hash.transform([extract_hash_emo(text, hash_emo_lex)]),
    ]
    return np.concatenate(scaled_parts, axis=1)
186
+
187
def predict_emotions(text, model, tokenizer, device, threshold=0.3):
    """Predict which emotions are present in *text* and their intensities.

    Args:
        text: raw user input; cleaned with clean_text before tokenization.
        model: trained EmotionMultiTaskModel.
        tokenizer: BERT tokenizer matching the model.
        device: torch device the model lives on.
        threshold: probability at or above which an emotion counts as
            detected.  The most probable emotion is always reported, so at
            least one emotion is detected.  (Previously this parameter was
            accepted but ignored, and detection was pure argmax.)

    Returns:
        dict mapping emotion name -> {"probability", "detected", "intensity"};
        intensity is reported only for detected emotions (0.0 otherwise).
    """
    model.eval()

    # Clean and tokenize the text.
    clean = clean_text(text)
    tokens = tokenizer(
        clean,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    # Scaled lexicon features for the cleaned text, shape (1, 21).
    lexicon_feats = torch.tensor(extract_all_lexicons(clean), dtype=torch.float).to(device)

    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    with torch.no_grad():
        cls_probs, intensities = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            lexicon_feats=lexicon_feats
        )

    # Drop the batch dimension and move to numpy.
    cls_probs = cls_probs.cpu().numpy()[0]
    intensities = intensities.cpu().numpy()[0]

    # Multi-label detection: every emotion at/above the threshold counts, and
    # the top-probability emotion is always included so the result is never empty.
    detected_emotions = cls_probs >= threshold
    detected_emotions[cls_probs.argmax()] = True

    results = {}
    for i, emotion in enumerate(emotion_cols):
        results[emotion] = {
            "probability": float(cls_probs[i]),
            "detected": bool(detected_emotions[i]),
            "intensity": float(intensities[i]) if detected_emotions[i] else 0.0
        }

    return results
232
+
233
# STREAMLIT UI
st.title("Emotion Intensity Prediction using Transformer Based Models")
st.markdown("Enter text below to predict emotions and their intensities.")

text_input = st.text_area("Input Text:", height=150, placeholder="Type your sentence here... eg.I am very happy")

if st.button("Predict Emotions"):
    if text_input.strip() == "":
        st.warning("Please enter some text to get predictions.")
    else:
        with st.spinner("Analyzing emotions..."):
            results = predict_emotions(text_input, model, tokenizer, device)

        st.subheader("Prediction Results:")

        # Keep only detected emotions, strongest intensity first.
        detected = [item for item in results.items() if item[1]["detected"]]
        detected.sort(key=lambda item: item[1]["intensity"], reverse=True)

        if not detected:
            st.info("No emotions detected")
        else:
            st.write("---")
            for name, info in detected:
                st.write(f"### {name.capitalize()}")
                st.progress(info['intensity'], text=f"Intensity: {info['intensity']:.2f}")
                st.progress(info['probability'], text=f"Confidence Score: {info['probability']:.2f}")