MienOlle commited on
Commit
7c8b31f
·
verified ·
1 Parent(s): 63c8407

Main Initial Commits

Browse files
NRC-Emotion-Lexicon-Wordlevel-v0.92.txt ADDED
The diff for this file is too large to render. See raw diff
 
NRC-Hashtag-Emotion-Lexicon-v0.2.txt ADDED
The diff for this file is too large to render. See raw diff
 
NRC-VAD-Lexicon-v2.1.txt ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,20 +1,37 @@
1
- ---
2
- title: EmoInt BERT
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Streamlit template space
12
- license: apache-2.0
13
- ---
14
-
15
- # Welcome to Streamlit!
16
-
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
-
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Emotion Intensity Prediction using Transformer Based Models
3
+ emoji: 🤩
4
+ colorFrom: purple
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.x.x
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # Multitask Emotion Prediction Space
13
+
14
+ This Hugging Face Space hosts a deep learning model that predicts emotions and their intensities from text.
15
+ It utilizes a BERT-based architecture combined with lexicon features for enhanced performance.
16
+
17
+ **Features:**
18
+ - BERT-based text understanding.
19
+ - Integration of NRC VAD, NRC Emotion Lexicon, and NRC Hashtag Emotion Lexicon.
20
+ - Multi-task learning for emotion classification (joy, sadness, anger, fear) and intensity regression.
21
+
22
+ **How to use:**
23
+ Enter your text in the input box below and click "Predict Emotions" to see the model's output.
24
+
25
+ **Model Details:**
26
+ - Trained on the SemEval-2018 Task 1 EI-reg (emotion intensity regression) dataset
27
+ - Uses `bert-base-uncased` from Hugging Face.
28
+ - `lex_dim`: 21 (number of combined lexicon features)
29
+
30
+ **Files included:**
31
+ - `app.py`: The Streamlit application code.
32
+ - `best_multitask_multilabel_model.pth`: Trained model weights.
33
+ - `*_scaler.pkl`: Joblib-saved feature scalers for lexicon features.
34
+ - `NRC-*.txt`: Lexicon data files.
35
+
36
+ ---
37
+ Feel free to duplicate this Space and experiment!
app.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+ import re
5
+ import emoji
6
+ import contractions
7
+ from collections import defaultdict
8
+ import joblib
9
+ from transformers import BertTokenizer, BertModel
10
+ import torch.nn as nn
11
+ from torch.nn import functional as F
12
+ import streamlit as st
13
+
14
def load_lex(filepath):
    """Load the NRC word-level Emotion Lexicon.

    The file contains tab-separated lines: word<TAB>emotion<TAB>association,
    where association is 0 or 1. Only positive (value == 1) associations
    are stored.

    Args:
        filepath: Path to NRC-Emotion-Lexicon-Wordlevel-*.txt.

    Returns:
        defaultdict mapping word -> {emotion: 1} for associated emotions.
    """
    lexicon = defaultdict(dict)
    # Explicit UTF-8 for consistency with the other lexicon loaders in this
    # file (load_nrc_vad / load_nrc_hash_emo), which already pass encoding.
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) != 3:
                # Skip blank or malformed lines instead of raising on unpack.
                continue
            word, emotion, value = parts
            if int(value) == 1:
                lexicon[word][emotion] = 1
    return lexicon
22
+
23
def load_nrc_vad(filepath):
    """Read the NRC VAD lexicon into a plain dict.

    Returns:
        dict mapping word -> {'valence': float, 'arousal': float,
        'dominance': float}.
    """
    vad_lex = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        next(f)  # first line is the column header row
        for row in f:
            term, v, a, d = row.strip().split('\t')
            vad_lex[term] = {
                'valence': float(v),
                'arousal': float(a),
                'dominance': float(d),
            }
    return vad_lex
35
+
36
def load_nrc_hash_emo(filepath):
    """Read the NRC Hashtag Emotion lexicon.

    File lines are tab-separated: emotion<TAB>word<TAB>score.

    Returns:
        defaultdict mapping word -> {emotion: float score}.
    """
    lexicon = defaultdict(dict)
    with open(filepath, 'r', encoding='utf-8') as f:
        for row in f:
            emotion, term, score = row.strip().split('\t')
            lexicon[term][emotion] = float(score)
    return lexicon
43
+
44
def convert_emojis(text):
    """Replace emoji with space-separated name tokens (e.g. a smiley becomes
    its demojized name with surrounding spaces)."""
    # demojize with space delimiters already avoids the usual ':' wrappers.
    converted = emoji.demojize(text, delimiters=(" ", " "))
    # Also unwrap any ':name:' tokens that appear verbatim in the input.
    converted = re.sub(r':([a-zA-Z_]+):', r'\1', converted)
    # Collapse whitespace runs introduced by the substitutions.
    return re.sub(r'\s+', ' ', converted).strip()
49
+
50
def clean_text(text):
    """Normalize raw input text before tokenization.

    Lowercases, expands contractions, converts emoji to name tokens, then
    strips URLs, @mentions and characters outside letters/basic punctuation.
    """
    lowered = contractions.fix(text.lower())
    named = convert_emojis(lowered)
    no_urls = re.sub(r"http\S+|www\S+|https\S+", '', named, flags=re.MULTILINE)
    no_mentions = re.sub(r'@\w+', '', no_urls)
    letters_only = re.sub(r"[^a-zA-Z\s.,!?']", '', no_mentions)
    return re.sub(r'\s+', ' ', letters_only).strip()
59
+
60
def extract_lex(text, lexicon):
    """Count NRC Emotion Lexicon hits per category for each word in *text*.

    Args:
        text: Already-cleaned, whitespace-tokenizable text.
        lexicon: Mapping word -> {emotion: 1}, as produced by load_lex.

    Returns:
        List of 10 integer counts in the fixed order below.
    """
    emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                'sadness', 'surprise', 'trust', 'positive', 'negative']
    counts = dict.fromkeys(emotions, 0)

    for word in text.split():
        # .get avoids inserting keys into a defaultdict-backed lexicon.
        for emo in lexicon.get(word, ()):
            # Guard against categories outside the expected NRC set so a
            # malformed lexicon entry cannot raise KeyError.
            if emo in counts:
                counts[emo] += 1
    return [counts[emo] for emo in emotions]
70
+
71
def extract_vad(text, lexicon):
    """Mean valence/arousal/dominance over the words found in the VAD lexicon.

    Returns:
        [mean_valence, mean_arousal, mean_dominance], or [0.0, 0.0, 0.0]
        when no word in *text* is present in the lexicon.
    """
    matched = [lexicon[w] for w in text.split() if w in lexicon]

    if not matched:
        return [0.0, 0.0, 0.0]

    return [
        np.mean([entry['valence'] for entry in matched]),
        np.mean([entry['arousal'] for entry in matched]),
        np.mean([entry['dominance'] for entry in matched]),
    ]
92
+
93
def extract_hash_emo(text, lexicon):
    """Mean hashtag-emotion score per emotion over matched words.

    Returns:
        List of 8 floats in the fixed emotion order; 0.0 for emotions with
        no matching word in *text*.
    """
    emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                'sadness', 'surprise', 'trust']
    collected = defaultdict(list)

    for token in text.split():
        entry = lexicon.get(token)
        if entry:
            for emo, value in entry.items():
                collected[emo].append(value)

    return [np.mean(collected[emo]) if collected[emo] else 0.0 for emo in emotions]
104
+
105
class EmotionMultiTaskModel(nn.Module):
    """BERT encoder plus lexicon features feeding two task heads.

    A shared fused layer combines BERT's pooled output with a lexicon
    feature vector; a classification head produces per-emotion
    probabilities and a regression head produces per-emotion intensities.
    Attribute names (bert, dropout, shared_layer, classifier, regressor)
    must stay unchanged so saved state_dicts keep loading.
    """

    def __init__(self, num_emotions=4, lex_dim=21):
        super(EmotionMultiTaskModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)

        # Fuse the pooled BERT representation with the lexicon features.
        hidden_size = self.bert.config.hidden_size
        self.shared_layer = nn.Linear(hidden_size + lex_dim, hidden_size)

        # One head per task, both sized to the number of emotions.
        self.classifier = nn.Linear(hidden_size, num_emotions)  # multi-label logits
        self.regressor = nn.Linear(hidden_size, num_emotions)   # intensity outputs

    def forward(self, input_ids, attention_mask, lexicon_feats):
        """Return (classification probabilities, intensities), both in [0, 1]."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Concatenate the pooled [CLS]-based output with lexicon features.
        fused = torch.cat((encoded.pooler_output, lexicon_feats), dim=1)
        hidden = self.dropout(F.relu(self.shared_layer(fused)))

        # Sigmoid turns per-emotion logits into independent probabilities.
        probs = torch.sigmoid(self.classifier(hidden))
        # tanh maps to [-1, 1]; shift/scale into the [0, 1] intensity range.
        intensity = (torch.tanh(self.regressor(hidden)) + 1) / 2

        return probs, intensity
142
+
143
# Emotion labels produced by the model; the order must match the label
# order used for both the classification and regression heads at training.
emotion_cols = ["joy", "sadness", "anger", "fear"]
# Total lexicon feature dimension: 3 (VAD means) + 10 (NRC word-level
# counts) + 8 (hashtag-emotion means) = 21.
lex_dim = 21
145
+
146
@st.cache_resource
def load_model_tokenizer(num_emotions, lex_dim, device):
    """Build the multitask model, load its trained weights, and pair it
    with the matching BERT tokenizer. Cached across Streamlit reruns."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    model = EmotionMultiTaskModel(num_emotions=num_emotions, lex_dim=lex_dim).to(device)
    state = torch.load("best_multitask_multilabel_model.pth", map_location=device)
    model.load_state_dict(state)
    model.eval()  # inference only — disable dropout

    return model, tokenizer
154
+
155
@st.cache_resource
def load_scalers():
    """Load the three joblib-saved feature scalers (NRC word-level, VAD,
    hashtag). Cached across Streamlit reruns."""
    files = ("lex_scaler.pkl", "vad_scaler.pkl", "hash_scaler.pkl")
    lex, vad, hashtag = (joblib.load(name) for name in files)
    return lex, vad, hashtag
161
+
162
def load_lexicon_data():
    """Parse the three bundled NRC lexicon text files from the working
    directory and return them as (word-level, VAD, hashtag)."""
    word_level = load_lex("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
    vad = load_nrc_vad("NRC-VAD-Lexicon-v2.1.txt")
    hashtag = load_nrc_hash_emo("NRC-Hashtag-Emotion-Lexicon-v0.2.txt")
    return word_level, vad, hashtag
167
+
168
# Runtime setup: pick a device, then build the cached model/tokenizer,
# feature scalers, and lexicon dictionaries used by the prediction pipeline.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_emotions = len(emotion_cols)
model, tokenizer = load_model_tokenizer(num_emotions, lex_dim, device)
# BUG FIX: the loader functions were previously unpacked without being
# called (`= load_scalers` / `= load_lexicon_data`), which raises
# "TypeError: cannot unpack non-iterable function object" at startup.
scaler_lex, scaler_vad, scaler_hash = load_scalers()
nrc_lexicon, nrc_vad_lexicon, hash_emo_lex = load_lexicon_data()
173
+
174
def extract_all_lexicons(text):
    """Build the scaled (1, 21) lexicon feature row for cleaned text.

    Concatenates, in order: 3 scaled VAD means, 10 scaled NRC word-level
    counts, and 8 scaled hashtag-emotion means.
    """
    vad_row = scaler_vad.transform([extract_vad(text, nrc_vad_lexicon)])
    lex_row = scaler_lex.transform([extract_lex(text, nrc_lexicon)])
    hash_row = scaler_hash.transform([extract_hash_emo(text, hash_emo_lex)])

    return np.concatenate([vad_row, lex_row, hash_row], axis=1)
186
+
187
def predict_emotions(text, model, tokenizer, device, threshold=0.3):
    """Predict emotion probabilities and intensities for a piece of text.

    Args:
        text: Raw user input text.
        model: Trained EmotionMultiTaskModel.
        tokenizer: Matching BERT tokenizer.
        device: Torch device to run inference on.
        threshold: Minimum classification probability for an emotion to be
            marked as detected. If no emotion reaches the threshold, the
            single most probable emotion is reported as a fallback.

    Returns:
        dict mapping each emotion name to {"probability": float,
        "detected": bool, "intensity": float} (intensity is 0.0 for
        undetected emotions).
    """
    model.eval()

    # Clean and tokenize the text.
    clean = clean_text(text)
    tokens = tokenizer(
        clean,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    # Scaled lexicon features, shape (1, lex_dim).
    lexicon_feats = torch.tensor(extract_all_lexicons(clean), dtype=torch.float).to(device)

    # Move inputs to device.
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    # Get predictions.
    with torch.no_grad():
        cls_probs, intensities = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            lexicon_feats=lexicon_feats
        )

    # Convert to numpy (single example in the batch).
    cls_probs = cls_probs.cpu().numpy()[0]
    intensities = intensities.cpu().numpy()[0]

    # BUG FIX: the `threshold` parameter was accepted but never used — only
    # the argmax emotion was ever flagged, defeating the multi-label design.
    # Detect every emotion whose probability clears the threshold; fall back
    # to the most probable emotion so at least one result is always shown.
    detected_emotions = cls_probs >= threshold
    if not detected_emotions.any():
        detected_emotions = np.zeros_like(cls_probs, dtype=bool)
        detected_emotions[cls_probs.argmax()] = True

    # Prepare results.
    results = {}
    for i, emotion in enumerate(emotion_cols):
        results[emotion] = {
            "probability": float(cls_probs[i]),
            "detected": bool(detected_emotions[i]),
            "intensity": float(intensities[i]) if detected_emotions[i] else 0.0
        }

    return results
232
+
233
# STREAMLIT UI
st.title("Emotion Intensity Prediction using Transformer Based Models")
st.markdown("Enter text below to predict emotions and their intensities.")

# Fixed user-facing typo in the placeholder: "eg.I" -> "e.g. I".
text_input = st.text_area("Input Text:", height=150, placeholder="Type your sentence here... e.g. I am very happy")

if st.button("Predict Emotions"):
    if text_input.strip() == "":
        st.warning("Please enter some text to get predictions.")
    else:
        with st.spinner("Analyzing emotions..."):
            results = predict_emotions(text_input, model, tokenizer, device)

        st.subheader("Prediction Results:")

        # Show only detected emotions, strongest intensity first.
        emotions_sorted = sorted(
            [(emotion, details) for emotion, details in results.items() if details["detected"]],
            key=lambda x: x[1]["intensity"],
            reverse=True
        )

        if emotions_sorted:
            st.write("---")
            for emotion, details in emotions_sorted:
                st.write(f"### {emotion.capitalize()}")
                # Both values are in [0, 1], as st.progress requires.
                st.progress(details['intensity'], text=f"Intensity: {details['intensity']:.2f}")
                st.progress(details['probability'], text=f"Confidence Score: {details['probability']:.2f}")
        else:
            st.info("No emotions detected")
best_multitask_multilabel_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b3a907a4570aaba9e52178de750ff390a6cb670df28bb8d2764d034d0940c5e
3
+ size 440468464
hash_scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96fc40c55c141a99d1987619dcbde2ec5a91f2c586e36ccd2b2a7a7a2ea9d5ed
3
+ size 807
lex_scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62355246521d488c9cb51ff1c77cff6bf468329eca1c6c5f79e9c5c18e4feb09
3
+ size 855
requirements.txt CHANGED
@@ -1,3 +1,8 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
1
+ streamlit
2
+ numpy
3
+ pandas
4
+ torch
5
+ transformers
6
+ scikit-learn
7
+ emoji
8
+ contractions
vad_scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83254f8757483b76edf9dababfbcfa47e994357d7c1d1b15c9420b2cf93ebf9e
3
+ size 671