opinder2906 committed on
Commit
0a6f875
·
verified ·
1 Parent(s): eb46451

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +316 -85
app.py CHANGED
@@ -1,96 +1,327 @@
1
- # app.py
 
 
 
 
 
 
 
2
 
3
- import streamlit as st
4
  import pandas as pd
5
- import re
6
  from sklearn.feature_extraction.text import TfidfVectorizer
 
7
  from sklearn.linear_model import LogisticRegression
8
- from sklearn.model_selection import train_test_split
9
- from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- # --- Page Config ---
12
- st.set_page_config(page_title="Conversational ChatBot", layout="centered")
13
- st.title("💬 Conversational Sentiment ChatBot")
14
 
15
- # --- Utility Functions ---
16
- def clean_text(text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  text = text.lower()
18
- text = re.sub(r"http\S+|www\S+|https\S+", "", text)
19
- text = re.sub(r"[^a-z\s]", "", text)
20
- return text.strip()
21
-
22
- # --- Data Load & Prepare ---
23
- @st.cache_data
24
- def load_and_sample(sample_size: int = 2000):
25
- # Load two sentiment CSVs
26
- df1 = pd.read_csv(
27
- "https://drive.google.com/uc?export=download&id=14D_HcvTFL63-KffCQLNFxGH-oY_knwmo",
28
- delimiter=';', header=None, names=['sentence', 'label']
29
- )
30
- df2 = pd.read_csv(
31
- "https://drive.google.com/uc?export=download&id=1Vmr1Rfv4pLSlAUrlOCxAcszvlxJOSHrm",
32
- delimiter=';', header=None, names=['sentence', 'label']
33
- )
34
- df = pd.concat([df1, df2], ignore_index=True)
35
- if len(df) > sample_size:
36
- df = df.sample(sample_size, random_state=42)
37
- df['clean'] = df['sentence'].apply(clean_text)
38
- return df
39
-
40
- df = load_and_sample(sample_size=2000)
41
-
42
- # --- Train/Test Split ---
43
- X = df['clean']
44
  y = df['label']
45
- X_train, X_test, y_train, y_test = train_test_split(
46
- X, y, test_size=0.2, stratify=y, random_state=42
47
- )
48
 
49
- # --- Model Training ---
50
- @st.cache_resource
51
- def train_sentiment_model(max_feats: int = 500, ngram=(1,1)):
52
- vec = TfidfVectorizer(max_features=max_feats, ngram_range=ngram)
53
- X_tr = vec.fit_transform(X_train)
54
- clf = LogisticRegression(max_iter=1000)
55
- clf.fit(X_tr, y_train)
56
- return vec, clf
57
-
58
- vectorizer, model = train_sentiment_model()
59
-
60
- # --- Mapping Sentiment to Responses ---
61
- response_map = {
62
- 'sadness': "I’m sorry you’re feeling sad. It might help to talk to a friend or try a relaxing activity.",
63
- 'joy': "That’s wonderful! What’s making you feel happy today?",
64
- 'anger': "I understand you’re upset. Taking deep breaths or a short walk might help calm you.",
65
- 'love': "Love is a beautiful feeling. Would you like to share more about it?",
66
- 'fear': "It sounds like you’re feeling scared. Remember, it’s okay to feel fear—what’s on your mind?",
67
- 'surprise': "That’s surprising! Care to tell me more about what happened?",
68
- 'neutral': "Thanks for sharing. How are you feeling overall today?"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  }
70
 
71
- def get_chatbot_reply(text: str) -> str:
72
- clean = clean_text(text)
73
- vec = vectorizer.transform([clean])
74
- pred = model.predict(vec)[0]
75
- prob = model.predict_proba(vec).max()
76
- reply = response_map.get(pred, "I hear you. Tell me more.")
77
- return f"{reply} (I’m {int(prob*100)}% certain)"
78
-
79
- # --- Chat Interface ---
80
- st.markdown("---")
81
- st.header("🤖 Chat with the Bot")
82
-
83
- if 'history' not in st.session_state:
84
- st.session_state.history = []
85
-
86
- # Display conversation
87
- for speaker, message in st.session_state.history:
88
- tag = "**You:**" if speaker=='user' else "**Bot:**"
89
- st.markdown(f"{tag} {message}")
90
-
91
- # User input
92
- text_in = st.text_input("You:", key="input")
93
- if text_in:
94
- st.session_state.history.append(('user', text_in))
95
- bot_reply = get_chatbot_reply(text_in)
96
- st.session_state.history.append(('bot', bot_reply))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Copy of May 22.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1wsn8k2j5HMalAorkmvAnIh01I-6zWJRG
8
+ """
9
 
 
10
  import pandas as pd
 
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.model_selection import train_test_split, GridSearchCV
13
  from sklearn.linear_model import LogisticRegression
14
+ from sklearn.ensemble import RandomForestClassifier
15
+ from sklearn.svm import SVC
16
+ from xgboost import XGBClassifier
17
+ from sklearn.pipeline import Pipeline
18
+ import joblib
19
+ from transformers import pipeline as hf_pipeline
20
+ import re
21
+
22
+ # 1. Load datasets
23
+ df = pd.read_csv(
24
+ "https://drive.google.com/uc?export=download&id=14D_HcvTFL63-KffCQLNFxGH-oY_knwmo",
25
+ delimiter=';', header=None, names=['sentence', 'label']
26
+ )
27
+ ts_df = pd.read_csv(
28
+ "https://drive.google.com/uc?export=download&id=1Vmr1Rfv4pLSlAUrlOCxAcszvlxJOSHrm",
29
+ delimiter=';', header=None, names=['sentence', 'label']
30
+ )
31
 
32
+ df = pd.concat([df, ts_df], ignore_index=True)
 
 
33
 
34
+ df
35
+
36
+ total_rows = df.shape[0]
37
+
38
+ # % of null values
39
+ null_percent = df.isnull().mean() * 100
40
+
41
+ # % of duplicate rows
42
+ duplicate_rows = df.duplicated().sum()
43
+ duplicate_percent = (duplicate_rows / total_rows) * 100
44
+
45
+ print("Null Value Percentage:\n", null_percent)
46
+ print(f"\n📄 Duplicate Rows: {duplicate_rows} ({duplicate_percent:.2f}%)")
47
+ df.drop_duplicates(inplace=True)
48
def clean_text(text):
    """Normalize a raw sentence to lowercase, letters-only, single-spaced text.

    Args:
        text: Raw cell value. ``None``/NaN yield ``""``; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isnull(text):
        return ""
    # A cell that was parsed as a number has no ``.lower()``; coerce it first.
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove @mentions and the '#' sign
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    return text
57
+ df['clean_sentence'] = df['sentence'].apply(clean_text)
58
+ # Load and prepare data
59
+ X = df['clean_sentence']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  y = df['label']
 
 
 
61
 
62
+ # 1. Install necessary libraries in Colab (run once)
63
# NOTE: `!pip` / `!python` are Colab/IPython shell escapes and are a SyntaxError
# in a plain .py file. Run these once from a shell instead:
#   pip install textblob
#   python -m textblob.download_corpora
65
+
66
+ # === MODEL TRAINING CODE WITH REQUIRED CONCEPTS ===
67
+
68
+ import torch
69
+ import torch.nn as nn
70
+ from torch.utils.data import Dataset, DataLoader
71
+ import pandas as pd
72
+ import numpy as np
73
+ from sklearn.model_selection import train_test_split
74
+ from sklearn.preprocessing import LabelEncoder
75
+ from collections import Counter
76
+ import matplotlib.pyplot as plt
77
+ import seaborn as sns
78
+ from sklearn.feature_extraction.text import CountVectorizer
79
+
80
+ # --- 1. Load and preprocess your DataFrame ---
81
+
82
+ tokenized = df['clean_sentence'].apply(str.split)
83
+
84
+ # --- 2. Build Vocabulary ---
85
+ vocab = Counter([token for sentence in tokenized for token in sentence])
86
+ vocab = {word: i+2 for i, (word, _) in enumerate(vocab.most_common())}
87
+ vocab['<PAD>'] = 0
88
+ vocab['<UNK>'] = 1
89
+
90
def encode(text):
    """Map each token in ``text`` to its vocabulary id.

    Tokens missing from ``vocab`` are mapped to the ``<UNK>`` id.
    """
    token_ids = []
    for token in text:
        token_ids.append(vocab.get(token, vocab['<UNK>']))
    return token_ids
92
+
93
+ encoded_texts = tokenized.apply(encode)
94
+
95
+ # --- 3. Pad Sequences ---
96
+ MAX_LEN = 32
97
def pad_sequence(seq):
    """Truncate or right-pad ``seq`` with the ``<PAD>`` id to ``MAX_LEN`` entries."""
    head = seq[:MAX_LEN]
    shortfall = MAX_LEN - len(seq)
    if shortfall <= 0:
        # Already long enough: the truncated copy is the result.
        return head
    return head + [vocab['<PAD>']] * shortfall
99
+ padded = encoded_texts.apply(pad_sequence).tolist()
100
+
101
+ # --- 4. Encode Labels ---
102
+ le = LabelEncoder()
103
+ labels = le.fit_transform(df['label'])
104
+
105
+ # --- 5. Dataset + DataLoader ---
106
class EmotionDataset(Dataset):
    """Dataset pairing encoded token-id sequences with integer class labels."""

    def __init__(self, X, y):
        # Both inputs are materialized once as int64 tensors.
        self.X, self.y = (
            torch.tensor(values, dtype=torch.long) for values in (X, y)
        )

    def __len__(self):
        # Number of samples equals the number of rows in the feature tensor.
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target
116
+
117
+ X_train, X_val, y_train, y_val = train_test_split(padded, labels, test_size=0.2, stratify=labels, random_state=42)
118
+ train_loader = DataLoader(EmotionDataset(X_train, y_train), batch_size=16, shuffle=True)
119
+ val_loader = DataLoader(EmotionDataset(X_val, y_val), batch_size=16)
120
+
121
+ # --- 6. Co-occurrence Matrix (Visualization Only) ---
122
+ vectorizer = CountVectorizer(max_features=20)
123
+ X_counts = vectorizer.fit_transform(df['clean_sentence'])
124
+ X_counts = (X_counts.T * X_counts)
125
+ X_counts.setdiag(0)
126
+ plt.figure(figsize=(18, 18))
127
+ sns.heatmap(X_counts.toarray(), xticklabels=vectorizer.get_feature_names_out(),
128
+ yticklabels=vectorizer.get_feature_names_out(), cmap="YlGnBu", annot=True)
129
+ plt.title("Word Co-occurrence Matrix")
130
+ plt.show()
131
+
132
+ # --- 7. Positional Encoding ---
133
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        d_model: Size of the embedding (last) dimension.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=MAX_LEN):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-np.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim the cosine table to fit.
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]
        # Non-persistent buffer: it follows the module through .to()/.cuda()/
        # .half() but stays out of state_dict, so existing checkpoints still load.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, position, d_model); ``x.size(1)`` must not
        exceed the ``max_len`` given at construction.
        """
        return x + self.pe[:, :x.size(1)]
145
+
146
+ # --- 8. Transformer Model with Masking + Dropout for Bayesian Inference ---
147
class EmotionTransformer(nn.Module):
    """Transformer-encoder sentence classifier over padded token-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per encoder layer.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=vocab['<PAD>'])
        self.pos_encoder = PositionalEncoding(embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)
        # Dropout is also what the chat loop keeps active for its averaged
        # stochastic forward passes.
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token ids indexed (batch, position)."""
        pad_mask = (x == vocab['<PAD>'])
        hidden = self.embedding(x)
        hidden = self.pos_encoder(hidden)
        hidden = self.transformer(hidden, src_key_padding_mask=pad_mask)
        # Masked mean pooling: average only over real tokens. A plain
        # mean(dim=1) also averages the <PAD> positions, so the sentence vector
        # would change with how much padding a sentence happens to have.
        hidden = hidden.masked_fill(pad_mask.unsqueeze(-1), 0.0)
        real_tokens = (~pad_mask).sum(dim=1, keepdim=True).clamp(min=1)
        pooled = hidden.sum(dim=1) / real_tokens.to(hidden.dtype)
        return self.fc(self.dropout(pooled))
164
+
165
+ # --- 9. Train the Model ---
166
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
167
+ model = EmotionTransformer(len(vocab), embed_dim=64, num_heads=4, num_classes=len(le.classes_)).to(device)
168
+ optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
169
+ criterion = nn.CrossEntropyLoss()
170
+
171
+ for epoch in range(5):
172
+ model.train()
173
+ total_loss = 0
174
+ for X_batch, y_batch in train_loader:
175
+ X_batch, y_batch = X_batch.to(device), y_batch.to(device)
176
+ optimizer.zero_grad()
177
+ logits = model(X_batch)
178
+ loss = criterion(logits, y_batch)
179
+ loss.backward()
180
+ optimizer.step()
181
+ total_loss += loss.item()
182
+
183
+ # Validation
184
+ model.eval()
185
+ correct = total = 0
186
+ with torch.no_grad():
187
+ for X_batch, y_batch in val_loader:
188
+ X_batch, y_batch = X_batch.to(device), y_batch.to(device)
189
+ outputs = model(X_batch)
190
+ preds = torch.argmax(outputs, dim=1)
191
+ correct += (preds == y_batch).sum().item()
192
+ total += y_batch.size(0)
193
+
194
+ print(f"Epoch {epoch+1} | Train Loss: {total_loss:.4f} | Val Accuracy: {correct / total:.4f}")
195
+
196
+ # Save model
197
+ torch.save(model.state_dict(), "emotion_transformer_model.pth")
198
+
199
# NOTE: `! pip` / `! python` are Colab/IPython shell escapes and are a
# SyntaxError in a plain .py file. Run these once from a shell instead:
#   pip install textblob
#   python -m textblob.download_corpora
201
+
202
+ import torch
203
+ import torch.nn.functional as F
204
+ import random
205
+ from textblob import TextBlob
206
+
207
+ # Load model
208
+ model.load_state_dict(torch.load("emotion_transformer_model.pth", map_location=device))
209
+ model.eval()
210
+
211
+ # Preprocess user input
212
def preprocess_input(text):
    """Encode one raw user message as a ``(1, MAX_LEN)`` tensor of token ids.

    Args:
        text: The user's message.

    Returns:
        A ``torch.long`` tensor on ``device`` holding the padded/truncated ids.
    """
    # Clean exactly like the training sentences. A bare lower().split() leaves
    # punctuation attached ("happy!"), which then misses the vocabulary and
    # becomes <UNK>.
    tokens = clean_text(text).split()
    encoded = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    padded = encoded[:MAX_LEN] + [vocab['<PAD>']] * max(0, MAX_LEN - len(encoded))
    return torch.tensor([padded], dtype=torch.long).to(device)
217
+
218
+ # Emotion responses
219
+ responses = {
220
+ "sadness": [
221
+ "It’s okay to feel down sometimes. I’m here to support you.",
222
+ "I'm really sorry you're going through this. Want to talk more about it?",
223
+ "You're not alone — I’m here for you."
224
+ ],
225
+ "anger": [
226
+ "That must have been frustrating. Want to vent about it?",
227
+ "It's okay to feel this way. I'm listening.",
228
+ "Would it help to talk through it?"
229
+ ],
230
+ "love": [
231
+ "That’s beautiful to hear! What made you feel that way?",
232
+ "It’s amazing to experience moments like that.",
233
+ "Sounds like something truly meaningful."
234
+ ],
235
+ "happiness": [
236
+ "That's awesome! What’s bringing you joy today?",
237
+ "I love hearing good news. 😊",
238
+ "Yay! Want to share more about it?"
239
+ ],
240
+ "neutral": [
241
+ "Got it. I’m here if you want to dive deeper.",
242
+ "Thanks for sharing that. Tell me more if you’d like.",
243
+ "I’m listening. How else can I support you?"
244
+ ]
245
+ }
246
+
247
+ # Suggestions
248
+ relaxation_resources = {
249
+ "exercise": "Try this 5-4-3-2-1 grounding method:\n- 5 things you see\n- 4 you can touch\n- 3 you hear\n- 2 you smell\n- 1 you taste",
250
+ "video": "Here’s a short calming video that might help: https://youtu.be/O-6f5wQXSu8"
251
  }
252
 
253
+ # Keywords
254
+ help_keywords = ["suggest", "help", "calm", "exercise", "relax", "how can i", "any tips", "can u", "can you"]
255
+ negative_inputs = ["not good", "feel bad", "feel sad", "anxious", "depressed", "upset", "feel like shit", "stress", "worried"]
256
+ thank_you_inputs = ["thank", "thanks", "thank you"]
257
+ bye_inputs = ["bye", "goodbye", "see you", "take care", "ok bye", "exit", "quit"]
258
+
259
+ # Conversation state
260
+ awaiting_tip_type = False
261
+
262
+ # Correct spelling
263
def correct_spelling(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain ``str``."""
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)
265
+
266
+ # Get response
267
def _mentions_phrase(text, phrases):
    """Return True when any of ``phrases`` occurs in ``text`` as whole word(s)."""
    if not phrases:
        return False
    pattern = r"\b(?:" + "|".join(re.escape(phrase) for phrase in phrases) + r")\b"
    return re.search(pattern, text) is not None


def get_response(emotion, user_input):
    """Pick the bot's reply for one user message.

    Args:
        emotion: Predicted emotion label for the message.
        user_input: The user's message text.

    Returns:
        A ``(reply, should_exit)`` tuple; ``should_exit`` is True only for a
        goodbye message.

    Side effects:
        Reads and updates the module-level ``awaiting_tip_type`` flag.
    """
    global awaiting_tip_type
    user_input_lower = user_input.lower()

    # Goodbye/thanks use whole-word matching: with plain substring tests,
    # "quite" contains "quit" and would end the conversation.
    if _mentions_phrase(user_input_lower, bye_inputs):
        return "Take care! I’m here whenever you want to talk. 🌿", True

    if _mentions_phrase(user_input_lower, thank_you_inputs):
        return "You're most welcome! I'm really glad I could support you. 💙", False

    # Awaiting video vs exercise clarification
    if awaiting_tip_type:
        if "video" in user_input_lower:
            awaiting_tip_type = False
            return relaxation_resources["video"], False
        if "exercise" in user_input_lower or "excercise" in user_input_lower or "breathe" in user_input_lower:
            awaiting_tip_type = False
            return relaxation_resources["exercise"], False
        return "Just checking — would you prefer a calming video or a simple breathing exercise?", False

    # Offer relaxation suggestions. Substring matching is kept here on purpose:
    # the keywords act as stems ("suggest" -> "suggestion", "relax" -> "relaxing").
    if any(kw in user_input_lower for kw in help_keywords):
        awaiting_tip_type = True
        return "Would you prefer a short calming video or a simple breathing exercise?", False

    # Default: emotional response; labels without an entry fall back to "neutral".
    # NOTE(review): `responses` has a "happiness" key; if the label encoder emits
    # "joy" instead, those predictions land in the neutral fallback — confirm.
    if emotion in responses:
        return random.choice(responses[emotion]), False
    return random.choice(responses["neutral"]), False
298
+
299
+ # Main chatbot loop
300
+ print("EmotiBot 🌿: Hi! How are you feeling today? (Type 'exit' to quit)")
301
+
302
while True:
    user_input_raw = input("You: ").strip()
    if not user_input_raw:
        # A blank line would be encoded as nothing but <PAD> tokens, leaving the
        # model with no real token to attend to; just prompt again.
        continue
    user_input = correct_spelling(user_input_raw)

    if user_input.lower() in ['exit', 'quit']:
        print("EmotiBot 🌿: Take care! I’m here whenever you want to talk.")
        break

    # Emotion prediction
    if any(phrase in user_input.lower() for phrase in negative_inputs):
        # Explicit negative phrases short-circuit the model.
        pred_emotion = "sadness"
    else:
        x = preprocess_input(user_input)
        # train() keeps dropout active on purpose: the five stochastic forward
        # passes below are averaged into one probability estimate.
        model.train()
        with torch.no_grad():
            probs = torch.stack([F.softmax(model(x), dim=1) for _ in range(5)])
        avg_probs = probs.mean(dim=0)
        pred_idx = torch.argmax(avg_probs, dim=1).item()
        pred_emotion = le.classes_[pred_idx]

    # Generate response
    reply, should_exit = get_response(pred_emotion, user_input)
    print(f"EmotiBot 🌿: {reply}")
    if should_exit:
        break
327
+