hansche committed on
Commit
dd83f16
·
verified ·
1 Parent(s): 155e783

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -438
app.py DELETED
@@ -1,438 +0,0 @@
1
- import streamlit as st
2
- st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")
3
-
4
- import pandas as pd
5
- import matplotlib.pyplot as plt
6
- import seaborn as sns
7
- import preprocessor, helper
8
- from sentiment import predict_sentiment_batch
9
- import os
10
- os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"
11
-
12
- # Theme customization
13
- st.markdown(
14
- """
15
- <style>
16
- .main {background-color: #f0f2f6;}
17
- </style>
18
- """,
19
- unsafe_allow_html=True
20
- )
21
-
22
- # Set seaborn style
23
- sns.set_theme(style="whitegrid")
24
-
25
- st.title("📊 WhatsApp Chat Sentiment Analysis Dashboard")
26
- st.subheader('Instructions')
27
- st.markdown("1. Open the sidebar and upload your WhatsApp chat file in .txt format.")
28
- st.markdown("2. Wait for the initial processing (minimal delay).")
29
- st.markdown("3. Customize the analysis by selecting users or filters.")
30
- st.markdown("4. Click 'Show Analysis' for detailed results.")
31
-
32
- st.sidebar.title("Whatsapp Chat Analyzer")
33
- uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")
34
-
35
- @st.cache_data
36
- def load_and_preprocess(file_content):
37
- return preprocessor.preprocess(file_content)
38
-
39
- if uploaded_file is not None:
40
- raw_data = uploaded_file.read().decode("utf-8")
41
- with st.spinner("Loading chat data..."):
42
- df, _ = load_and_preprocess(raw_data)
43
- st.session_state.df = df
44
-
45
- st.sidebar.header("🔍 Filters")
46
- user_list = ["Overall"] + sorted(df["user"].unique().tolist())
47
- selected_user = st.sidebar.selectbox("Select User", user_list)
48
-
49
- df_filtered = df if selected_user == "Overall" else df[df["user"] == selected_user]
50
-
51
- if st.sidebar.button("Show Analysis"):
52
- if df_filtered.empty:
53
- st.warning(f"No data found for user: {selected_user}")
54
- else:
55
- with st.spinner("Analyzing..."):
56
- if 'sentiment' not in df_filtered.columns:
57
- try:
58
- print("Starting sentiment analysis...")
59
- # Get messages as clean strings
60
- message_list = df_filtered["message"].astype(str).tolist()
61
- message_list = [msg for msg in message_list if msg.strip()]
62
-
63
- print(f"Processing {len(message_list)} messages")
64
- print(f"Sample messages: {message_list[:5]}")
65
-
66
- # Directly call the sentiment analysis function
67
- df_filtered['sentiment'] = predict_sentiment_batch(message_list)
68
- print("Sentiment analysis completed successfully")
69
-
70
- except Exception as e:
71
- st.error(f"Sentiment analysis failed: {str(e)}")
72
- print(f"Full error: {str(e)}")
73
-
74
- st.session_state.df_filtered = df_filtered
75
- else:
76
- st.session_state.df_filtered = df_filtered
77
-
78
- # Display statistics and visualizations
79
- num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)
80
- st.title("Top Statistics")
81
- col1, col2, col3, col4 = st.columns(4)
82
- with col1:
83
- st.header("Total Messages")
84
- st.title(num_messages)
85
- with col2:
86
- st.header("Total Words")
87
- st.title(words)
88
- with col3:
89
- st.header("Media Shared")
90
- st.title(num_media)
91
- with col4:
92
- st.header("Links Shared")
93
- st.title(num_links)
94
-
95
- st.title("Monthly Timeline")
96
- timeline = helper.monthly_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
97
- if not timeline.empty:
98
- plt.figure(figsize=(10, 5))
99
- sns.lineplot(data=timeline, x='time', y='message', color='green')
100
- plt.title("Monthly Timeline")
101
- plt.xlabel("Date")
102
- plt.ylabel("Messages")
103
- st.pyplot(plt)
104
- plt.clf()
105
-
106
- st.title("Daily Timeline")
107
- daily_timeline = helper.daily_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
108
- if not daily_timeline.empty:
109
- plt.figure(figsize=(10, 5))
110
- sns.lineplot(data=daily_timeline, x='date', y='message', color='black')
111
- plt.title("Daily Timeline")
112
- plt.xlabel("Date")
113
- plt.ylabel("Messages")
114
- st.pyplot(plt)
115
- plt.clf()
116
-
117
- st.title("Activity Map")
118
- col1, col2 = st.columns(2)
119
- with col1:
120
- st.header("Most Busy Day")
121
- busy_day = helper.week_activity_map(selected_user, df_filtered)
122
- if not busy_day.empty:
123
- plt.figure(figsize=(10, 5))
124
- sns.barplot(x=busy_day.index, y=busy_day.values, palette="Purples_r")
125
- plt.title("Most Busy Day")
126
- plt.xlabel("Day of Week")
127
- plt.ylabel("Message Count")
128
- st.pyplot(plt)
129
- plt.clf()
130
- with col2:
131
- st.header("Most Busy Month")
132
- busy_month = helper.month_activity_map(selected_user, df_filtered)
133
- if not busy_month.empty:
134
- plt.figure(figsize=(10, 5))
135
- sns.barplot(x=busy_month.index, y=busy_month.values, palette="Oranges_r")
136
- plt.title("Most Busy Month")
137
- plt.xlabel("Month")
138
- plt.ylabel("Message Count")
139
- st.pyplot(plt)
140
- plt.clf()
141
-
142
- if selected_user == 'Overall':
143
- st.title("Most Busy Users")
144
- x, new_df = helper.most_busy_users(df_filtered)
145
- if not x.empty:
146
- plt.figure(figsize=(10, 5))
147
- sns.barplot(x=x.index, y=x.values, palette="Reds_r")
148
- plt.title("Most Busy Users")
149
- plt.xlabel("User")
150
- plt.ylabel("Message Count")
151
- plt.xticks(rotation=45)
152
- st.pyplot(plt)
153
- st.title("Word Count by User")
154
- plt.clf()
155
- st.dataframe(new_df)
156
-
157
- # Most common words analysis
158
- st.title("Most Common Words")
159
- most_common_df = helper.most_common_words(selected_user, df_filtered)
160
- if not most_common_df.empty:
161
- fig, ax = plt.subplots(figsize=(10, 6))
162
- sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
163
- ax.set_title("Top 20 Most Common Words")
164
- ax.set_xlabel("Frequency")
165
- ax.set_ylabel("Words")
166
- plt.xticks(rotation='vertical')
167
- st.pyplot(fig)
168
- plt.clf()
169
- else:
170
- st.warning("No data available for most common words.")
171
-
172
- # Emoji analysis
173
- st.title("Emoji Analysis")
174
- emoji_df = helper.emoji_helper(selected_user, df_filtered)
175
- if not emoji_df.empty:
176
- col1, col2 = st.columns(2)
177
-
178
- with col1:
179
- st.subheader("Top Emojis Used")
180
- st.dataframe(emoji_df)
181
-
182
- with col2:
183
- fig, ax = plt.subplots(figsize=(8, 8))
184
- ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(),
185
- autopct="%0.2f%%", startangle=90,
186
- colors=sns.color_palette("pastel"))
187
- ax.set_title("Top Emoji Distribution")
188
- st.pyplot(fig)
189
- plt.clf()
190
- else:
191
- st.warning("No data available for emoji analysis.")
192
-
193
- # Sentiment Analysis Visualizations
194
- st.title("📈 Sentiment Analysis")
195
-
196
- # Convert month names to abbreviated format
197
- month_map = {
198
- 'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
199
- 'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
200
- 'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
201
- }
202
- df_filtered['month'] = df_filtered['month'].map(month_map)
203
-
204
- # Group by month and sentiment
205
- monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0)
206
-
207
- # Plotting: Histogram (Bar Chart) for each sentiment
208
- st.write("### Sentiment Count by Month (Histogram)")
209
-
210
- # Create a figure with subplots for each sentiment
211
- fig, axes = plt.subplots(1, 3, figsize=(18, 5))
212
-
213
- # Plot Positive Sentiment
214
- if 'positive' in monthly_sentiment:
215
- axes[0].bar(monthly_sentiment.index, monthly_sentiment['positive'], color='green')
216
- axes[0].set_title('Positive Sentiment')
217
- axes[0].set_xlabel('Month')
218
- axes[0].set_ylabel('Count')
219
-
220
- # Plot Neutral Sentiment
221
- if 'neutral' in monthly_sentiment:
222
- axes[1].bar(monthly_sentiment.index, monthly_sentiment['neutral'], color='blue')
223
- axes[1].set_title('Neutral Sentiment')
224
- axes[1].set_xlabel('Month')
225
- axes[1].set_ylabel('Count')
226
-
227
- # Plot Negative Sentiment
228
- if 'negative' in monthly_sentiment:
229
- axes[2].bar(monthly_sentiment.index, monthly_sentiment['negative'], color='red')
230
- axes[2].set_title('Negative Sentiment')
231
- axes[2].set_xlabel('Month')
232
- axes[2].set_ylabel('Count')
233
-
234
- # Display the plots in Streamlit
235
- st.pyplot(fig)
236
- plt.clf()
237
-
238
- # Count sentiments per day of the week
239
- sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)
240
-
241
- # Sort days correctly
242
- day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
243
- sentiment_counts = sentiment_counts.reindex(day_order)
244
-
245
- # Daily Sentiment Analysis
246
- st.write("### Daily Sentiment Analysis")
247
-
248
- # Create a Matplotlib figure
249
- fig, ax = plt.subplots(figsize=(10, 5))
250
- sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=['red', 'blue', 'green'])
251
-
252
- # Customize the plot
253
- ax.set_xlabel("Day of the Week")
254
- ax.set_ylabel("Count")
255
- ax.set_title("Sentiment Distribution per Day of the Week")
256
- ax.legend(title="Sentiment")
257
-
258
- # Display the plot in Streamlit
259
- st.pyplot(fig)
260
- plt.clf()
261
-
262
- # Count messages per user per sentiment (only for Overall view)
263
- if selected_user == 'Overall':
264
- sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')
265
-
266
- # Calculate total messages per sentiment
267
- total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict()
268
-
269
- # Add percentage column
270
- sentiment_counts['Percentage'] = sentiment_counts.apply(
271
- lambda row: (row['Count'] / total_per_sentiment[row['sentiment']]) * 100, axis=1
272
- )
273
-
274
- # Separate tables for each sentiment
275
- positive_df = sentiment_counts[sentiment_counts['sentiment'] == 'positive'].sort_values(by='Count', ascending=False).head(10)
276
- neutral_df = sentiment_counts[sentiment_counts['sentiment'] == 'neutral'].sort_values(by='Count', ascending=False).head(10)
277
- negative_df = sentiment_counts[sentiment_counts['sentiment'] == 'negative'].sort_values(by='Count', ascending=False).head(10)
278
-
279
- # Sentiment Contribution Analysis
280
- st.write("### Sentiment Contribution by User")
281
-
282
- # Create three columns for side-by-side display
283
- col1, col2, col3 = st.columns(3)
284
-
285
- # Display Positive Table
286
- with col1:
287
- st.subheader("Top Positive Contributors")
288
- if not positive_df.empty:
289
- st.dataframe(positive_df[['user', 'Count', 'Percentage']])
290
- else:
291
- st.warning("No positive sentiment data")
292
-
293
- # Display Neutral Table
294
- with col2:
295
- st.subheader("Top Neutral Contributors")
296
- if not neutral_df.empty:
297
- st.dataframe(neutral_df[['user', 'Count', 'Percentage']])
298
- else:
299
- st.warning("No neutral sentiment data")
300
-
301
- # Display Negative Table
302
- with col3:
303
- st.subheader("Top Negative Contributors")
304
- if not negative_df.empty:
305
- st.dataframe(negative_df[['user', 'Count', 'Percentage']])
306
- else:
307
- st.warning("No negative sentiment data")
308
-
309
- # Topic Analysis Section
310
- st.title("🔍 Area of Focus: Topic Analysis")
311
-
312
- # Check if topic column exists, otherwise perform topic modeling
313
- # if 'topic' not in df_filtered.columns:
314
- # with st.spinner("Performing topic modeling..."):
315
- # try:
316
- # # Add topic modeling here or ensure your helper functions handle it
317
- # df_filtered = helper.perform_topic_modeling(df_filtered)
318
- # except Exception as e:
319
- # st.error(f"Topic modeling failed: {str(e)}")
320
- # st.stop()
321
-
322
- # Plot Topic Distribution
323
- st.header("Topic Distribution")
324
- try:
325
- fig = helper.plot_topic_distribution(df_filtered)
326
- st.pyplot(fig)
327
- plt.clf()
328
- except Exception as e:
329
- st.warning(f"Could not display topic distribution: {str(e)}")
330
-
331
- # Display Sample Messages for Each Topic
332
- st.header("Sample Messages for Each Topic")
333
- if 'topic' in df_filtered.columns:
334
- for topic_id in sorted(df_filtered['topic'].unique()):
335
- st.subheader(f"Topic {topic_id}")
336
-
337
- # Get messages for the current topic
338
- filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']
339
-
340
- # Determine sample size
341
- sample_size = min(5, len(filtered_messages))
342
-
343
- if sample_size > 0:
344
- sample_messages = filtered_messages.sample(sample_size, replace=False).tolist()
345
- for msg in sample_messages:
346
- st.write(f"- {msg}")
347
- else:
348
- st.write("No messages available for this topic.")
349
- else:
350
- st.warning("Topic information not available")
351
-
352
- # Topic Distribution Over Time
353
- st.header("📅 Topic Trends Over Time")
354
-
355
- # Add time frequency selector
356
- time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')
357
-
358
- # Plot topic trends
359
- try:
360
- freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
361
- topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq])
362
-
363
- # Choose between static and interactive plot
364
- use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')
365
-
366
- if use_plotly:
367
- fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
368
- st.plotly_chart(fig, use_container_width=True)
369
- else:
370
- fig = helper.plot_topic_distribution_over_time(topic_distribution)
371
- st.pyplot(fig)
372
- plt.clf()
373
- except Exception as e:
374
- st.warning(f"Could not display topic trends: {str(e)}")
375
-
376
- # Clustering Analysis Section
377
- st.title("🧩 Conversation Clusters")
378
-
379
- # Number of clusters input
380
- n_clusters = st.slider("Select number of clusters",
381
- min_value=2,
382
- max_value=10,
383
- value=5,
384
- key='n_clusters')
385
-
386
- # Perform clustering
387
- with st.spinner("Analyzing conversation clusters..."):
388
- try:
389
- df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters)
390
-
391
- # Plot clusters
392
- st.header("Cluster Visualization")
393
- fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
394
- st.pyplot(fig)
395
- plt.clf()
396
-
397
- # Cluster Insights
398
- st.header("📌 Cluster Insights")
399
-
400
- # 1. Dominant Conversation Themes
401
- st.subheader("1. Dominant Themes")
402
- cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
403
- for cluster_id, label in cluster_labels.items():
404
- st.write(f"**Cluster {cluster_id}**: {label}")
405
-
406
- # 2. Temporal Patterns
407
- st.subheader("2. Temporal Patterns")
408
- temporal_trends = helper.get_temporal_trends(df_clustered)
409
- for cluster_id, trend in temporal_trends.items():
410
- st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")
411
-
412
- # 3. User Contributions
413
- if selected_user == 'Overall':
414
- st.subheader("3. Top Contributors")
415
- user_contributions = helper.get_user_contributions(df_clustered)
416
- for cluster_id, users in user_contributions.items():
417
- st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")
418
-
419
- # 4. Sentiment by Cluster
420
- st.subheader("4. Sentiment Analysis")
421
- sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
422
- for cluster_id, sentiment in sentiment_by_cluster.items():
423
- st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")
424
-
425
- # Sample messages from each cluster
426
- st.subheader("Sample Messages")
427
- for cluster_id in sorted(df_clustered['cluster'].unique()):
428
- with st.expander(f"Cluster {cluster_id} Messages"):
429
- cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
430
- sample_size = min(3, len(cluster_msgs))
431
- if sample_size > 0:
432
- for msg in cluster_msgs.sample(sample_size, replace=False):
433
- st.write(f"- {msg}")
434
- else:
435
- st.write("No messages available")
436
-
437
- except Exception as e:
438
- st.error(f"Clustering failed: {str(e)}")