hansche commited on
Commit
bcbbed8
·
verified ·
1 Parent(s): 14e4130

Create helper.py

Browse files
Files changed (1) hide show
  1. helper.py +323 -0
helper.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urlextract import URLExtract
2
+ from wordcloud import WordCloud
3
+ import pandas as pd
4
+ from collections import Counter
5
+ import emoji
6
+ import plotly.express as px
7
+ import matplotlib.pyplot as plt
8
+ import seaborn as sns
9
+
# Module-level URL extractor, shared by fetch_stats() for link counting.
extract = URLExtract()
11
+
12
def fetch_stats(selected_user, df):
    """Return (messages, words, media messages, links) for one user or all.

    When *selected_user* is 'Overall' the whole frame is analysed; otherwise
    the frame is first filtered to that user's rows.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    num_messages = len(df)

    words = 0
    for message in df['message']:
        words += len(message.split())

    # Media placeholders survive only in the raw (unfiltered) column.
    num_media_messages = len(df[df['unfiltered_messages'] == '<media omitted>\n'])

    links = 0
    for message in df['unfiltered_messages']:
        links += len(extract.find_urls(message))

    return num_messages, words, num_media_messages, links
20
+
21
def most_busy_users(df):
    """Return the five chattiest users and a per-user percentage breakdown.

    Returns:
        x: Series with the five largest per-user message counts.
        pct_df: DataFrame with columns 'Name' (the user) and 'percentage'
            (their share of all messages, rounded to 2 decimals).
    """
    counts = df['user'].value_counts()
    x = counts.head()
    # Build the frame explicitly: the previous
    # rename({'index': 'percentage', 'user': 'Name'}) put the user names in
    # the "percentage" column (and vice versa), and broke on pandas >= 2.0
    # where reset_index() names the columns 'user'/'count'.
    pct = round((counts / df.shape[0]) * 100, 2)
    pct_df = pd.DataFrame({'Name': pct.index, 'percentage': pct.values})
    return x, pct_df
26
+
27
def create_wordcloud(selected_user, df):
    """Render a WordCloud image from a user's (or the whole chat's) messages."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Drop system notifications and media placeholders before rendering.
    temp = df[df['user'] != 'group_notification']
    media_mask = temp['message'].str.lower().str.contains('<media omitted>')
    temp = temp[~media_mask]

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    return wc.generate(temp['message'].str.cat(sep=" "))
35
+
36
def most_common_words(selected_user, df):
    """Top-20 words (lowercased, whitespace-split) as a 2-column DataFrame.

    NOTE(review): this name is re-defined later in the module, so callers
    actually get the later (stop-word-filtering) version.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    tally = Counter()
    for message in temp['message']:
        tally.update(message.lower().split())
    return pd.DataFrame(tally.most_common(20))
43
+
44
def emoji_helper(selected_user, df):
    """Count every emoji character used; DataFrame of (emoji, count) rows.

    NOTE(review): shadowed by a later re-definition of the same name.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    tally = Counter(
        ch
        for message in df['unfiltered_messages']
        for ch in message
        if ch in emoji.EMOJI_DATA
    )
    return pd.DataFrame(tally.most_common(len(tally)))
49
+
50
def monthly_timeline(selected_user, df):
    """Message counts per (year, month) plus a combined 'time' label column."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    timeline = df.groupby(['year', 'month']).count()['message'].reset_index()
    # e.g. "January-2024"; assumes 'month' holds the month name as a string.
    timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
    return timeline
56
+
57
def daily_timeline(selected_user, df):
    """Per-day message counts as a DataFrame with 'date' and 'message' columns."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    per_day = df.groupby('date').count()['message']
    return per_day.reset_index()
61
+
62
def week_activity_map(selected_user, df):
    """Messages per weekday name (value_counts: most active day first)."""
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return scoped['day_of_week'].value_counts()
66
+
67
def month_activity_map(selected_user, df):
    """Messages per month name (value_counts: most active month first)."""
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return scoped['month'].value_counts()
71
+
72
def plot_topic_distribution(df):
    """Bar chart (Plotly) of how many messages fall in each topic.

    NOTE(review): shadowed by a later matplotlib re-definition of this name.
    """
    topic_counts = df['topic'].value_counts().sort_index()
    # 'viridis' is a colorscale NAME, not a CSS color, so it is not a valid
    # entry for color_discrete_sequence; use the actual Viridis palette.
    fig = px.bar(
        x=topic_counts.index,
        y=topic_counts.values,
        title="Topic Distribution",
        color_discrete_sequence=px.colors.sequential.Viridis,
    )
    return fig
76
+
77
def topic_distribution_over_time(df, time_freq='M'):
    """Count messages per (time period, topic).

    Args:
        df: frame with a datetime 'date' column and a 'topic' column.
        time_freq: pandas period alias ('M' monthly, 'W' weekly, ...).

    Returns:
        DataFrame indexed by Period ('time_period') with one column per topic.

    NOTE(review): this name is re-defined later in the module with the same
    behavior; the later copy is the one callers get.
    """
    # Group by a derived Series instead of writing df['time_period'] = ...,
    # so the caller's frame is not mutated as a side effect.
    periods = df['date'].dt.to_period(time_freq).rename('time_period')
    return df.groupby([periods, 'topic']).size().unstack(fill_value=0)
80
+
81
def plot_topic_distribution_over_time_plotly(topic_distribution):
    """Interactive Plotly line chart: one trace per topic over time.

    Expects a frame indexed by pandas Period with one column per topic.
    NOTE(review): shadowed by a later re-definition with the same behavior.
    """
    long_form = topic_distribution.reset_index()
    long_form['time_period'] = long_form['time_period'].dt.to_timestamp()
    long_form = long_form.melt(id_vars='time_period', var_name='topic', value_name='count')

    fig = px.line(long_form, x='time_period', y='count', color='topic',
                  title="Topic Distribution Over Time")
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig
88
+
89
def plot_clusters(reduced_features, clusters):
    """2-D Plotly scatter of t-SNE-reduced messages, coloured by cluster.

    NOTE(review): shadowed by a later matplotlib re-definition of this name.
    """
    xs = reduced_features[:, 0]
    ys = reduced_features[:, 1]
    return px.scatter(x=xs, y=ys, color=clusters, title="Message Clusters (t-SNE)")
92
def most_common_words(selected_user, df):
    """Top-20 non-stop-words for a user (or the whole chat) as a DataFrame.

    Stop words are loaded from 'stop_hinglish.txt' when it exists; if the
    file is missing, no filtering is applied.

    Bug fixed: the previous code left the file open commented out and set
    `stop_words = df`, so `word not in stop_words` tested each word against
    the DataFrame's COLUMN NAMES instead of a stop-word list.
    """
    try:
        with open('stop_hinglish.txt', 'r', encoding='utf-8') as f:
            stop_words = set(f.read().split())
    except OSError:
        stop_words = set()  # best effort when the stop-word file is absent

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    return pd.DataFrame(Counter(words).most_common(20))
111
+
112
def emoji_helper(selected_user, df):
    """DataFrame of (emoji, count) over the raw messages, most used first."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    tally = Counter()
    for message in df['unfiltered_messages']:
        tally.update(ch for ch in message if ch in emoji.EMOJI_DATA)

    return pd.DataFrame(tally.most_common(len(tally)))
123
def plot_topic_distribution(df):
    """Bar chart (matplotlib/seaborn) of message counts per topic."""
    counts = df['topic'].value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=counts.index, y=counts.values, ax=ax, palette="viridis")
    ax.set(title="Topic Distribution",
           xlabel="Topic",
           ylabel="Number of Messages")
    return fig
134
+
135
def most_frequent_keywords(messages, top_n=10):
    """Return the top_n (word, count) pairs across all messages.

    Words are whitespace-split and counted case-sensitively.
    """
    freq = Counter()
    for msg in messages:
        freq.update(msg.split())
    return freq.most_common(top_n)
142
def plot_topic_distribution_over_time(topic_distribution):
    """Line chart (matplotlib) of per-topic message counts over time.

    Expects a frame indexed by pandas Period with one column per topic.
    NOTE(review): re-defined again further down the module; the later,
    identical copy is the one callers get.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # One line per topic, sharing the same timestamp axis.
    timestamps = topic_distribution.index.to_timestamp()
    for topic in topic_distribution.columns:
        ax.plot(timestamps, topic_distribution[topic], label=f"Topic {topic}")

    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig
159
+
160
def plot_most_frequent_keywords(keywords):
    """Horizontal bar chart of (word, count) pairs.

    Accepts output of most_frequent_keywords(); raises ValueError if the
    list is empty (zip unpacking).
    """
    words, counts = zip(*keywords)
    fig, ax = plt.subplots()
    sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis")
    ax.set(title="Most Frequent Keywords",
           xlabel="Frequency",
           ylabel="Keyword")
    return fig
171
def topic_distribution_over_time(df, time_freq='M'):
    """Analyze the distribution of topics over time.

    Args:
        df: frame with a datetime 'date' column and a 'topic' column.
        time_freq: pandas period alias ('M' monthly, 'W' weekly, ...).

    Returns:
        DataFrame indexed by Period ('time_period'), one column per topic.
    """
    # Group by a derived Series rather than assigning df['time_period'],
    # which mutated the caller's DataFrame as a side effect.
    periods = df['date'].dt.to_period(time_freq).rename('time_period')
    topic_distribution = df.groupby([periods, 'topic']).size().unstack(fill_value=0)
    return topic_distribution
179
+
180
def plot_topic_distribution_over_time(topic_distribution):
    """Plot per-topic message counts over time as one matplotlib line each.

    The input must be Period-indexed (as built by topic_distribution_over_time);
    the index is converted to timestamps for the x axis.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    x_axis = topic_distribution.index.to_timestamp()
    for column in topic_distribution.columns:
        ax.plot(x_axis, topic_distribution[column], label=f"Topic {column}")

    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    # Legend outside the axes so it never covers the lines.
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig
197
+
198
def plot_topic_distribution_over_time_plotly(topic_distribution):
    """Plot the topic-over-time distribution as an interactive Plotly chart.

    Melts the wide (period x topic) frame into long form, then draws one
    coloured line per topic.
    """
    melted = topic_distribution.reset_index()
    melted['time_period'] = melted['time_period'].dt.to_timestamp()
    melted = melted.melt(id_vars='time_period', var_name='topic', value_name='count')

    fig = px.line(
        melted, x='time_period', y='count', color='topic',
        title="Topic Distribution Over Time",
        labels={'time_period': 'Time Period', 'count': 'Number of Messages'},
    )
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig
210
def plot_clusters(reduced_features, clusters):
    """Visualize clusters using t-SNE coordinates (seaborn scatter).

    Args:
        reduced_features: 2-D array of reduced features, shape (n, 2).
        clusters: per-point cluster labels (drives the hue).

    Returns:
        The current matplotlib Figure.
    """
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        x=reduced_features[:, 0],
        y=reduced_features[:, 1],
        hue=clusters,
        palette="viridis",
        legend="full",
    )
    plt.title("Message Clusters (t-SNE Visualization)")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.tight_layout()
    return plt.gcf()
232
def get_cluster_labels(df, n_clusters):
    """Label each cluster with its three highest-TF-IDF keywords.

    Returns:
        dict {cluster_id: "kw1, kw2, kw3"}, or "No dominant theme" for an
        empty cluster.

    NOTE(review): tfidf_matrix is row-indexed with df's index LABELS, which
    is only correct if df has a default RangeIndex — verify upstream.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import numpy as np

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
    vocab = vectorizer.get_feature_names_out()

    cluster_labels = {}
    for cluster_id in range(n_clusters):
        member_rows = df[df['cluster'] == cluster_id].index
        if len(member_rows) == 0:
            cluster_labels[cluster_id] = "No dominant theme"
            continue
        # Sum TF-IDF weight per term over the cluster, take the top 3.
        term_weights = tfidf_matrix[member_rows].sum(axis=0).A1
        top_keywords = np.argsort(term_weights)[-3:][::-1]
        cluster_labels[cluster_id] = ", ".join(vocab[top_keywords])
    return cluster_labels
252
+
253
def get_temporal_trends(df):
    """Peak weekday and hour for every cluster.

    Returns:
        dict {cluster_id: {"peak_day": <mode weekday>, "peak_time": "H:00"}}.
    """
    temporal_trends = {}
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        temporal_trends[cluster_id] = {
            "peak_day": members['day_of_week'].mode()[0],
            "peak_time": f"{members['hour'].mode()[0]}:00",
        }
    return temporal_trends
265
+
266
def get_user_contributions(df):
    """Top three message senders per cluster.

    Returns:
        dict {cluster_id: [user, ...]} (at most three users each).
    """
    user_contributions = {}
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        user_contributions[cluster_id] = (
            members['user'].value_counts().head(3).index.tolist()
        )
    return user_contributions
277
+
278
def get_sentiment_by_cluster(df):
    """Percentage of positive/neutral/negative messages per cluster.

    Percentages are rounded to whole numbers, so they may not sum to 100.
    """
    sentiment_by_cluster = {}
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        shares = members['sentiment'].value_counts(normalize=True) * 100
        sentiment_by_cluster[cluster_id] = {
            label: round(shares.get(label, 0))
            for label in ('positive', 'neutral', 'negative')
        }
    return sentiment_by_cluster
293
+
294
def detect_anomalies(df):
    """Flag clusters dominated (>50%) by link or media messages.

    The link check runs first, so a cluster heavy in both is reported for
    links only. Returns {cluster_id: human-readable message}.
    """
    anomalies = {}
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        link_share = members['message'].str.contains('http').mean() * 100
        media_share = members['message'].str.contains('<media omitted>').mean() * 100
        if link_share > 50:
            anomalies[cluster_id] = f"{round(link_share)}% of messages contain links."
        elif media_share > 50:
            anomalies[cluster_id] = f"{round(media_share)}% of messages are media files."
    return anomalies
309
+
310
def generate_recommendations(df):
    """Actionable suggestions per cluster from sentiment and link share.

    Rules:
      - more than 50% negative sentiment -> suggest revisiting feedback;
      - more than half the messages contain links -> suggest pinning them.
    """
    recommendations = []
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        sentiment_pct = members['sentiment'].value_counts(normalize=True) * 100
        if sentiment_pct.get('negative', 0) > 50:
            recommendations.append(
                f"Address negative sentiment in Cluster {cluster_id} by revisiting feedback processes.")
        if members['message'].str.contains('http').mean() > 0.5:
            recommendations.append(
                f"Pin resources from Cluster {cluster_id} (most-shared links) for easy access.")
    return recommendations
+ return recommendations