| from urlextract import URLExtract |
| from wordcloud import WordCloud |
| import pandas as pd |
| from collections import Counter |
| import emoji |
| import plotly.express as px |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
|
|
| extract = URLExtract() |
|
|
def fetch_stats(selected_user, df):
    """Compute headline chat statistics for one user or the whole chat.

    Returns a 4-tuple: (message count, total word count, media-message count,
    shared-link count).
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    num_messages = len(df)

    words = 0
    for msg in df['message']:
        words += len(msg.split())

    # Media placeholders are matched exactly (including the trailing newline)
    # against the unfiltered message column.
    num_media_messages = (df['unfiltered_messages'] == '<media omitted>\n').sum()

    links = 0
    for msg in df['unfiltered_messages']:
        links += len(extract.find_urls(msg))

    return num_messages, words, num_media_messages, links
|
|
def most_busy_users(df):
    """Return the most active users and each user's percentage share.

    Args:
        df: chat DataFrame with a 'user' column.

    Returns:
        x: Series of the five most active users (user -> message count).
        pct_df: DataFrame with columns ['Name', 'percentage'] for all users.

    Fix: the original renamed value_counts().reset_index() columns with
    {'index': 'percentage', 'user': 'Name'}, which (a) swapped the labels —
    user names ended up under 'percentage' and the shares under 'Name' — and
    (b) breaks on pandas >= 2.0 where reset_index() yields ['user', 'count'].
    Building the frame explicitly is correct on every pandas version.
    """
    counts = df['user'].value_counts()
    x = counts.head()
    pct_df = pd.DataFrame({
        'Name': counts.index,
        'percentage': round((counts / df.shape[0]) * 100, 2).values,
    })
    return x, pct_df
|
|
def create_wordcloud(selected_user, df):
    """Render a word cloud from the selected user's (or everyone's) messages."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Exclude system notifications and media placeholder rows from the cloud.
    temp = df[df['user'] != 'group_notification']
    media_mask = temp['message'].str.lower().str.contains('<media omitted>')
    temp = temp[~media_mask]

    cloud = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    return cloud.generate(temp['message'].str.cat(sep=" "))
|
|
def most_common_words(selected_user, df):
    """Return a DataFrame of the 20 most frequent words for the selection."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Skip system notifications and media placeholder rows.
    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    counter = Counter()
    for msg in temp['message']:
        counter.update(msg.lower().split())
    return pd.DataFrame(counter.most_common(20))
|
|
def emoji_helper(selected_user, df):
    """Tally every emoji used by the selected user (or the whole chat).

    Args:
        selected_user: user name, or 'Overall' for all users.
        df: chat DataFrame with an 'unfiltered_messages' column.

    Returns:
        DataFrame of (emoji, count) rows, most frequent first.

    Fix: the original built the Counter twice — once for most_common() and
    once just to compute its length; most_common() with no argument already
    returns every entry.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    counts = Counter(
        c for msg in df['unfiltered_messages'] for c in msg if c in emoji.EMOJI_DATA
    )
    return pd.DataFrame(counts.most_common())
|
|
def monthly_timeline(selected_user, df):
    """Aggregate message counts per (year, month), labelled 'Month-YYYY'."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    timeline = df.groupby(['year', 'month']).count()['message'].reset_index()
    timeline['time'] = [f"{m}-{y}" for m, y in zip(timeline['month'], timeline['year'])]
    return timeline
|
|
def daily_timeline(selected_user, df):
    """Return per-day message counts for the selected user (or everyone)."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    daily_counts = df.groupby('date').count()['message']
    return daily_counts.reset_index()
|
|
def week_activity_map(selected_user, df):
    """Count messages per weekday for the selected user (or everyone)."""
    if selected_user == 'Overall':
        return df['day_of_week'].value_counts()
    return df[df['user'] == selected_user]['day_of_week'].value_counts()
|
|
def month_activity_map(selected_user, df):
    """Count messages per month name for the selected user (or everyone)."""
    if selected_user == 'Overall':
        return df['month'].value_counts()
    return df[df['user'] == selected_user]['month'].value_counts()
|
|
def plot_topic_distribution(df):
    """Bar chart of message counts per topic (Plotly).

    Args:
        df: chat DataFrame with a 'topic' column (one label per message).

    Returns:
        plotly Figure.

    Fix: 'viridis' is a colorscale name, not a CSS color, so passing
    ['viridis'] as color_discrete_sequence fails Plotly's color validation;
    use the actual Viridis palette list instead.
    NOTE: this definition is shadowed by a matplotlib version later in the file.
    """
    topic_counts = df['topic'].value_counts().sort_index()
    fig = px.bar(
        x=topic_counts.index,
        y=topic_counts.values,
        title="Topic Distribution",
        color_discrete_sequence=px.colors.sequential.Viridis,
    )
    return fig
|
|
def topic_distribution_over_time(df, time_freq='M'):
    """Count messages per (time period, topic).

    Args:
        df: chat DataFrame with a datetime 'date' column and a 'topic' column.
        time_freq: pandas period frequency for bucketing (default monthly).

    Returns:
        Wide DataFrame indexed by 'time_period' with one column per topic.

    Fix: the original assigned a 'time_period' column directly onto the
    caller's DataFrame, mutating it as a side effect; work on a slim copy.
    """
    frame = df[['date', 'topic']].copy()
    frame['time_period'] = frame['date'].dt.to_period(time_freq)
    return frame.groupby(['time_period', 'topic']).size().unstack(fill_value=0)
|
|
def plot_topic_distribution_over_time_plotly(topic_distribution):
    """Plot topic counts over time as a Plotly line chart.

    Args:
        topic_distribution: wide DataFrame indexed by a 'time_period'
            PeriodIndex with one column per topic (e.g. the output of
            topic_distribution_over_time).

    Returns:
        plotly Figure with one line per topic.
    """
    topic_distribution = topic_distribution.reset_index()
    # Periods are not directly plottable; convert them to timestamps.
    topic_distribution['time_period'] = topic_distribution['time_period'].dt.to_timestamp()
    # Melt to long format — one row per (time_period, topic) — as px.line expects.
    topic_distribution = topic_distribution.melt(id_vars='time_period', var_name='topic', value_name='count')
    fig = px.line(topic_distribution, x='time_period', y='count', color='topic', title="Topic Distribution Over Time")
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig
|
|
def plot_clusters(reduced_features, clusters):
    """Scatter plot of 2-D reduced features colored by cluster label (Plotly).

    Args:
        reduced_features: 2-D array-like; columns 0 and 1 are the plot axes
            (presumably t-SNE output, per the title — TODO confirm).
        clusters: per-point cluster labels used for coloring.

    NOTE: shadowed by the matplotlib plot_clusters defined later in the file.
    """
    fig = px.scatter(x=reduced_features[:, 0], y=reduced_features[:, 1], color=clusters, title="Message Clusters (t-SNE)")
    return fig
def most_common_words(selected_user, df, stop_words=None):
    """Return a DataFrame of the 20 most frequent words for the selection.

    Args:
        selected_user: user name, or 'Overall' for the whole chat.
        df: chat DataFrame with 'user' and 'message' columns.
        stop_words: optional collection of words to exclude from the count
            (default: no filtering).

    Returns:
        DataFrame of (word, count) rows for the top 20 words.

    Fix: the original bound ``stop_words = df``, so the membership test ran
    against the DataFrame's column names instead of a real stop-word list.
    (This definition also shadows an earlier most_common_words in this file.)
    """
    stop_words = set() if stop_words is None else set(stop_words)

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Skip system notifications and media placeholder rows.
    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    return pd.DataFrame(Counter(words).most_common(20))
|
|
def emoji_helper(selected_user, df):
    """Count emoji usage for the selected user (or the entire chat).

    Args:
        selected_user: user name, or 'Overall' for all users.
        df: chat DataFrame with an 'unfiltered_messages' column.

    Returns:
        DataFrame of (emoji, count) pairs, most frequent first.

    Fix: most_common() with no argument already returns every entry; the
    original constructed a second Counter just to pass its length.
    (This definition also shadows an earlier emoji_helper in this file.)
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    emojis = []
    for message in df['unfiltered_messages']:
        emojis.extend([c for c in message if c in emoji.EMOJI_DATA])

    return pd.DataFrame(Counter(emojis).most_common())
def plot_topic_distribution(df):
    """
    Plots the distribution of topics in the chat data.

    Args:
        df: chat DataFrame with a 'topic' column (one label per message).

    Returns:
        matplotlib Figure with a seaborn bar chart of messages per topic.

    NOTE: this definition shadows the Plotly plot_topic_distribution earlier
    in this file.
    """
    topic_counts = df['topic'].value_counts().sort_index()
    fig, ax = plt.subplots()
    # NOTE(review): palette= without hue= warns on newer seaborn — confirm target version.
    sns.barplot(x=topic_counts.index, y=topic_counts.values, ax=ax, palette="viridis")
    ax.set_title("Topic Distribution")
    ax.set_xlabel("Topic")
    ax.set_ylabel("Number of Messages")
    return fig
|
|
def most_frequent_keywords(messages, top_n=10):
    """
    Extracts the most frequent keywords from a list of messages.

    Returns a list of (word, count) tuples for the top_n most common words.
    """
    freq = Counter()
    for msg in messages:
        freq.update(msg.split())
    return freq.most_common(top_n)
def plot_topic_distribution_over_time(topic_distribution):
    """
    Plots the distribution of topics over time using a line chart.

    Args:
        topic_distribution: wide DataFrame indexed by a PeriodIndex with one
            column per topic (e.g. from topic_distribution_over_time).

    Returns:
        matplotlib Figure with one line per topic.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # One line per topic; the PeriodIndex must become timestamps to plot.
    for topic in topic_distribution.columns:
        ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")

    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    # Legend is placed outside the axes to keep the lines readable.
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig
|
|
def plot_most_frequent_keywords(keywords):
    """
    Plots the most frequent keywords.

    Args:
        keywords: list of (word, count) tuples, e.g. the output of
            most_frequent_keywords().

    Returns:
        matplotlib Figure with a horizontal bar chart (frequency vs keyword).

    Fix: zip(*[]) raises ValueError, so an empty keyword list previously
    crashed; now it returns an empty, labelled chart.
    """
    fig, ax = plt.subplots()
    if keywords:
        words, counts = zip(*keywords)
        sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis")
    ax.set_title("Most Frequent Keywords")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Keyword")
    return fig
def topic_distribution_over_time(df, time_freq='M'):
    """
    Analyzes the distribution of topics over time.

    Args:
        df: chat DataFrame with datetime 'date' and 'topic' columns.
        time_freq: pandas period frequency used for bucketing (monthly default).

    Returns:
        Wide DataFrame indexed by 'time_period' with one column per topic.

    Fix: the original wrote a 'time_period' column onto the caller's
    DataFrame, mutating it as a side effect; bucket on a slim copy instead.
    (This definition also shadows an earlier one in this file.)
    """
    frame = df[['date', 'topic']].copy()
    frame['time_period'] = frame['date'].dt.to_period(time_freq)
    topic_distribution = frame.groupby(['time_period', 'topic']).size().unstack(fill_value=0)
    return topic_distribution
|
|
def plot_topic_distribution_over_time(topic_distribution):
    """
    Plots the distribution of topics over time using a line chart.

    Args:
        topic_distribution: DataFrame with a PeriodIndex and one column per
            topic, as produced by topic_distribution_over_time.

    Returns:
        matplotlib Figure with one line per topic.

    NOTE: duplicate of an identical definition earlier in this file; this
    later one is the definition actually in effect.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # Convert the PeriodIndex to timestamps so matplotlib can plot the x-axis.
    for topic in topic_distribution.columns:
        ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")

    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    # Anchor the legend outside the plot area.
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig
|
|
def plot_topic_distribution_over_time_plotly(topic_distribution):
    """
    Plots the distribution of topics over time using Plotly.

    Args:
        topic_distribution: wide DataFrame (PeriodIndex x topic columns).

    Returns:
        plotly Figure with one line per topic.

    NOTE: shadows the earlier definition of the same name in this file.
    """
    topic_distribution = topic_distribution.reset_index()
    # Convert periods to timestamps so Plotly can place them on a date axis.
    topic_distribution['time_period'] = topic_distribution['time_period'].dt.to_timestamp()
    # Long format — one row per (time_period, topic) pair — is what px.line expects.
    topic_distribution = topic_distribution.melt(id_vars='time_period', var_name='topic', value_name='count')

    fig = px.line(topic_distribution, x='time_period', y='count', color='topic',
                  title="Topic Distribution Over Time", labels={'time_period': 'Time Period', 'count': 'Number of Messages'})
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig
def plot_clusters(reduced_features, clusters):
    """
    Visualize clusters using t-SNE.
    Args:
        reduced_features (np.array): 2D array of reduced features.
        clusters (np.array): Cluster labels.
    Returns:
        fig (plt.Figure): Matplotlib figure object.

    NOTE: shadows the Plotly plot_clusters defined earlier in this file.
    """
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        x=reduced_features[:, 0],
        y=reduced_features[:, 1],
        hue=clusters,  # one color per cluster label
        palette="viridis",
        legend="full"
    )
    plt.title("Message Clusters (t-SNE Visualization)")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.tight_layout()
    # Return the current figure (the one created by plt.figure above).
    return plt.gcf()
def get_cluster_labels(df, n_clusters):
    """
    Generate descriptive labels for each cluster based on top keywords.

    Args:
        df: chat DataFrame with 'lemmatized_message' and 'cluster' columns.
        n_clusters: total number of cluster ids (0..n_clusters-1) to label.

    Returns:
        dict mapping cluster id -> comma-separated top-3 TF-IDF keywords,
        or "No dominant theme" for empty clusters.

    Fix: the TF-IDF matrix rows are positional, but the original indexed it
    with df.index *labels*, which only coincide with positions when df has a
    default RangeIndex (e.g. not after filtering). Use positional indices.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import numpy as np

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
    feature_names = vectorizer.get_feature_names_out()

    cluster_labels = {}
    for cluster_id in range(n_clusters):
        # Row positions (not index labels) of this cluster's messages.
        positions = np.flatnonzero((df['cluster'] == cluster_id).to_numpy())
        if len(positions) > 0:
            cluster_tfidf = tfidf_matrix[positions]
            # Top 3 terms by summed TF-IDF weight across the cluster.
            top_keywords = np.argsort(cluster_tfidf.sum(axis=0).A1)[-3:][::-1]
            cluster_labels[cluster_id] = ", ".join(feature_names[top_keywords])
        else:
            cluster_labels[cluster_id] = "No dominant theme"
    return cluster_labels
|
|
def get_temporal_trends(df):
    """
    Analyze temporal trends for each cluster (peak day and time).

    Returns a dict: cluster id -> {'peak_day': ..., 'peak_time': 'H:00'},
    using the modal weekday and modal hour of each cluster's messages.
    """
    trends = {}
    for cid in df['cluster'].unique():
        subset = df.loc[df['cluster'] == cid]
        if subset.empty:
            continue
        trends[cid] = {
            "peak_day": subset['day_of_week'].mode()[0],
            "peak_time": f"{subset['hour'].mode()[0]}:00",
        }
    return trends
|
|
def get_user_contributions(df):
    """
    Identify top contributors for each cluster.

    Returns a dict: cluster id -> list of up to three most active users.
    """
    contributions = {}
    for cid in df['cluster'].unique():
        subset = df.loc[df['cluster'] == cid]
        if subset.empty:
            continue
        contributions[cid] = subset['user'].value_counts().head(3).index.tolist()
    return contributions
|
|
def get_sentiment_by_cluster(df):
    """
    Analyze sentiment distribution for each cluster.

    Returns a dict: cluster id -> {'positive': %, 'neutral': %, 'negative': %},
    each share rounded to the nearest whole percent.
    """
    sentiment_by_cluster = {}
    for cid in df['cluster'].unique():
        subset = df.loc[df['cluster'] == cid]
        if subset.empty:
            continue
        shares = subset['sentiment'].value_counts(normalize=True) * 100
        sentiment_by_cluster[cid] = {
            label: round(shares.get(label, 0))
            for label in ("positive", "neutral", "negative")
        }
    return sentiment_by_cluster
|
|
def detect_anomalies(df):
    """
    Detect anomalies in each cluster (e.g., high link or media share).

    A cluster is flagged when more than half its messages contain links;
    otherwise when more than half are media placeholders. Returns a dict
    of cluster id -> human-readable note.
    """
    anomalies = {}
    for cid in df['cluster'].unique():
        subset = df.loc[df['cluster'] == cid]
        if subset.empty:
            continue
        link_share = subset['message'].str.contains('http').mean() * 100
        media_share = subset['message'].str.contains('<media omitted>').mean() * 100
        if link_share > 50:
            anomalies[cid] = f"{round(link_share)}% of messages contain links."
        elif media_share > 50:
            anomalies[cid] = f"{round(media_share)}% of messages are media files."
    return anomalies
|
|
def generate_recommendations(df):
    """
    Generate actionable recommendations based on cluster insights.

    Flags clusters dominated by negative sentiment (>50%) and clusters whose
    messages mostly contain links (>50%). Returns a list of suggestion strings.
    """
    recommendations = []
    for cid in df['cluster'].unique():
        subset = df.loc[df['cluster'] == cid]
        if subset.empty:
            continue
        shares = subset['sentiment'].value_counts(normalize=True) * 100
        if shares.get('negative', 0) > 50:
            recommendations.append(f"Address negative sentiment in Cluster {cid} by revisiting feedback processes.")
        if subset['message'].str.contains('http').mean() > 0.5:
            recommendations.append(f"Pin resources from Cluster {cid} (most-shared links) for easy access.")
    return recommendations