hansche commited on
Commit
bcbbed8
·
verified ·
1 Parent(s): 14e4130

Create helper.py

Browse files
Files changed (1) hide show
  1. helper.py +323 -0
helper.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urlextract import URLExtract
2
+ from wordcloud import WordCloud
3
+ import pandas as pd
4
+ from collections import Counter
5
+ import emoji
6
+ import plotly.express as px
7
+ import matplotlib.pyplot as plt
8
+ import seaborn as sns
9
+
# Module-level URL extractor, shared by fetch_stats() for link counting.
extract = URLExtract()
11
+
12
def fetch_stats(selected_user, df):
    """Return (messages, words, media messages, links) for one user or all.

    When *selected_user* is 'Overall' the whole frame is analysed; otherwise
    the frame is first filtered to that user's rows.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    num_messages = len(df)

    words = 0
    for message in df['message']:
        words += len(message.split())

    # Media placeholders survive only in the raw (unfiltered) column.
    num_media_messages = len(df[df['unfiltered_messages'] == '<media omitted>\n'])

    links = 0
    for message in df['unfiltered_messages']:
        links += len(extract.find_urls(message))

    return num_messages, words, num_media_messages, links
20
+
21
def most_busy_users(df):
    """Return the five chattiest users and a per-user percentage breakdown.

    Returns:
        x: Series with the five largest per-user message counts.
        pct_df: DataFrame with columns 'Name' (the user) and 'percentage'
            (their share of all messages, rounded to 2 decimals).
    """
    counts = df['user'].value_counts()
    x = counts.head()
    # Build the frame explicitly: the previous
    # rename({'index': 'percentage', 'user': 'Name'}) put the user names in
    # the "percentage" column (and vice versa), and broke on pandas >= 2.0
    # where reset_index() names the columns 'user'/'count'.
    pct = round((counts / df.shape[0]) * 100, 2)
    pct_df = pd.DataFrame({'Name': pct.index, 'percentage': pct.values})
    return x, pct_df
26
+
27
def create_wordcloud(selected_user, df):
    """Render a WordCloud image from a user's (or the whole chat's) messages."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Drop system notifications and media placeholders before rendering.
    temp = df[df['user'] != 'group_notification']
    media_mask = temp['message'].str.lower().str.contains('<media omitted>')
    temp = temp[~media_mask]

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    return wc.generate(temp['message'].str.cat(sep=" "))
35
+
36
def most_common_words(selected_user, df):
    """Top-20 words (lowercased, whitespace-split) as a 2-column DataFrame.

    NOTE(review): this name is re-defined later in the module, so callers
    actually get the later (stop-word-filtering) version.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    tally = Counter()
    for message in temp['message']:
        tally.update(message.lower().split())
    return pd.DataFrame(tally.most_common(20))
43
+
44
def emoji_helper(selected_user, df):
    """Count every emoji character used; DataFrame of (emoji, count) rows.

    NOTE(review): shadowed by a later re-definition of the same name.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    tally = Counter(
        ch
        for message in df['unfiltered_messages']
        for ch in message
        if ch in emoji.EMOJI_DATA
    )
    return pd.DataFrame(tally.most_common(len(tally)))
49
+
50
def monthly_timeline(selected_user, df):
    """Message counts per (year, month) plus a combined 'time' label column."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    timeline = df.groupby(['year', 'month']).count()['message'].reset_index()
    # e.g. "January-2024"; assumes 'month' holds the month name as a string.
    timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
    return timeline
56
+
57
def daily_timeline(selected_user, df):
    """Per-day message counts as a DataFrame with 'date' and 'message' columns."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    per_day = df.groupby('date').count()['message']
    return per_day.reset_index()
61
+
62
def week_activity_map(selected_user, df):
    """Messages per weekday name (value_counts: most active day first)."""
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return scoped['day_of_week'].value_counts()
66
+
67
def month_activity_map(selected_user, df):
    """Messages per month name (value_counts: most active month first)."""
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return scoped['month'].value_counts()
71
+
72
def plot_topic_distribution(df):
    """Bar chart (Plotly) of how many messages fall in each topic.

    NOTE(review): shadowed by a later matplotlib re-definition of this name.
    """
    topic_counts = df['topic'].value_counts().sort_index()
    # 'viridis' is a colorscale NAME, not a CSS color, so it is not a valid
    # entry for color_discrete_sequence; use the actual Viridis palette.
    fig = px.bar(
        x=topic_counts.index,
        y=topic_counts.values,
        title="Topic Distribution",
        color_discrete_sequence=px.colors.sequential.Viridis,
    )
    return fig
76
+
77
def topic_distribution_over_time(df, time_freq='M'):
    """Count messages per (time period, topic).

    Args:
        df: frame with a datetime 'date' column and a 'topic' column.
        time_freq: pandas period alias ('M' monthly, 'W' weekly, ...).

    Returns:
        DataFrame indexed by Period ('time_period') with one column per topic.

    NOTE(review): this name is re-defined later in the module with the same
    behavior; the later copy is the one callers get.
    """
    # Group by a derived Series instead of writing df['time_period'] = ...,
    # so the caller's frame is not mutated as a side effect.
    periods = df['date'].dt.to_period(time_freq).rename('time_period')
    return df.groupby([periods, 'topic']).size().unstack(fill_value=0)
80
+
81
def plot_topic_distribution_over_time_plotly(topic_distribution):
    """Interactive Plotly line chart: one trace per topic over time.

    Expects a frame indexed by pandas Period with one column per topic.
    NOTE(review): shadowed by a later re-definition with the same behavior.
    """
    long_form = topic_distribution.reset_index()
    long_form['time_period'] = long_form['time_period'].dt.to_timestamp()
    long_form = long_form.melt(id_vars='time_period', var_name='topic', value_name='count')

    fig = px.line(long_form, x='time_period', y='count', color='topic',
                  title="Topic Distribution Over Time")
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig
88
+
89
def plot_clusters(reduced_features, clusters):
    """2-D Plotly scatter of t-SNE-reduced messages, coloured by cluster.

    NOTE(review): shadowed by a later matplotlib re-definition of this name.
    """
    xs = reduced_features[:, 0]
    ys = reduced_features[:, 1]
    return px.scatter(x=xs, y=ys, color=clusters, title="Message Clusters (t-SNE)")
92
def most_common_words(selected_user, df):
    """Top-20 non-stop-words for a user (or the whole chat) as a DataFrame.

    Stop words are loaded from 'stop_hinglish.txt' when it exists; if the
    file is missing, no filtering is applied.

    Bug fixed: the previous code left the file open commented out and set
    `stop_words = df`, so `word not in stop_words` tested each word against
    the DataFrame's COLUMN NAMES instead of a stop-word list.
    """
    try:
        with open('stop_hinglish.txt', 'r', encoding='utf-8') as f:
            stop_words = set(f.read().split())
    except OSError:
        stop_words = set()  # best effort when the stop-word file is absent

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    return pd.DataFrame(Counter(words).most_common(20))
111
+
112
def emoji_helper(selected_user, df):
    """DataFrame of (emoji, count) over the raw messages, most used first."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    tally = Counter()
    for message in df['unfiltered_messages']:
        tally.update(ch for ch in message if ch in emoji.EMOJI_DATA)

    return pd.DataFrame(tally.most_common(len(tally)))
123
def plot_topic_distribution(df):
    """Bar chart (matplotlib/seaborn) of message counts per topic."""
    counts = df['topic'].value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=counts.index, y=counts.values, ax=ax, palette="viridis")
    ax.set(title="Topic Distribution",
           xlabel="Topic",
           ylabel="Number of Messages")
    return fig
134
+
135
def most_frequent_keywords(messages, top_n=10):
    """Return the top_n (word, count) pairs across all messages.

    Words are whitespace-split and counted case-sensitively.
    """
    freq = Counter()
    for msg in messages:
        freq.update(msg.split())
    return freq.most_common(top_n)
142
def plot_topic_distribution_over_time(topic_distribution):
    """Line chart (matplotlib) of per-topic message counts over time.

    Expects a frame indexed by pandas Period with one column per topic.
    NOTE(review): re-defined again further down the module; the later,
    identical copy is the one callers get.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # One line per topic, sharing the same timestamp axis.
    timestamps = topic_distribution.index.to_timestamp()
    for topic in topic_distribution.columns:
        ax.plot(timestamps, topic_distribution[topic], label=f"Topic {topic}")

    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig
159
+
160
def plot_most_frequent_keywords(keywords):
    """Horizontal bar chart of (word, count) pairs.

    Accepts output of most_frequent_keywords(); raises ValueError if the
    list is empty (zip unpacking).
    """
    words, counts = zip(*keywords)
    fig, ax = plt.subplots()
    sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis")
    ax.set(title="Most Frequent Keywords",
           xlabel="Frequency",
           ylabel="Keyword")
    return fig
171
def topic_distribution_over_time(df, time_freq='M'):
    """Analyze the distribution of topics over time.

    Args:
        df: frame with a datetime 'date' column and a 'topic' column.
        time_freq: pandas period alias ('M' monthly, 'W' weekly, ...).

    Returns:
        DataFrame indexed by Period ('time_period'), one column per topic.
    """
    # Group by a derived Series rather than assigning df['time_period'],
    # which mutated the caller's DataFrame as a side effect.
    periods = df['date'].dt.to_period(time_freq).rename('time_period')
    topic_distribution = df.groupby([periods, 'topic']).size().unstack(fill_value=0)
    return topic_distribution
179
+
180
def plot_topic_distribution_over_time(topic_distribution):
    """Plot per-topic message counts over time as one matplotlib line each.

    The input must be Period-indexed (as built by topic_distribution_over_time);
    the index is converted to timestamps for the x axis.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    x_axis = topic_distribution.index.to_timestamp()
    for column in topic_distribution.columns:
        ax.plot(x_axis, topic_distribution[column], label=f"Topic {column}")

    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    # Legend outside the axes so it never covers the lines.
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig
197
+
198
def plot_topic_distribution_over_time_plotly(topic_distribution):
    """Plot the topic-over-time distribution as an interactive Plotly chart.

    Melts the wide (period x topic) frame into long form, then draws one
    coloured line per topic.
    """
    melted = topic_distribution.reset_index()
    melted['time_period'] = melted['time_period'].dt.to_timestamp()
    melted = melted.melt(id_vars='time_period', var_name='topic', value_name='count')

    fig = px.line(
        melted, x='time_period', y='count', color='topic',
        title="Topic Distribution Over Time",
        labels={'time_period': 'Time Period', 'count': 'Number of Messages'},
    )
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig
210
def plot_clusters(reduced_features, clusters):
    """Visualize clusters using t-SNE coordinates (seaborn scatter).

    Args:
        reduced_features: 2-D array of reduced features, shape (n, 2).
        clusters: per-point cluster labels (drives the hue).

    Returns:
        The current matplotlib Figure.
    """
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        x=reduced_features[:, 0],
        y=reduced_features[:, 1],
        hue=clusters,
        palette="viridis",
        legend="full",
    )
    plt.title("Message Clusters (t-SNE Visualization)")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.tight_layout()
    return plt.gcf()
232
def get_cluster_labels(df, n_clusters):
    """Label each cluster with its three highest-TF-IDF keywords.

    Returns:
        dict {cluster_id: "kw1, kw2, kw3"}, or "No dominant theme" for an
        empty cluster.

    NOTE(review): tfidf_matrix is row-indexed with df's index LABELS, which
    is only correct if df has a default RangeIndex — verify upstream.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import numpy as np

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
    vocab = vectorizer.get_feature_names_out()

    cluster_labels = {}
    for cluster_id in range(n_clusters):
        member_rows = df[df['cluster'] == cluster_id].index
        if len(member_rows) == 0:
            cluster_labels[cluster_id] = "No dominant theme"
            continue
        # Sum TF-IDF weight per term over the cluster, take the top 3.
        term_weights = tfidf_matrix[member_rows].sum(axis=0).A1
        top_keywords = np.argsort(term_weights)[-3:][::-1]
        cluster_labels[cluster_id] = ", ".join(vocab[top_keywords])
    return cluster_labels
252
+
253
def get_temporal_trends(df):
    """Peak weekday and hour for every cluster.

    Returns:
        dict {cluster_id: {"peak_day": <mode weekday>, "peak_time": "H:00"}}.
    """
    temporal_trends = {}
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        temporal_trends[cluster_id] = {
            "peak_day": members['day_of_week'].mode()[0],
            "peak_time": f"{members['hour'].mode()[0]}:00",
        }
    return temporal_trends
265
+
266
def get_user_contributions(df):
    """Top three message senders per cluster.

    Returns:
        dict {cluster_id: [user, ...]} (at most three users each).
    """
    user_contributions = {}
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        user_contributions[cluster_id] = (
            members['user'].value_counts().head(3).index.tolist()
        )
    return user_contributions
277
+
278
def get_sentiment_by_cluster(df):
    """Percentage of positive/neutral/negative messages per cluster.

    Percentages are rounded to whole numbers, so they may not sum to 100.
    """
    sentiment_by_cluster = {}
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        shares = members['sentiment'].value_counts(normalize=True) * 100
        sentiment_by_cluster[cluster_id] = {
            label: round(shares.get(label, 0))
            for label in ('positive', 'neutral', 'negative')
        }
    return sentiment_by_cluster
293
+
294
def detect_anomalies(df):
    """Flag clusters dominated (>50%) by link or media messages.

    The link check runs first, so a cluster heavy in both is reported for
    links only. Returns {cluster_id: human-readable message}.
    """
    anomalies = {}
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        link_share = members['message'].str.contains('http').mean() * 100
        media_share = members['message'].str.contains('<media omitted>').mean() * 100
        if link_share > 50:
            anomalies[cluster_id] = f"{round(link_share)}% of messages contain links."
        elif media_share > 50:
            anomalies[cluster_id] = f"{round(media_share)}% of messages are media files."
    return anomalies
309
+
310
def generate_recommendations(df):
    """Actionable suggestions per cluster from sentiment and link share.

    Rules:
      - more than 50% negative sentiment -> suggest revisiting feedback;
      - more than half the messages contain links -> suggest pinning them.
    """
    recommendations = []
    for cluster_id in df['cluster'].unique():
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            continue
        sentiment_pct = members['sentiment'].value_counts(normalize=True) * 100
        if sentiment_pct.get('negative', 0) > 50:
            recommendations.append(
                f"Address negative sentiment in Cluster {cluster_id} by revisiting feedback processes.")
        if members['message'].str.contains('http').mean() > 0.5:
            recommendations.append(
                f"Pin resources from Cluster {cluster_id} (most-shared links) for easy access.")
    return recommendations
+ return recommendations