Spaces:

afanyu237
/

whatsApp_chat

Sleeping

App Files Files Community

afanyu237 committed on Dec 5, 2025

Commit

6cc22fb

verified ·

1 Parent(s): bb7b43a

Update helper.py

Browse files

Files changed (1) hide show

helper.py +143 -59

helper.py CHANGED Viewed

@@ -5,9 +5,17 @@ from collections import Counter
 import emoji
 import matplotlib.pyplot as plt
 import seaborn as sns
-from collections import Counter
 import plotly.express as px
 extract = URLExtract()
@@ -25,7 +33,7 @@ def fetch_stats(selected_user,df):
         words.extend(message.split())
     # fetch number of media messages
-    num_media_messages = df[df['unfiltered_messages'] == '<media omitted>\n'].shape[0]
     # fetch number of links shared
     links = []
@@ -141,42 +149,127 @@ def activity_heatmap(selected_user,df):
     user_heatmap = df.pivot_table(index='day', columns='period', values='message', aggfunc='count').fillna(0)
     return user_heatmap
 def generate_wordcloud(text, color):
     wordcloud = WordCloud(width=400, height=300, background_color=color, colormap="viridis").generate(text)
     return wordcloud
-# def plot_topics(topics):
-#     """
-#     Plots a bar chart for the top words in each topic.
-#     """
-#     if not topics or not isinstance(topics[0], list):
-#         raise ValueError("topics must be a list of lists of words.")
-#     print("Topics received:", topics)  # Debugging
-#     fig, axes = plt.subplots(1, len(topics), figsize=(20, 10))
-#     if len(topics) == 1:
-#         axes = [axes]  # Ensure axes is iterable for single topic
-#     for idx, topic in enumerate(topics):
-#         if not isinstance(topic, list):
-#             raise ValueError(f"Topic {idx} is not a list of words.")
-#         top_words = topic
-#         print(f"Top words for Topic {idx}: {top_words}")  # Debugging
-#         axes[idx].barh(top_words, range(len(top_words)))
-#         axes[idx].set_title(f"Topic {idx}")
-#         axes[idx].set_xlabel("Word Importance")
-#         axes[idx].set_ylabel("Top Words")
-#     plt.tight_layout()
-#     return fig
 def plot_topic_distribution(df):
     """
     Plots the distribution of topics in the chat data.
     """
     topic_counts = df['topic'].value_counts().sort_index()
     fig, ax = plt.subplots()
-    sns.barplot(x=topic_counts.index, y=topic_counts.values, ax=ax, palette="viridis")
     ax.set_title("Topic Distribution")
     ax.set_xlabel("Topic")
     ax.set_ylabel("Number of Messages")
@@ -189,6 +282,16 @@ def most_frequent_keywords(messages, top_n=10):
     words = [word for msg in messages for word in msg.split()]
     word_freq = Counter(words)
     return word_freq.most_common(top_n)
 def plot_topic_distribution_over_time(topic_distribution):
     """
     Plots the distribution of topics over time using a line chart.
@@ -213,37 +316,11 @@ def plot_most_frequent_keywords(keywords):
     """
     words, counts = zip(*keywords)
     fig, ax = plt.subplots()
-    sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis")
     ax.set_title("Most Frequent Keywords")
     ax.set_xlabel("Frequency")
     ax.set_ylabel("Keyword")
     return fig
-def topic_distribution_over_time(df, time_freq='M'):
-    """
-    Analyzes the distribution of topics over time.
-    """
-    # Group by time interval and topic
-    df['time_period'] = df['date'].dt.to_period(time_freq)
-    topic_distribution = df.groupby(['time_period', 'topic']).size().unstack(fill_value=0)
-    return topic_distribution
-def plot_topic_distribution_over_time(topic_distribution):
-    """
-    Plots the distribution of topics over time using a line chart.
-    """
-    fig, ax = plt.subplots(figsize=(12, 6))
-    # Plot each topic as a separate line
-    for topic in topic_distribution.columns:
-        ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")
-    ax.set_title("Topic Distribution Over Time")
-    ax.set_xlabel("Time Period")
-    ax.set_ylabel("Number of Messages")
-    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-    return fig
 def plot_topic_distribution_over_time_plotly(topic_distribution):
     """
@@ -257,6 +334,7 @@ def plot_topic_distribution_over_time_plotly(topic_distribution):
                   title="Topic Distribution Over Time", labels={'time_period': 'Time Period', 'count': 'Number of Messages'})
     fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
     return fig
 def plot_clusters(reduced_features, clusters):
     """
     Visualize clusters using t-SNE.
@@ -279,19 +357,25 @@ def plot_clusters(reduced_features, clusters):
     plt.ylabel("t-SNE Component 2")
     plt.tight_layout()
     return plt.gcf()
 def get_cluster_labels(df, n_clusters):
     """
     Generate descriptive labels for each cluster based on top keywords.
     """
-    from sklearn.feature_extraction.text import TfidfVectorizer
-    import numpy as np
     vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
     tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
     cluster_labels = {}
     for cluster_id in range(n_clusters):
-        cluster_indices = df[df['cluster'] == cluster_id].index
         if len(cluster_indices) > 0:
             cluster_tfidf = tfidf_matrix[cluster_indices]
             top_keywords = np.argsort(cluster_tfidf.sum(axis=0).A1)[-3:][::-1]

 import emoji
 import matplotlib.pyplot as plt
 import seaborn as sns
 import plotly.express as px
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+# Import the AI topic titles module
+try:
+    from ai_topic_titles import generate_topic_titles
+    AI_TOPIC_TITLES_AVAILABLE = True
+except ImportError:
+    AI_TOPIC_TITLES_AVAILABLE = False
+    print("Note: ai_topic_titles module not found. Using basic topic titles.")
 extract = URLExtract()
         words.extend(message.split())
     # fetch number of media messages
+    num_media_messages = df[df['unfiltered_messages'].str.contains('<media omitted>', case=False, na=False)].shape[0]
     # fetch number of links shared
     links = []
     user_heatmap = df.pivot_table(index='day', columns='period', values='message', aggfunc='count').fillna(0)
     return user_heatmap
 def generate_wordcloud(text, color):
     wordcloud = WordCloud(width=400, height=300, background_color=color, colormap="viridis").generate(text)
     return wordcloud
+def create_heuristic_title(topic, idx):
+    """
+    Build a simple title from the leading words of a topic.
+    Args:
+        topic (list): Words of the topic; at most the first three are used
+        idx (int): Zero-based topic position, shown one-based in the title
+    Returns:
+        str: Title of the form "Topic <idx + 1>: <word>, <word>, <word>"
+    """
+    leading_words = ', '.join(topic[:3])
+    topic_number = idx + 1
+    return f"Topic {topic_number}: {leading_words}"
+def generate_topic_titles_wrapper(topics, hf_token=None, use_ai=True, **kwargs):
+    """
+    Generate one title per topic, using the AI backend when possible.
+    The AI path runs only when ``use_ai`` is true and the optional
+    ``ai_topic_titles`` module could be imported. Any failure on that path is
+    reported on stdout and the basic titles are returned instead.
+    Args:
+        topics (list): List of topics, where each topic is a list of words
+        hf_token (str, optional): Hugging Face token for AI mode
+        use_ai (bool): Whether to use AI (default: True if available)
+        **kwargs: Additional parameters forwarded to the AI function
+    Returns:
+        list: List of topic titles
+    """
+    if use_ai and AI_TOPIC_TITLES_AVAILABLE:
+        try:
+            # Import under a private alias. The module-level name
+            # ``generate_topic_titles`` is rebound later in this file by the
+            # backward-compatibility function, which calls straight back into
+            # this wrapper, so calling that name here never reached the AI code.
+            from ai_topic_titles import generate_topic_titles as _ai_generate_topic_titles
+            # "huggingface" when a token is supplied, otherwise "local"
+            api_type = "huggingface" if hf_token else "local"
+            # Get AI-generated titles
+            titles = _ai_generate_topic_titles(
+                topics=topics,
+                api_type=api_type,
+                hf_token=hf_token,
+                **kwargs
+            )
+            print("AI-generated topic titles:")
+            for title in titles:
+                print(f"  {title}")
+            return titles
+        except Exception as e:
+            # Deliberate best-effort: any AI failure degrades to basic titles
+            print(f"AI topic title generation failed: {e}")
+            print("Falling back to basic titles...")
+            # Fall through to basic method
+    # Basic method (fallback)
+    titles = []
+    for idx, topic in enumerate(topics):
+        if isinstance(topic, list) and len(topic) >= 3:
+            # Create title from first 3 words
+            title = f"Topic {idx + 1}: {', '.join(topic[:3])}"
+        else:
+            # Too few words (or not a list) to build a keyword title
+            title = f"Topic {idx + 1}: General Discussion"
+        titles.append(title)
+        print("THESE ARE THE TOPICS TITLES: ", title)
+    return titles
+# Keep the old function for backward compatibility
+def generate_topic_titles(topics, hf_token=None, **kwargs):
+    """
+    Backward-compatible entry point that delegates to
+    generate_topic_titles_wrapper.
+    Args:
+        topics (list): List of topics, where each topic is a list of words
+        hf_token (str, optional): Hugging Face token for AI mode
+        **kwargs: May carry ``use_ai`` (default True), which is removed here;
+            everything else is forwarded to the wrapper unchanged
+    Returns:
+        list: List of topic titles
+    """
+    # AI is attempted unless the caller explicitly passes use_ai=False
+    ai_requested = kwargs.pop('use_ai', True)
+    return generate_topic_titles_wrapper(
+        topics,
+        hf_token=hf_token,
+        use_ai=ai_requested,
+        **kwargs
+    )
+def plot_topics(topics, use_ai=True, hf_token=None, **kwargs):
+    """
+    Plots a horizontal bar chart of the top words for each topic.
+    One subplot is drawn per topic, titled via generate_topic_titles_wrapper.
+    No per-word weights are passed in, so bar length encodes rank only: the
+    first word of a topic gets the longest bar and is drawn at the top.
+    Args:
+        topics: List of topics, where each topic is a list of words
+        use_ai: Whether to use AI for titles
+        hf_token: Hugging Face token for AI
+        **kwargs: Additional parameters for AI
+    Returns:
+        matplotlib.figure.Figure: The plot figure
+    Raises:
+        ValueError: If topics is empty or any topic is not a list
+    """
+    if not topics or not isinstance(topics[0], list):
+        raise ValueError("topics must be a list of lists of words.")
+    # Validate every topic up front so a bad entry cannot leave a
+    # half-built figure open.
+    for idx, topic in enumerate(topics):
+        if not isinstance(topic, list):
+            raise ValueError(f"Topic {idx} is not a list of words.")
+    # Generate titles using the wrapper
+    titles = generate_topic_titles_wrapper(topics, hf_token=hf_token, use_ai=use_ai, **kwargs)
+    # squeeze=False always returns a 2-D axes array, so a single topic needs
+    # no special case.
+    fig, axes = plt.subplots(1, len(topics), figsize=(20, 10), squeeze=False)
+    for idx, topic in enumerate(topics):
+        ax = axes[0][idx]
+        top_words = topic[:10]  # Show top 10 words
+        positions = range(len(top_words))
+        # Assumes each topic lists its words most-important first (TODO confirm).
+        # Descending scores give the first word the longest bar; the previous
+        # ascending widths drew it with a zero-length bar.
+        rank_scores = range(len(top_words), 0, -1)
+        ax.barh(positions, rank_scores)
+        ax.set_yticks(positions)
+        ax.set_yticklabels(top_words)
+        # barh places position 0 at the bottom; flip so the first word is on top
+        ax.invert_yaxis()
+        ax.set_title(titles[idx], fontsize=14, fontweight='bold')
+        ax.set_xlabel("Word Importance")
+        ax.set_ylabel("Top Words")
+    fig.tight_layout()
+    return fig
 def plot_topic_distribution(df):
     """
     Plots the distribution of topics in the chat data.
     """
     topic_counts = df['topic'].value_counts().sort_index()
     fig, ax = plt.subplots()
+    sns.barplot(x=topic_counts.index, y=topic_counts.values, ax=ax, palette="viridis", hue=topic_counts.index, legend=False)
     ax.set_title("Topic Distribution")
     ax.set_xlabel("Topic")
     ax.set_ylabel("Number of Messages")
     words = [word for msg in messages for word in msg.split()]
     word_freq = Counter(words)
     return word_freq.most_common(top_n)
+def topic_distribution_over_time(df, time_freq='M'):
+    """
+    Count messages per topic within each time period.
+    Args:
+        df: DataFrame with a datetime 'date' column and a 'topic' column.
+            A 'time_period' column is written onto it in place.
+        time_freq: Period frequency passed to ``Series.dt.to_period``
+            (default 'M')
+    Returns:
+        DataFrame indexed by 'time_period' with one column per topic;
+        period/topic combinations with no rows are filled with 0
+    """
+    # Bucket every message into its period and store it on the caller's frame
+    periods = df['date'].dt.to_period(time_freq)
+    df['time_period'] = periods
+    # Rows per (period, topic) pair, then pivot topics into columns
+    pair_counts = df.groupby(['time_period', 'topic']).size()
+    return pair_counts.unstack(fill_value=0)
 def plot_topic_distribution_over_time(topic_distribution):
     """
     Plots the distribution of topics over time using a line chart.
     """
     words, counts = zip(*keywords)
     fig, ax = plt.subplots()
+    sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis", hue=list(words), legend=False)
     ax.set_title("Most Frequent Keywords")
     ax.set_xlabel("Frequency")
     ax.set_ylabel("Keyword")
     return fig
 def plot_topic_distribution_over_time_plotly(topic_distribution):
     """
                   title="Topic Distribution Over Time", labels={'time_period': 'Time Period', 'count': 'Number of Messages'})
     fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
     return fig
 def plot_clusters(reduced_features, clusters):
     """
     Visualize clusters using t-SNE.
     plt.ylabel("t-SNE Component 2")
     plt.tight_layout()
     return plt.gcf()
+def remove_emojis(text):
+    """
+    Strip every non-ASCII character from text.
+    Intended to drop emojis so matplotlib does not warn about them, but the
+    ASCII round-trip also removes accented letters and any non-Latin script.
+    Args:
+        text (str): Input string
+    Returns:
+        str: The input with all non-ASCII characters removed
+    """
+    # errors='ignore' silently discards anything that cannot be encoded
+    ascii_bytes = text.encode('ascii', errors='ignore')
+    return ascii_bytes.decode('ascii')
 def get_cluster_labels(df, n_clusters):
     """
     Generate descriptive labels for each cluster based on top keywords.
     """
     vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
     tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
     cluster_labels = {}
+    # Reset index to ensure alignment with tfidf_matrix
+    df_reset = df.reset_index(drop=True)
     for cluster_id in range(n_clusters):
+        # Get indices where cluster matches
+        cluster_indices = df_reset[df_reset['cluster'] == cluster_id].index
         if len(cluster_indices) > 0:
             cluster_tfidf = tfidf_matrix[cluster_indices]
             top_keywords = np.argsort(cluster_tfidf.sum(axis=0).A1)[-3:][::-1]