Spaces:
Runtime error
Runtime error
| import matplotlib.pyplot as plt | |
| from urlextract import URLExtract | |
| from wordcloud import WordCloud | |
| import pandas as pd | |
| from collections import Counter | |
| import emoji | |
| from textblob import TextBlob | |
# Module-level URLExtract instance: built once at import time and shared by
# fetch_stats, which calls its find_urls() on every message.
url_extractor = URLExtract()
def fetch_stats(df: pd.DataFrame, selected_user: str):
    """
    Summarise chat volume for one participant or for the whole chat.

    Pass 'Overall' as ``selected_user`` to use every row of ``df``; any other
    value keeps only the rows whose 'User' column equals it.

    Returns a 4-tuple ``(num_messages, num_words, num_media_messages,
    num_links)``: the row count, the total number of whitespace-separated
    tokens across 'Message', the number of messages exactly equal to
    '<Media omitted>', and the total number of URLs reported by the
    module-level ``url_extractor``.
    """
    if selected_user == 'Overall':
        scoped = df
    else:
        scoped = df[df['User'] == selected_user]

    messages = scoped['Message']

    def count_words(msg):
        return len(msg.split())

    def count_links(msg):
        return len(url_extractor.find_urls(msg))

    num_messages = scoped.shape[0]
    num_media_messages = scoped[messages == '<Media omitted>'].shape[0]
    num_words = messages.apply(count_words).sum()
    num_links = messages.apply(count_links).sum()

    return num_messages, num_words, num_media_messages, num_links
def top_active_users(df: pd.DataFrame, top_n: int = 5):
    """
    Rank chat participants by how many messages they sent.

    Returns a pair ``(top_users, user_percentage)``:

    * ``top_users`` - a Series of message counts for the ``top_n`` most
      frequent values of the 'User' column, highest first.
    * ``user_percentage`` - a DataFrame with columns 'User' and 'Percentage'
      giving every user's share of all messages, rounded to two decimals.
      It covers all users, not only the top ``top_n``.
    """
    authors = df['User']

    top_users = authors.value_counts().head(top_n)

    share = authors.value_counts(normalize=True) * 100
    user_percentage = share.round(2).reset_index()
    user_percentage.columns = ['User', 'Percentage']

    return top_users, user_percentage
def create_wordcloud(selected_user: str, df: pd.DataFrame):
    """
    Generate a word cloud from the selected user's messages.

    Stop words are read from 'stop_hinglish.txt' in the current working
    directory (whitespace-separated). Rows whose 'User' is
    'group_notification' and messages equal to '<Media omitted>' are ignored;
    the remaining messages are lower-cased and stripped of stop words before
    being joined into one text.

    Args:
        selected_user: A value of the 'User' column, or 'Overall' for all rows.
        df: Chat DataFrame with 'User' and 'Message' columns. It is not
            modified.

    Returns:
        The object returned by ``WordCloud.generate`` for the joined text.

    Raises:
        OSError: If the stop-word file cannot be opened.
        NOTE(review): WordCloud.generate is expected to raise when no words
        remain after filtering - confirm and handle in the caller if needed.
    """
    # Explicit encoding so the word list is read the same way on every platform.
    with open('stop_hinglish.txt', 'r', encoding='utf-8') as file:
        stop_words = set(file.read().split())

    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]

    keep = (df['User'] != 'group_notification') & (df['Message'] != '<Media omitted>')

    # Build the cleaned text as a separate Series instead of assigning a column
    # onto a filtered slice (which triggers SettingWithCopyWarning).
    cleaned = df.loc[keep, 'Message'].apply(
        lambda msg: " ".join(word for word in msg.lower().split() if word not in stop_words)
    )

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    return wc.generate(cleaned.str.cat(sep=" "))
def most_common_word(selected_user: str, df: pd.DataFrame):
    """
    Get the most common words used by the selected user, excluding stop words.

    Stop words are read from 'stop_hinglish.txt' in the current working
    directory (whitespace-separated). Rows whose 'User' is
    'group_notification' and messages equal to '<Media omitted>' are ignored;
    remaining messages are lower-cased and split on whitespace.

    Args:
        selected_user: A value of the 'User' column, or 'Overall' for all rows.
        df: Chat DataFrame with 'User' and 'Message' columns.

    Returns:
        A DataFrame with at most 20 rows and integer column labels 0 (word)
        and 1 (count), most frequent first. The two columns are present even
        when no words remain, so callers can index ``[0]`` / ``[1]`` safely.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # Explicit encoding so the word list is read the same way on every platform.
    with open('stop_hinglish.txt', 'r', encoding='utf-8') as file:
        stop_words = set(file.read().split())

    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]

    keep = (df['User'] != 'group_notification') & (df['Message'] != '<Media omitted>')

    words = (
        word
        for message in df.loc[keep, 'Message']
        for word in message.lower().split()
        if word not in stop_words
    )

    # Fixing the column labels keeps the frame's shape stable for empty input.
    return pd.DataFrame(Counter(words).most_common(20), columns=[0, 1])
def emojis_analysis(selected_user: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract and count emojis from messages for a specific user or overall.

    Emojis are located with ``emoji.emoji_list`` so that an emoji made of
    several code points (skin-tone modifiers, ZWJ sequences, flags) is counted
    as one emoji rather than being split into its component characters.

    Args:
        selected_user: A value of the 'User' column, or 'Overall' for all rows.
        df: Chat DataFrame with 'User' and 'Message' columns; missing messages
            are skipped.

    Returns:
        A DataFrame with integer column labels 0 (emoji) and 1 (count), most
        frequent first. The two columns are present even when no emoji is
        found, so callers can index ``[0]`` / ``[1]`` safely.
    """
    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]

    emoji_counts = Counter(
        match['emoji']
        for message in df['Message'].dropna()
        for match in emoji.emoji_list(message)
    )

    # Fixing the column labels keeps the frame's shape stable for empty input.
    return pd.DataFrame(emoji_counts.most_common(), columns=[0, 1])
def monthly_timeline(selected_user: str, df: pd.DataFrame):
    """
    Generate a timeline of message counts by month and year for the selected user.

    Args:
        selected_user: A value of the 'User' column, or 'Overall' for all rows.
        df: Chat DataFrame with 'User', 'Message', 'Year', 'month_num' and
            'Month' columns.

    Returns:
        A DataFrame with one row per ('Year', 'month_num', 'Month') group,
        sorted by those keys, with columns 'Year', 'month_num', 'Month',
        'Message' (count of non-null messages in the group) and 'time'
        (the label "<Month>-<Year>"). An empty selection yields an empty
        frame that still has all of these columns.
    """
    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]

    timeline = (
        df.groupby(['Year', 'month_num', 'Month'])['Message']
        .count()
        .reset_index()
    )

    # Vectorised concatenation: a row-wise apply(axis=1) returns a DataFrame
    # for an empty frame, which cannot be assigned to a single column.
    timeline['time'] = timeline['Month'].astype(str) + '-' + timeline['Year'].astype(str)
    return timeline
def daily_timeline(selected_user: str, df: pd.DataFrame):
    """
    Count messages per calendar day for one user or for the whole chat.

    Rows are restricted to ``selected_user`` unless it is 'Overall', then
    grouped by the 'only_date' column. The result is a DataFrame with columns
    'only_date' and 'Message', where 'Message' holds the number of non-null
    messages on that date.
    """
    scoped = df if selected_user == 'Overall' else df[df['User'] == selected_user]
    per_day = scoped.groupby('only_date')['Message'].count()
    return per_day.reset_index()
def week_activity_map(selected_user: str, df: pd.DataFrame):
    """
    Count messages per weekday for one user or for the whole chat.

    Returns the ``value_counts()`` of the 'day_name' column, taken over all
    rows when ``selected_user`` is 'Overall' and over that user's rows
    otherwise.
    """
    if selected_user == 'Overall':
        scoped = df
    else:
        scoped = df[df['User'] == selected_user]
    day_names = scoped['day_name']
    return day_names.value_counts()
def month_activity_map(selected_user: str, df: pd.DataFrame):
    """
    Count messages per month name for one user or for the whole chat.

    Returns the ``value_counts()`` of the 'Month' column, taken over all rows
    when ``selected_user`` is 'Overall' and over that user's rows otherwise.
    """
    if selected_user == 'Overall':
        scoped = df
    else:
        scoped = df[df['User'] == selected_user]
    month_names = scoped['Month']
    return month_names.value_counts()
def activity_heatmap(selected_user: str, df: pd.DataFrame):
    """
    Build a weekday-by-period table of message counts.

    Rows are restricted to ``selected_user`` unless it is 'Overall'. The
    result is a pivot table indexed by 'day_name' with one column per
    'period' value; each cell is the count of messages for that combination,
    and combinations with no messages are filled with 0.
    """
    scoped = df if selected_user == 'Overall' else df[df['User'] == selected_user]
    counts = pd.pivot_table(
        scoped,
        index='day_name',
        columns='period',
        values='Message',
        aggfunc='count',
    )
    return counts.fillna(0)
def perform_sentiment_analysis(data, selected_user):
    """
    Analyze sentiment of messages for one user or for the whole chat.

    Each message's polarity is taken from ``TextBlob(message).sentiment.polarity``
    and classified as "Positive" (polarity > 0.1), "Negative" (polarity < -0.1)
    or "Neutral" (anything in between).

    Args:
        data: Chat DataFrame with "User" and "Message" columns. It is not
            modified; the annotations are added to a copy.
        selected_user: A value of the "User" column, or "Overall" for all rows.

    Returns:
        A pair ``(sentiment_counts, data)``: the ``value_counts()`` of the
        "Sentiment" column, and the (filtered) copy of the input with the
        extra "Polarity" and "Sentiment" columns.
    """
    if selected_user != "Overall":
        data = data[data["User"] == selected_user]

    # Work on a copy: without it the "Overall" path would add columns to the
    # caller's DataFrame, and the filtered path would write to a slice.
    data = data.copy()

    # Calculate polarity and classify sentiment
    data["Polarity"] = data["Message"].apply(lambda x: TextBlob(x).sentiment.polarity)
    data["Sentiment"] = data["Polarity"].apply(
        lambda x: "Positive" if x > 0.1 else ("Negative" if x < -0.1 else "Neutral")
    )

    # Aggregate sentiment counts
    sentiment_counts = data["Sentiment"].value_counts()
    return sentiment_counts, data