File size: 5,885 Bytes
7089d06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import matplotlib.pyplot as plt
from urlextract import URLExtract
from wordcloud import WordCloud
import pandas as pd
from collections import Counter
import emoji

from textblob import TextBlob 

# Module-level URLExtract instance: built once when this module is imported
# and reused by fetch_stats for every message it scans for links.
url_extractor = URLExtract()

def fetch_stats(df: pd.DataFrame, selected_user: str):
    """
    Summarise chat volume for one user, or for everyone when 'Overall' is given.

    Returns a 4-tuple of (message count, total whitespace-separated words,
    number of messages equal to '<Media omitted>', total URLs found by the
    module-level ``url_extractor``).
    """
    def count_words(text):
        return len(text.split())

    def count_urls(text):
        return len(url_extractor.find_urls(text))

    # Narrow to the requested user's rows unless the aggregate view is wanted.
    if selected_user == 'Overall':
        scoped = df
    else:
        scoped = df[df['User'] == selected_user]

    messages = scoped['Message']

    total_messages = len(scoped)
    total_words = messages.apply(count_words).sum()
    media_rows = scoped[messages == '<Media omitted>']
    total_media = len(media_rows)
    total_links = messages.apply(count_urls).sum()

    return total_messages, total_words, total_media, total_links

def top_active_users(df: pd.DataFrame, top_n: int = 5):
    """
    Rank users by how many rows they have in the 'User' column.

    Returns a tuple of:
      * a Series with the message counts of the ``top_n`` most frequent users;
      * a DataFrame with columns 'User' and 'Percentage' giving every user's
        share of all rows, in percent, rounded to two decimals.
    """
    authors = df['User']

    leaders = authors.value_counts().head(top_n)

    share = authors.value_counts(normalize=True)
    share_table = share.mul(100).round(2).reset_index()
    share_table.columns = ['User', 'Percentage']

    return leaders, share_table

def create_wordcloud(selected_user: str, df: pd.DataFrame):
    """
    Generate a word cloud from the selected user's messages.

    Args:
        selected_user: A value of the 'User' column, or 'Overall' to use
            every user's messages.
        df: Chat DataFrame with at least 'User' and 'Message' columns.

    Returns:
        The object returned by ``WordCloud.generate`` for the remaining text.

    Rows whose user is 'group_notification' or whose message is
    '<Media omitted>' are ignored, and words listed in 'stop_hinglish.txt'
    (read from the current working directory) are removed. The caller's
    DataFrame is not modified.

    NOTE(review): if no words remain, WordCloud.generate is expected to raise
    ValueError -- confirm against the installed wordcloud version.
    """
    # Explicit encoding so the stop-word list is read the same way regardless
    # of the platform's default locale encoding.
    with open('stop_hinglish.txt', 'r', encoding='utf-8') as file:
        stop_words = set(file.read().split())

    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]

    keep = (df['User'] != 'group_notification') & (df['Message'] != '<Media omitted>')

    # Build a standalone Series of cleaned text rather than assigning back into
    # a column of a filtered slice, which is chained assignment and triggers
    # pandas' SettingWithCopyWarning.
    cleaned = df.loc[keep, 'Message'].apply(
        lambda msg: " ".join(word for word in msg.lower().split() if word not in stop_words)
    )

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    wordcloud = wc.generate(cleaned.str.cat(sep=" "))

    return wordcloud

def most_common_word(selected_user: str, df: pd.DataFrame):
    """
    Get the most common words used by the selected user, excluding stop words.

    Args:
        selected_user: A value of the 'User' column, or 'Overall' to use
            every user's messages.
        df: Chat DataFrame with at least 'User' and 'Message' columns.

    Returns:
        A DataFrame of up to 20 rows with column 0 holding the lower-cased
        word and column 1 its count, most frequent first. Both columns are
        present even when no words remain, so callers can index them
        unconditionally.

    Rows whose user is 'group_notification' or whose message is
    '<Media omitted>' are ignored. Stop words are read from
    'stop_hinglish.txt' in the current working directory.
    """
    # Explicit encoding so the stop-word list is read the same way regardless
    # of the platform's default locale encoding.
    with open('stop_hinglish.txt', 'r', encoding='utf-8') as file:
        stop_words = set(file.read().split())

    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]

    filtered_messages = df[(df['User'] != 'group_notification') & (df['Message'] != '<Media omitted>')]
    word_counter = Counter(
        word
        for message in filtered_messages['Message']
        for word in message.lower().split()
        if word not in stop_words
    )

    # Pin the column labels: without them an empty result has no columns at
    # all and any caller reading column 0 or 1 fails with KeyError.
    word_counts = pd.DataFrame(word_counter.most_common(20), columns=[0, 1])

    return word_counts

def emojis_analysis(selected_user: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract and analyze emojis from messages for a specific user or overall.

    Args:
        selected_user: A value of the 'User' column, or 'Overall' to use
            every user's messages.
        df: Chat DataFrame with at least 'User' and 'Message' columns.

    Returns:
        A DataFrame with column 0 holding the emoji character and column 1
        its count, most frequent first. Both columns are present even when
        no emoji are found, so callers can index them unconditionally.
    """
    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]

    # Missing messages are skipped. NOTE: matching is done one code point at a
    # time, so an emoji written as several code points is not counted as a
    # single unit; only its parts that are themselves keys of EMOJI_DATA are.
    emoji_counts = Counter(
        char
        for message in df['Message'].dropna()
        for char in message
        if char in emoji.EMOJI_DATA
    )

    # Pin the column labels: without them an empty result has no columns at
    # all and any caller reading column 0 or 1 fails with KeyError.
    emoji_df = pd.DataFrame(emoji_counts.most_common(), columns=[0, 1])

    return emoji_df

def monthly_timeline(selected_user: str, df: pd.DataFrame):
    """
    Generate a timeline of message counts by month and year for the selected user.

    Args:
        selected_user: A value of the 'User' column, or 'Overall' to use
            every user's messages.
        df: Chat DataFrame with 'User', 'Year', 'month_num', 'Month' and
            'Message' columns.

    Returns:
        A DataFrame with one row per (Year, month_num, Month) group, in the
        sorted order produced by groupby, with columns 'Year', 'month_num',
        'Month', 'Message' (the non-null message count) and 'time' (a label
        of the form '<Month>-<Year>'). An empty selection yields an empty
        DataFrame that still has all of these columns.
    """
    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]

    timeline = df.groupby(['Year', 'month_num', 'Month'])['Message'].count().reset_index()

    # Vectorized label construction. A row-wise apply(axis=1) returns a
    # DataFrame instead of a Series when the timeline is empty, which makes
    # the column assignment fail; string concatenation works for any length.
    timeline['time'] = timeline['Month'].astype(str) + '-' + timeline['Year'].astype(str)

    return timeline

def daily_timeline(selected_user: str, df: pd.DataFrame):
    """
    Count messages per calendar day for one user, or for all users when
    'Overall' is given.

    Returns a DataFrame with columns 'only_date' and 'Message', where
    'Message' is the number of non-null messages on that date.
    """
    scoped = df if selected_user == 'Overall' else df[df['User'] == selected_user]

    per_day = scoped.groupby('only_date')['Message'].count()

    return per_day.reset_index()

def week_activity_map(selected_user: str, df: pd.DataFrame):
    """
    Count messages per weekday name for one user, or for all users when
    'Overall' is given.

    Returns the value counts of the 'day_name' column as a Series.
    """
    if selected_user == 'Overall':
        return df['day_name'].value_counts()

    from_user = df['User'] == selected_user
    return df.loc[from_user, 'day_name'].value_counts()

def month_activity_map(selected_user: str, df: pd.DataFrame):
    """
    Count messages per month name for one user, or for all users when
    'Overall' is given.

    Returns the value counts of the 'Month' column as a Series.
    """
    scoped = df if selected_user == 'Overall' else df[df['User'] == selected_user]
    months = scoped['Month']
    return months.value_counts()

def activity_heatmap(selected_user: str, df: pd.DataFrame):
    """
    Build a weekday-by-period table of message counts for one user, or for
    all users when 'Overall' is given.

    Returns a DataFrame indexed by 'day_name' with one column per 'period';
    combinations that have no messages are filled with 0.
    """
    scoped = df if selected_user == 'Overall' else df[df['User'] == selected_user]

    counts = pd.pivot_table(
        scoped,
        values='Message',
        index='day_name',
        columns='period',
        aggfunc='count',
    )

    return counts.fillna(0)

# Sentiment analysis of message text using TextBlob polarity scores.
def perform_sentiment_analysis(data, selected_user):
    """
    Analyze sentiment of messages.

    Args:
        data: Chat DataFrame with at least 'User' and 'Message' columns.
        selected_user: A value of the 'User' column, or "Overall" to analyze
            every row.

    Returns:
        A tuple ``(sentiment_counts, data)`` where ``data`` is a copy of the
        selected rows with two added columns -- 'Polarity' (TextBlob polarity
        of the message) and 'Sentiment' ("Positive" above 0.1, "Negative"
        below -0.1, otherwise "Neutral") -- and ``sentiment_counts`` is the
        value counts of the 'Sentiment' column.

    The DataFrame passed in by the caller is never modified.
    """
    if selected_user != "Overall":
        data = data[data["User"] == selected_user]

    # Work on an explicit copy: otherwise the "Overall" path adds columns to
    # the caller's DataFrame and the per-user path writes into a filtered
    # slice, which triggers pandas' SettingWithCopyWarning.
    data = data.copy()

    # Calculate polarity and classify sentiment
    data["Polarity"] = data["Message"].apply(lambda x: TextBlob(x).sentiment.polarity)
    data["Sentiment"] = data["Polarity"].apply(
        lambda x: "Positive" if x > 0.1 else ("Negative" if x < -0.1 else "Neutral")
    )

    # Aggregate sentiment counts
    sentiment_counts = data["Sentiment"].value_counts()

    return sentiment_counts, data