SandhyaRaghav committed on
Commit
a80ec58
·
verified ·
1 Parent(s): ff4d3b7

Upload 3 files

Browse files
Files changed (2) hide show
  1. src/helper.py +132 -0
  2. src/preprocessor.py +48 -0
src/helper.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urlextract import URLExtract
2
+ from wordcloud import WordCloud
3
+ import pandas as pd
4
+ from collections import Counter
5
+ import emoji
6
+
7
+
8
def fetch_stats(selected_user, df):
    """Return headline counts for a chat frame.

    Args:
        selected_user: A value of the ``user`` column to restrict to, or
            ``'Overall'`` to keep every row.
        df: DataFrame with at least ``user`` and ``message`` columns.

    Returns:
        A 4-tuple ``(message_count, word_count, media_count, link_count)``.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    texts = df['message']

    # One row per message.
    message_count = df.shape[0]

    # Whitespace-separated tokens summed over all messages.
    word_count = sum(len(text.split()) for text in texts)

    # Rows whose text is exactly the media placeholder.
    media_count = int((texts == '<Media omitted>\n').sum())

    # URLs found by urlextract, summed over all messages.
    extractor = URLExtract()
    link_count = sum(len(extractor.find_urls(text)) for text in texts)

    return message_count, word_count, media_count, link_count
30
+
31
def most_busy_users(df):
    """Rank users by how many rows they own.

    Args:
        df: DataFrame with a ``user`` column.

    Returns:
        A pair ``(top_users, user_percent)``: ``top_users`` is the head of
        ``value_counts()`` on ``user``; ``user_percent`` is a DataFrame with
        columns ``name`` and ``percent`` holding each user's share of rows,
        rounded to two decimals.
    """
    authors = df['user']
    top_users = authors.value_counts().head()

    share = authors.value_counts(normalize=True).mul(100).round(2)
    user_percent = share.reset_index()
    user_percent.columns = ['name', 'percent']

    return top_users, user_percent
36
+
37
+
38
def create_wordcloud(selected_user, df):
    """Build a word cloud from the messages of one user or the whole chat.

    Rows whose user is ``'group_notification'`` and rows whose message is the
    media placeholder are excluded. Each remaining message is lower-cased and
    stripped of stop words before being fed to ``WordCloud.generate``.

    Args:
        selected_user: A value of the ``user`` column to restrict to, or
            ``'Overall'`` to keep every row.
        df: DataFrame with at least ``user`` and ``message`` columns.

    Returns:
        The object returned by ``WordCloud.generate``.
    """
    # Close the file deterministically and keep the stop words as a set of
    # whole tokens: testing membership against the raw file text would be a
    # substring test and would also drop any word that merely occurs inside
    # a stop word.
    with open('stop_hinglish.txt', 'r', encoding='utf-8') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    def remove_stop_words(message):
        return " ".join(
            word for word in message.lower().split() if word not in stop_words
        )

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    # Work on a derived Series rather than assigning back into the filtered
    # slice, which avoids chained assignment on a view of the caller's frame.
    cleaned = temp['message'].apply(remove_stop_words)
    df_wc = wc.generate(cleaned.str.cat(sep=" "))
    return df_wc
60
+
61
def most_common_words(selected_user, df):
    """Return the 20 most frequent non-stop-words as a DataFrame.

    Rows whose user is ``'group_notification'`` and rows whose message is the
    media placeholder are excluded. Messages are lower-cased and split on
    whitespace before counting.

    Args:
        selected_user: A value of the ``user`` column to restrict to, or
            ``'Overall'`` to keep every row.
        df: DataFrame with at least ``user`` and ``message`` columns.

    Returns:
        DataFrame with column ``0`` (word) and column ``1`` (count), ordered
        by descending count. Empty (but still with both columns) when no
        words remain.
    """
    # Close the file deterministically and keep the stop words as a set of
    # whole tokens: testing membership against the raw file text would be a
    # substring test and would also drop any word that merely occurs inside
    # a stop word.
    with open('stop_hinglish.txt', 'r', encoding='utf-8') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    words = [
        word
        for message in temp['message']
        for word in message.lower().split()
        if word not in stop_words
    ]

    # Explicit column labels keep columns 0 and 1 present even when there is
    # nothing to count, so callers indexing them do not hit a KeyError.
    most_common_df = pd.DataFrame(Counter(words).most_common(20), columns=[0, 1])
    return most_common_df
81
+
82
+
83
def emoji_helper(selected_user, df):
    """Count emoji characters used in the messages.

    Each message is scanned character by character and every character for
    which ``emoji.is_emoji`` is true is tallied.

    Args:
        selected_user: A value of the ``user`` column to restrict to, or
            ``'Overall'`` to keep every row.
        df: DataFrame with at least ``user`` and ``message`` columns.

    Returns:
        DataFrame built from the ``(emoji, count)`` pairs, most frequent
        first.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    tally = Counter(
        char
        for text in df['message']
        for char in text
        if emoji.is_emoji(char)
    )

    return pd.DataFrame(tally.most_common())
95
+
96
def monthly_timeline(selected_user, df):
    """Count messages per calendar month.

    Args:
        selected_user: A value of the ``user`` column to restrict to, or
            ``'Overall'`` to keep every row.
        df: DataFrame with ``user``, ``year``, ``month_num``, ``month`` and
            ``message`` columns.

    Returns:
        DataFrame with one row per ``(year, month_num, month)`` group, the
        message count in ``message``, and a ``time`` label of the form
        ``"<month>-<year>"``.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    grouped = df.groupby(['year', 'month_num', 'month'])
    timeline = grouped['message'].count().reset_index()

    # Human-readable label for each group, e.g. "January-2023".
    timeline['time'] = [
        month + "-" + str(year)
        for month, year in zip(timeline['month'], timeline['year'])
    ]

    return timeline
110
+
111
def daily_timeline(selected_user, df):
    """Count messages per value of ``only_date``.

    Args:
        selected_user: A value of the ``user`` column to restrict to, or
            ``'Overall'`` to keep every row.
        df: DataFrame with ``user``, ``only_date`` and ``message`` columns.

    Returns:
        DataFrame with columns ``only_date`` and ``message`` (the count).
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]

    per_day = scoped.groupby('only_date')['message'].count()
    return per_day.reset_index()
119
+
120
def week_activity_map(selected_user, df):
    """Return how many rows fall on each ``day_name`` value.

    Args:
        selected_user: A value of the ``user`` column to restrict to, or
            ``'Overall'`` to keep every row.
        df: DataFrame with ``user`` and ``day_name`` columns.

    Returns:
        The ``value_counts()`` Series of the ``day_name`` column.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return scoped['day_name'].value_counts()
126
+
127
def month_activity_map(selected_user, df):
    """Return how many rows fall in each ``month`` value.

    Args:
        selected_user: A value of the ``user`` column to restrict to, or
            ``'Overall'`` to keep every row.
        df: DataFrame with ``user`` and ``month`` columns.

    Returns:
        The ``value_counts()`` Series of the ``month`` column.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return scoped['month'].value_counts()
src/preprocessor.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+
4
def preprocess(data):
    """Parse exported chat text into a per-message DataFrame.

    The text is split on timestamp prefixes of the form
    ``d/m/yy, h:mm AM - `` (the layout looks like a WhatsApp export; the
    space before AM/PM may be a regular, narrow no-break or no-break space).
    Day-first order is assumed; both two- and four-digit years are accepted.

    Args:
        data: The full chat export as a single string.

    Returns:
        DataFrame with columns ``date``, ``user``, ``message``, ``only_date``,
        ``year``, ``month_num``, ``month``, ``day``, ``day_name``, ``hour``
        and ``minute``; or ``None`` if the timestamps cannot be parsed.
        Entries without a ``"<sender>: "`` prefix are attributed to the
        pseudo-user ``'group_notification'``.
    """
    print("Preprocess started")

    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s(?:1[0-2]|0?[1-9]):[0-5][0-9][\s\u202f\u00a0]?(?:AM|PM|am|pm)\s-\s'
    # Everything before the first timestamp is discarded by the [1:] slice.
    messages = re.split(pattern, data)[1:]
    date = re.findall(pattern, data)
    print(f"Found {len(messages)} messages and {len(date)} dates")

    # Normalise exotic spaces so a single strptime format can match.
    dates = [d.replace('\u202f', ' ').replace('\u00a0', ' ') for d in date]
    df = pd.DataFrame({'user_message': messages, 'message_date': dates})

    # The timestamp regex admits 2- to 4-digit years, so try the two-digit
    # format first and fall back to the four-digit one.
    parsed = None
    last_error = None
    for fmt in ('%d/%m/%y, %I:%M %p - ', '%d/%m/%Y, %I:%M %p - '):
        try:
            parsed = pd.to_datetime(df['message_date'], format=fmt)
        except (ValueError, TypeError) as e:
            last_error = e
        else:
            break
    if parsed is None:
        print("Date parsing error:", last_error)
        return None
    df['message_date'] = parsed

    df.rename(columns={'message_date': 'date'}, inplace=True)

    sender_split = re.compile(r'([\w\W]+?):\s')
    users = []
    messages_list = []
    for message in df['user_message']:
        # Split only on the FIRST ": " so that a message body which itself
        # contains ": " is kept intact instead of being chopped and rejoined.
        entry = sender_split.split(message, maxsplit=1)
        if len(entry) > 1:  # user exists
            users.append(entry[1])
            messages_list.append(entry[2])
        else:
            users.append('group_notification')
            messages_list.append(entry[0])

    df['user'] = users
    df['message'] = messages_list

    df.drop(columns=['user_message'], inplace=True)
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    return df
48
+