# WhatsappGroupAnalysis / preprocess.py
# Hosting-page residue captured with this file: sabirbagwan's picture,
# "Update preprocess.py", 2e6e05f
import re
import pandas as pd
# Timestamp prefixes recognised at the start of each exported chat line.
# Order matters only for ties: when two layouts match equally often, the
# later one is used.
_TIMESTAMP_PATTERNS = tuple(
    re.compile(layout)
    for layout in (
        r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[ap]m -',
        r'\[\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]',
        r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} -',
        r'\d{1,2}/\d{1,2}/\d{2}, \d{2}:\d{2} -',
        r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[AP]M -',
        r'\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2} [APM]{2} -',
    )
)

# Characters that decorate a matched timestamp but are not part of the date.
_STAMP_DECORATION = str.maketrans('', '', '[]-')

# A row is dropped when its sender text contains any of these substrings.
_SYSTEM_MARKERS = (
    'changed the subject',
    'message',
    'added',
    'created group',
    'left',
    'removed',
    'admin',
    'changed',
    'encrypted',
)

# Sender value that is always excluded (exact, whole-string comparison).
# NOTE(review): the sender text produced by the split normally starts with a
# space, so this exact comparison may never match -- confirm the intent.
_EXCLUDED_SENDER = '\u202a+91\xa095456\xa017572\u202c'


def _split_chat(text):
    """Return ``(timestamps, bodies)`` using the best-matching layout.

    The layout that matches the most timestamps in *text* is used; on a tie
    the later layout in ``_TIMESTAMP_PATTERNS`` is chosen. Two empty lists
    are returned when no layout matches.
    """
    chosen = None
    chosen_hits = []
    for layout in _TIMESTAMP_PATTERNS:
        hits = layout.findall(text)
        if hits and len(hits) >= len(chosen_hits):
            chosen, chosen_hits = layout, hits
    if chosen is None:
        return [], []
    # Element 0 of the split is whatever precedes the first timestamp.
    return chosen_hits, chosen.split(text)[1:]


def _is_user_sender(sender):
    """Return True when *sender* passes every sender-based filter."""
    if sender == _EXCLUDED_SENDER:
        return False
    return not any(marker in sender for marker in _SYSTEM_MARKERS)


def preprocess(data):
    """Turn the raw text of an exported chat into a message DataFrame.

    Args:
        data: Full text of the export as a single string.

    Returns:
        A DataFrame with the columns ``date`` (parsed timestamp), ``sender``
        (text before the first ``:`` of the line body) and ``message`` (text
        after it, or ``None`` when the body has no ``:``). The first parsed
        entry is always discarded, and rows whose sender text contains one of
        the system-notice markers are removed. Row labels are the positions
        of the entries in the export and are not renumbered. An empty frame
        with the same three columns is returned when no timestamp is found.

    NOTE(review): dates are parsed with the default ``pd.to_datetime``
    settings, so day/month order for ambiguous dates such as ``01/02/21``
    depends on pandas' inference -- confirm against the export locale.
    """
    # Narrow no-break spaces (U+202F) would stop the layouts from matching.
    data = data.replace('\u202f', '')
    stamps, bodies = _split_chat(data)

    cleaned = [stamp.translate(_STAMP_DECORATION).strip() for stamp in stamps]
    halves = [body.split(':', 1) for body in bodies]
    frame = pd.DataFrame({
        'date': pd.to_datetime(pd.Series(cleaned, dtype=object)),
        'sender': pd.Series([half[0] for half in halves], dtype=object),
        'message': pd.Series(
            [half[1] if len(half) > 1 else None for half in halves],
            dtype=object,
        ),
    })

    # The first entry of the export is never reported.
    frame = frame.iloc[1:]
    keep = pd.Series(
        [_is_user_sender(sender) for sender in frame['sender']],
        index=frame.index,
        dtype=bool,
    )
    return frame.loc[keep, ['date', 'sender', 'message']]