Spaces:
Sleeping
Sleeping
File size: 3,286 Bytes
613a736 bbaf876 613a736 bbaf876 e08d181 08b0576 0d1237a 2e6e05f 613a736 bbaf876 e08d181 08b0576 0d1237a 2e6e05f 0d1237a bbaf876 613a736 e08d181 613a736 d4e8bc1 613a736 bbaf876 613a736 e08d181 1bd404f 613a736 e08d181 613a736 bbaf876 4e83b01 dee2562 bbaf876 613a736 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import re
import pandas as pd
# Timestamp header layouts recognised at the start of a chat record.
_HEADER_PATTERNS = (
    r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[ap]m -',        # 30/06/21, 1:46pm -
    r'\[\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]',       # [30/06/21, 13:46:05]
    r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} -',             # 30/06/21, 9:15 -
    r'\d{1,2}/\d{1,2}/\d{2}, \d{2}:\d{2} -',           # 3/6/21, 13:46 -
    r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[AP]M -',        # 30/06/21, 1:46PM -
    r'\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2} [AP]M -',   # 6/30/21, 1:46 PM -
)

# One scanner for every layout.  Testing the layouts one after another and
# letting the last hit win allowed a narrower layout (two-digit hours only) to
# override a broader one and silently merge messages together.
_HEADER_RE = re.compile('|'.join(_HEADER_PATTERNS))

# Pulls the numeric fields back out of a header matched by _HEADER_RE.
_STAMP_FIELDS_RE = re.compile(
    r'(\d{1,2})/(\d{1,2})/(\d{2}), (\d{1,2}):(\d{2})(?::(\d{2}))? ?([AaPp][Mm])?'
)

_ISO_FORMAT = '%Y-%m-%d %H:%M:%S'

# A record is discarded when the text before its first ':' contains any of
# these substrings (group notices such as joins, leaves, subject changes).
_SYSTEM_MARKERS = (
    'changed the subject',
    'message',
    'added',
    'created group',
    'left',
    'removed',
    'admin',
    'changed',
    'encrypted',
)

# Records whose sender field equals this value exactly are discarded.
# NOTE(review): the sender field keeps the whitespace that follows the header,
# so this exact comparison may never match in practice - confirm the intent.
_EXCLUDED_SENDER = '\u202a+91\xa095456\xa017572\u202c'


def _split_records(text):
    """Return ``(header, body)`` pairs for every timestamp header in *text*.

    ``body`` is the raw text between one header and the next (or the end of
    the input); anything before the first header is ignored.
    """
    hits = list(_HEADER_RE.finditer(text))
    stops = [hit.start() for hit in hits[1:]] + [len(text)]
    return [(hit.group(), text[hit.end():stop]) for hit, stop in zip(hits, stops)]


def _normalise_timestamps(headers):
    """Convert matched headers to ``YYYY-MM-DD HH:MM:SS`` strings.

    The day/month order is decided once for the whole input: if any header has
    a first field above 12 the input is read as day-first, otherwise as
    month-first.  This keeps every row of one export on the same convention.
    """
    fields = [_STAMP_FIELDS_RE.search(header).groups() for header in headers]
    day_first = any(int(parts[0]) > 12 for parts in fields)

    stamps = []
    for first, second, year, hour, minute, seconds, meridiem in fields:
        day, month = (first, second) if day_first else (second, first)
        day, month = int(day), int(month)
        if month > 12 >= day:
            # This row contradicts the file-wide order; swap rather than fail.
            day, month = month, day

        hour = int(hour)
        if meridiem:
            # 12-hour clock: 12am -> 0, 12pm -> 12, 1pm -> 13.
            hour = hour % 12 + (12 if meridiem.lower() == 'pm' else 0)

        # Two-digit years are taken to be 20xx.
        stamps.append('%04d-%02d-%02d %02d:%02d:%02d' % (
            2000 + int(year), month, day, hour, int(minute), int(seconds or 0),
        ))
    return stamps


def preprocess(data):
    """Parse an exported chat transcript into a DataFrame of messages.

    The header layouts and system notices handled here look like those of a
    WhatsApp text export.

    Args:
        data: Full text of the export as a single string.

    Returns:
        A ``pandas.DataFrame`` with columns ``date`` (datetime64), ``sender``
        and ``message``.  ``sender`` is the raw text before the first ``':'``
        of a record and ``message`` the raw text after it (``None`` when the
        record has no ``':'``); surrounding whitespace is left untouched.
        Row labels are the record's position in the export.  An input with no
        recognisable header yields an empty frame with the same columns.

    Raises:
        ValueError: If a header holds an impossible date or time.
    """
    # Exports may put a narrow no-break space before am/pm; drop it so the
    # header layouts above match.
    text = data.replace('\u202f', '')

    records = _split_records(text)
    stamps = _normalise_timestamps([header for header, _ in records])

    labels, dates, senders, messages = [], [], [], []
    for label, ((_, body), stamp) in enumerate(zip(records, stamps)):
        if label == 0:
            # The first record of the export is always discarded.
            # NOTE(review): presumably the export's opening notice - confirm.
            continue

        sender, colon, message = body.partition(':')
        if sender == _EXCLUDED_SENDER:
            continue
        if any(marker in sender for marker in _SYSTEM_MARKERS):
            continue

        labels.append(label)
        dates.append(stamp)
        senders.append(sender)
        messages.append(message if colon else None)

    # Explicit dtypes keep the column types stable even when nothing survives.
    frame = pd.DataFrame({
        'date': pd.Series(pd.to_datetime(dates, format=_ISO_FORMAT)),
        'sender': pd.Series(senders, dtype=object),
        'message': pd.Series(messages, dtype=object),
    })
    frame.index = pd.Index(labels, dtype='int64')
    return frame[['date', 'sender', 'message']]
|