Spaces:
Sleeping
Sleeping
| import re | |
| import pandas as pd | |
# Message-header layouts recognised in the exported chat text.  They are
# combined into ONE alternation so that every header is found in a single
# pass; trying them one after another let a later, narrower layout overwrite
# the result of an earlier one and silently merge messages together.
_HEADER_PATTERNS = (
    r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[ap]m -',       # 30/06/21, 1:46pm -
    r'\[\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]',      # [30/06/21, 13:46:05]
    r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} -',            # 30/06/21, 1:46 -
    r'\d{1,2}/\d{1,2}/\d{2}, \d{2}:\d{2} -',          # 6/30/21, 13:46 -
    r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[AP]M -',       # 30/06/21, 1:46PM -
    r'\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2} [AP]M -',  # 6/30/21, 1:46 PM -
)
_HEADER_RE = re.compile('|'.join(_HEADER_PATTERNS))

# Pulls the numeric fields (and optional am/pm marker) out of a matched header.
_HEADER_FIELDS_RE = re.compile(
    r'(\d{1,2})/(\d{1,2})/(\d{2}), (\d{1,2}):(\d{2})(?::(\d{2}))?'
    r'(?: ?([APap])[Mm])?'
)

# Canonical form every header is rewritten to before pandas parses it.
_STAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

# Rows whose "sender" text contains any of these substrings are dropped.
# ('changed the subject' is already covered by 'changed'; kept for clarity.)
_NOTICE_MARKERS = (
    'changed the subject',
    'message',
    'added',
    'created group',
    'left',
    'removed',
    'admin',
    'changed',
    'encrypted',
)

# Sender value that is always excluded (exact match, carried over unchanged).
_EXCLUDED_SENDER = '\u202a+91\xa095456\xa017572\u202c'


def _normalise_headers(headers, dayfirst):
    """Rewrite matched headers as ``YYYY-MM-DD HH:MM:SS`` strings."""
    fields = [_HEADER_FIELDS_RE.search(header).groups() for header in headers]
    if dayfirst is None:
        # One unambiguous date (first number > 12) fixes the order for the
        # whole export; an entirely ambiguous export stays month-first, which
        # is what pandas assumes by default.
        dayfirst = any(int(parts[0]) > 12 for parts in fields)

    stamps = []
    for lead, follow, year, hour, minute, second, meridiem in fields:
        day, month = (lead, follow) if dayfirst else (follow, lead)
        hour = int(hour)
        if meridiem:
            # 12-hour clock: 12am -> 00, 12pm -> 12, 1pm -> 13 ...
            hour = hour % 12 + (12 if meridiem in 'Pp' else 0)
        # Two-digit years are read as 20YY.
        stamps.append('20{}-{:0>2}-{:0>2} {:02d}:{}:{}'.format(
            year, month, day, hour, minute, second or '00'))
    return stamps


def _is_notice(sender):
    """Return True when a row's sender text marks it as a non-user entry."""
    if sender == _EXCLUDED_SENDER:
        return True
    return any(marker in sender for marker in _NOTICE_MARKERS)


def preprocess(data, dayfirst=None):
    """Parse exported chat text into a DataFrame of user messages.

    The text is split at every timestamp header (see ``_HEADER_PATTERNS``).
    Each entry is then split at its first ``':'`` into ``sender`` and
    ``message``; entries without a colon keep their whole text in ``sender``
    and get ``None`` as ``message``.  Entries whose sender text contains one
    of ``_NOTICE_MARKERS`` (or equals ``_EXCLUDED_SENDER``) are removed.

    The first entry of the export is always discarded.
    NOTE(review): presumably this is the introductory banner of the export;
    confirm that dropping it unconditionally is intended.

    Args:
        data: Full text of the chat export.
        dayfirst: ``True`` for day/month/year headers, ``False`` for
            month/day/year.  ``None`` (default) picks day-first when any
            header has a first number greater than 12, month-first otherwise.

    Returns:
        DataFrame with columns ``date`` (datetime), ``sender`` and
        ``message``.  Values are not whitespace-stripped, and the index keeps
        each entry's position in the export.  The frame is empty when no
        header is found or no entry survives the filters.

    Raises:
        ValueError: If a header contains an impossible date or time.
    """
    # Narrow no-break spaces (e.g. before "pm") would stop the headers from
    # matching, so they are removed up front.
    data = data.replace('\u202f', '')

    headers = _HEADER_RE.findall(data)
    if not headers:
        return pd.DataFrame(
            {'date': pd.to_datetime([]), 'sender': [], 'message': []})

    # Element 0 of the split is whatever precedes the first header.
    bodies = _HEADER_RE.split(data)[1:]

    senders = []
    texts = []
    for body in bodies:
        sender, colon, text = body.partition(':')
        senders.append(sender)
        texts.append(text if colon else None)

    frame = pd.DataFrame({
        'date': pd.to_datetime(
            _normalise_headers(headers, dayfirst), format=_STAMP_FORMAT),
        'sender': senders,
        'message': texts,
    })

    keep = [
        position > 0 and not _is_notice(sender)
        for position, sender in enumerate(senders)
    ]
    return frame.loc[pd.Series(keep, index=frame.index, dtype=bool)]