import re

import pandas as pd

# Timestamp prefixes recognised at the start of each exported message.
# Example shapes, in order:
#   30/06/21, 1:46pm -
#   [30/06/21, 13:46:05]
#   30/06/21, 1:46 -
#   1/6/21, 13:46 -
#   30/06/21, 1:46PM -
#   1/6/21, 1:46 PM -
# Order matters only for ties: when two patterns match equally often the
# later one is used, which is the precedence the original code had.
_TIMESTAMP_PATTERNS = tuple(
    re.compile(raw)
    for raw in (
        r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[ap]m -',
        r'\[\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]',
        r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} -',
        r'\d{1,2}/\d{1,2}/\d{2}, \d{2}:\d{2} -',
        r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[AP]M -',
        r'\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2} [APM]{2} -',
    )
)

# Characters stripped from a matched timestamp before it is parsed.
_DATE_NOISE = str.maketrans('', '', '[]-')

# A row is dropped when its "sender" field contains any of these substrings
# (case-sensitive). Rows without a ':' keep their whole text in the sender
# field, so this is what removes notices such as "X added Y".
_SYSTEM_SENDER_KEYWORDS = (
    'changed the subject',
    'message',
    'added',
    'created group',
    'left',
    'removed',
    'admin',
    'changed',
    'encrypted',
)
_SYSTEM_SENDER_REGEX = '|'.join(re.escape(word) for word in _SYSTEM_SENDER_KEYWORDS)

# Sender value that was hard-coded in the original filter; kept as the default
# so existing callers see the same rows.
_DEFAULT_EXCLUDED_SENDERS = ('\u202a+91\xa095456\xa017572\u202c',)

_OUTPUT_COLUMNS = ['date', 'sender', 'message']


def _select_timestamp_pattern(text):
    """Return the compiled pattern with the most matches in *text*, or None."""
    best_pattern = None
    best_count = 0
    for pattern in _TIMESTAMP_PATTERNS:
        count = len(pattern.findall(text))
        # ">=" lets a later pattern win a tie (original precedence), while a
        # later pattern that only matches a subset of the lines can no longer
        # replace a more complete split.
        if count and count >= best_count:
            best_pattern, best_count = pattern, count
    return best_pattern


def _empty_frame():
    """Return a zero-row frame with the same columns as a normal result."""
    return pd.DataFrame(
        {
            'date': pd.Series(dtype='datetime64[ns]'),
            'sender': pd.Series(dtype='object'),
            'message': pd.Series(dtype='object'),
        }
    )


def preprocess(data: str, *, excluded_senders=_DEFAULT_EXCLUDED_SENDERS) -> pd.DataFrame:
    """Parse an exported chat transcript into a date/sender/message table.

    The text is split on whichever known timestamp format matches most often.
    The first message of the transcript is always discarded. Each remaining
    message is split at its first ':' into sender and message text; the
    leading whitespace left over from the timestamp is preserved. Rows whose
    sender field contains a system-notice keyword, or equals one of
    *excluded_senders*, are removed.

    Args:
        data: Full text of the exported chat.
        excluded_senders: Iterable of exact sender strings to drop. The
            comparison is exact, including any surrounding whitespace or
            invisible characters.

    Returns:
        DataFrame with columns ``date`` (datetime), ``sender`` and
        ``message``. ``message`` is null for rows that had no ':'. The frame
        is empty, with the same columns, when no timestamp is recognised or
        nothing remains after the first message is dropped.

    Raises:
        ValueError: Propagated from ``pd.to_datetime`` if a matched timestamp
            cannot be parsed.
    """
    # Drop narrow no-break spaces so e.g. "1:46\u202fPM" reads as "1:46PM".
    data = data.replace('\u202f', '')

    pattern = _select_timestamp_pattern(data)
    if pattern is None:
        return _empty_frame()

    stamps = [raw.translate(_DATE_NOISE).strip() for raw in pattern.findall(data)]
    # split() yields the text before the first timestamp at index 0; skipping
    # it keeps bodies aligned one-to-one with the timestamps.
    bodies = pattern.split(data)[1:]

    df = pd.DataFrame({'date': stamps, 'messageog': bodies})
    # NOTE(review): day/month order is left to pandas' inference, so an
    # ambiguous date such as 05/06/21 may be read month-first - confirm
    # against the export locale.
    df['date'] = pd.to_datetime(df['date'])

    # The first message is always discarded (presumably the export's leading
    # notice); copy so the column assignments below act on an independent frame.
    df = df.iloc[1:].copy()
    if df.empty:
        return _empty_frame()

    # reindex guarantees a message column even when no row contains ':'.
    parts = df['messageog'].str.split(':', n=1, expand=True).reindex(columns=[0, 1])
    df['sender'] = parts[0]
    df['message'] = parts[1]

    keep = ~df['sender'].isin(list(excluded_senders))
    keep &= ~df['sender'].str.contains(_SYSTEM_SENDER_REGEX)
    return df.loc[keep, _OUTPUT_COLUMNS]