docker_project / preprocessor.py
Codingacademey
Add Flask app with Dockerfile
b0ae909
Raw
History Blame Contribute Delete
1.09 kB
import pandas as pd
import numpy as np
import re
# Timestamp prefix that opens every entry of the exported chat, e.g.
# "12/05/2023, 10:30 PM - ".  The optional separator before AM/PM is matched
# with \s so that a plain space, a narrow no-break space (U+202F) or no
# separator at all are all recognised.
_STAMP_RE = re.compile(
    r'(\d{1,2})/(\d{1,2})/(\d{2,4}),\s(\d{1,2}):(\d{2})\s?([AaPp][Mm])\s-\s'
)

# "<sender>: <text>" separator inside a single entry.
_SENDER_RE = re.compile(r'([\w\W]+?):\s')

# Layout produced by _canonical_stamp and consumed by pd.to_datetime.
_STAMP_FORMAT = '%d/%m/%Y, %I:%M %p'


def _canonical_stamp(match):
    """Rebuild a matched timestamp in the one layout ``_STAMP_FORMAT`` parses."""
    day, month, year, hour, minute, meridiem = match.groups()
    if len(year) == 2:
        # Two-digit years are expanded into the 2000s.
        year = '20' + year
    return f'{day}/{month}/{year}, {hour}:{minute} {meridiem.upper()}'


def preprocess(data):
    """Parse an exported chat transcript into a tidy DataFrame.

    Every entry is expected to start with a ``day/month/year, h:mm AM|PM - ``
    prefix (day first, 12-hour clock).  Text that precedes the first such
    prefix is ignored; text between two prefixes (including line breaks)
    belongs to the earlier entry.

    Args:
        data: Full text of the exported chat.

    Returns:
        pandas.DataFrame with one row per entry and the columns ``date``,
        ``user``, ``message``, ``year``, ``month`` (month name), ``day``,
        ``hour``, ``min`` and ``month_num``.  Entries without a
        ``"<sender>: "`` part get the user ``'group_notification'`` and keep
        their whole text as the message.

    Raises:
        ValueError: If a matched timestamp is not a valid day-first date
            (for example a month greater than 12 or a three-digit year).
    """
    stamps = list(_STAMP_RE.finditer(data))

    # An entry's text runs from the end of its own timestamp to the start of
    # the next timestamp, or to the end of the input for the last entry.
    stops = [stamp.start() for stamp in stamps[1:]] + [len(data)]
    bodies = [data[stamp.end():stop] for stamp, stop in zip(stamps, stops)]

    users = []
    messages = []
    for body in bodies:
        # maxsplit=1: only the first ": " separates the sender from the text,
        # so colons inside the message itself are preserved.
        parts = _SENDER_RE.split(body, maxsplit=1)
        if len(parts) == 3:
            users.append(parts[1])
            messages.append(parts[2])
        else:
            users.append('group_notification')
            messages.append(parts[0])

    # Timestamps are normalised first so that every variant accepted by
    # _STAMP_RE can be parsed with a single explicit format.
    dates = pd.to_datetime(
        [_canonical_stamp(stamp) for stamp in stamps],
        format=_STAMP_FORMAT,
    )

    df = pd.DataFrame({'date': dates, 'user': users, 'message': messages})
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['min'] = df['date'].dt.minute
    df['month_num'] = df['date'].dt.month
    return df