Spaces:
Sleeping
Sleeping
Create preprocessor.py
Browse files- preprocessor.py +51 -0
preprocessor.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
def preprocess(data):
    """Parse raw exported chat text into a per-message DataFrame.

    Every entry in ``data`` is expected to start with a timestamp header of
    the form ``"<d>/<m>/<year>, <H>:<MM> - "``. The date is read day-first.
    A 4-digit year is tried first; if that parse fails, a 2-digit year is
    tried, because the header regex accepts both.
    NOTE(review): the input is presumably a WhatsApp chat export -- confirm
    against the caller.

    Args:
        data: Full text of the chat export as a single string.

    Returns:
        A DataFrame with one row per entry and the columns ``date``,
        ``user``, ``message``, ``only_date``, ``year``, ``month_num``,
        ``month``, ``day``, ``day_name``, ``hour``, ``minute`` and ``period``.
        Entries with no ``"<sender>: "`` prefix get the user
        ``'group_notification'``.

    Raises:
        ValueError: If the timestamp headers match neither the 4-digit-year
            nor the 2-digit-year day-first format.
    """
    # Timestamp header that opens every entry, e.g. "12/05/2021, 10:30 - ".
    header_pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    # The pattern has no capture group, so split() yields exactly one more
    # piece than findall(); dropping the text before the first header keeps
    # bodies and headers aligned one-to-one.
    bodies = re.split(header_pattern, data)[1:]
    headers = re.findall(header_pattern, data)

    df = pd.DataFrame({'user_message': bodies, 'message_date': headers})

    # Convert the header text to datetimes. The header regex allows 2-digit
    # years, which '%Y' rejects, so fall back to '%y' instead of failing.
    try:
        df['message_date'] = pd.to_datetime(
            df['message_date'], format='%d/%m/%Y, %H:%M - ')
    except ValueError:
        df['message_date'] = pd.to_datetime(
            df['message_date'], format='%d/%m/%y, %H:%M - ')

    df.rename(columns={'message_date': 'date'}, inplace=True)

    # Separate "<sender>: <text>" into sender and text. maxsplit=1 is
    # essential: without it any later ": " inside the text is treated as
    # another separator and the message body gets mangled.
    sender_pattern = re.compile(r'([\w\W]+?):\s')
    users = []
    messages = []
    for body in df['user_message']:
        parts = sender_pattern.split(body, maxsplit=1)
        if len(parts) > 1:
            # parts == ['', sender, text]
            users.append(parts[1])
            messages.append(parts[2])
        else:
            # No "<sender>: " prefix was found in this entry.
            users.append('group_notification')
            messages.append(parts[0])

    df['user'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    # Hour-bucket label "<start>-<end>". Midnight is written as "00" on
    # whichever side it falls; the label strings are kept exactly as before
    # because other code may key on them.
    period = []
    for hour in df['hour']:
        if hour == 23:
            period.append(str(hour) + "-" + "00")
        elif hour == 0:
            period.append("00" + "-" + str(hour + 1))
        else:
            period.append(str(hour) + "-" + str(hour + 1))

    df['period'] = period

    return df
|