Update preprocessor.py
Browse files- preprocessor.py +4 -8
preprocessor.py
CHANGED
|
@@ -2,17 +2,13 @@ import re
|
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
def preprocess(data):
|
| 5 |
-
pattern = '\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}
|
| 6 |
-
|
| 7 |
-
|
| 8 |
messages_and_dates = re.findall(pattern, data)
|
| 9 |
-
messages = [message_group[0] for message_group in messages_and_dates] # Extract messages (first group)
|
| 10 |
-
dates = [message_group[1] for message_group in messages_and_dates] # Extract dates (second group)
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
|
| 15 |
-
|
| 16 |
|
| 17 |
users = []
|
| 18 |
messages = []
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
def preprocess(data):
|
| 5 |
+
pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}-\s' # Capture date and time in separate groups
|
|
|
|
|
|
|
| 6 |
messages_and_dates = re.findall(pattern, data)
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
messages = [message_group[0] for message_group in messages_and_dates]
|
| 9 |
+
dates = [message_group[1] or "NA" for message_group in messages_and_dates] # Assign "NA" for empty strings
|
| 10 |
|
| 11 |
+
df = pd.DataFrame({'user_message': messages, 'date': pd.to_datetime(dates, format='%d/%m/%Y, %H:%M - ')})
|
| 12 |
|
| 13 |
users = []
|
| 14 |
messages = []
|