Update preprocessor.py
Browse files
- preprocessor.py +13 -8
preprocessor.py
CHANGED
|
@@ -1,15 +1,20 @@
|
|
| 1 |
import re
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
-
|
| 5 |
-
pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}-\s' # Capture date and time in separate groups
|
| 6 |
-
messages_and_dates = re.findall(pattern, data)
|
| 7 |
-
|
| 8 |
-
messages = [message_group[0] for message_group in messages_and_dates]
|
| 9 |
-
dates = [message_group[1] or "NA" for message_group in messages_and_dates] # Assign "NA" for empty strings
|
| 10 |
-
|
| 11 |
-
df = pd.DataFrame({'user_message': messages, 'date': pd.to_datetime(dates, format='%d/%m/%Y, %H:%M - ')})
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
users = []
|
| 14 |
messages = []
|
| 15 |
for message in df['user_message']:
|
|
|
|
| 1 |
import re
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
+
import dateutil.parser
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
def preprocess(data):
|
| 7 |
+
messages = []
|
| 8 |
+
dates = []
|
| 9 |
+
for line in data.splitlines(): # Assuming data is line-based
|
| 10 |
+
try:
|
| 11 |
+
date_obj = dateutil.parser.parse(line.split(',')[0]) # Assuming date is in first part
|
| 12 |
+
messages.append(line.split(',')[1]) # Assuming message is in second part
|
| 13 |
+
dates.append(date_obj.strftime("%d/%m/%Y %H:%M")) # Format as desired
|
| 14 |
+
except (ValueError, IndexError):
|
| 15 |
+
pass # Handle parsing errors or missing data silently (optional)
|
| 16 |
+
df = pd.DataFrame({'user_message': messages, 'date': dates})
|
| 17 |
+
|
| 18 |
users = []
|
| 19 |
messages = []
|
| 20 |
for message in df['user_message']:
|