SurajJha21 committed on
Commit
73bbfe4
·
verified ·
1 Parent(s): b448d70

Update preprocessor.py

Browse files
Files changed (1) hide show
  1. preprocessor.py +13 -8
preprocessor.py CHANGED
@@ -1,15 +1,20 @@
1
  import re
2
  import pandas as pd
3
 
4
- def preprocess(data):
5
- pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}-\s' # Capture date and time in separate groups
6
- messages_and_dates = re.findall(pattern, data)
7
-
8
- messages = [message_group[0] for message_group in messages_and_dates]
9
- dates = [message_group[1] or "NA" for message_group in messages_and_dates] # Assign "NA" for empty strings
10
-
11
- df = pd.DataFrame({'user_message': messages, 'date': pd.to_datetime(dates, format='%d/%m/%Y, %H:%M - ')})
12
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  users = []
14
  messages = []
15
  for message in df['user_message']:
 
1
  import re
2
  import pandas as pd
3
 
4
+ import dateutil.parser
 
 
 
 
 
 
 
5
 
6
+ def preprocess(data):
7
+ messages = []
8
+ dates = []
9
+ for line in data.splitlines(): # Assuming data is line-based
10
+ try:
11
+ date_obj = dateutil.parser.parse(line.split(',')[0]) # Assuming date is in first part
12
+ messages.append(line.split(',')[1]) # Assuming message is in second part
13
+ dates.append(date_obj.strftime("%d/%m/%Y %H:%M")) # Format as desired
14
+ except (ValueError, IndexError):
15
+ pass # Handle parsing errors or missing data silently (optional)
16
+ df = pd.DataFrame({'user_message': messages, 'date': dates})
17
+
18
  users = []
19
  messages = []
20
  for message in df['user_message']: