sabirbagwan committed on
Commit
e08d181
·
1 Parent(s): 9d0e415

Update preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +11 -19
preprocess.py CHANGED
@@ -7,6 +7,7 @@ def preprocess(data):
7
 
8
  pattern1 = r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[ap]m -'
9
  pattern2 = r'\[\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]'
 
10
 
11
  # data = "30/06/21, 1:46pm - Gauri Mami turned off disappearing messages."
12
 
@@ -25,6 +26,12 @@ def preprocess(data):
25
  messages = re.split(pattern2, data)[1:]
26
  dates = re.findall(pattern2, data)
27
 
 
 
 
 
 
 
28
  if not result:
29
  result = None
30
 
@@ -36,33 +43,21 @@ def preprocess(data):
36
 
37
  # dates = re.findall(patt, data)
38
 
39
- df = pd.DataFrame({'date': dates, 'message': messages})
40
 
41
  df['date'] = df['date'].astype(str).str.replace('[', '', regex=False)
42
  df['date'] = df['date'].astype(str).str.replace(']', '', regex=False)
43
  df['date'] = df['date'].astype(str).str.replace('-', '', regex=False)
44
 
45
-
46
- # group_name = df.loc[df['message'].str.contains('added you')]['message'].str.split(':').str[0]
47
- # group_name = df.loc[df['message'].str.contains('added you') | df['message'].str.contains('Messages and calls are end-to-end encrypted')]['message'].str.split(':').str[0]
48
-
49
- # group_name = group_name.values[0]
50
- # group_name
51
-
52
- # df['date'] = pd.to_datetime(df['date'], format = '%d/%m/%y, %H:%M:%S ')
53
  df['date'] = pd.to_datetime(df['date'])
54
  # df['date'][0]
55
  df = df[1:]
56
 
57
- # df['sender'] = df.message.str.split(':').str[0]
58
- # df['message'] = df.message.str.split(':').str[1]
59
 
60
- # df[['sender', 'message']] = df['message'].str.split(':', n=1, expand=True)
61
- df['message'] = df['message'].astype(str) # Convert column to string type
62
- df[['sender', 'message']] = df['message'].str.split(':', n=1, expand=True)
63
 
64
-
65
- # df['message'] = df['message'].str.encode('utf-8')
66
  df = df[['date', 'sender', 'message']]
67
 
68
  df['year'] = df['date'].dt.year
@@ -84,7 +79,4 @@ def preprocess(data):
84
 
85
  df = df[['date', 'sender', 'message']]
86
 
87
- # df = df.loc[df['sender'] != '\u202a+91\xa095456\xa017572\u202c']
88
- # df = df.loc[df['sender'] != 'Haldi Mehendi -Dance prep']
89
-
90
  return df
 
7
 
8
  pattern1 = r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[ap]m -'
9
  pattern2 = r'\[\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]'
10
+ pattern3 = r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} -'
11
 
12
  # data = "30/06/21, 1:46pm - Gauri Mami turned off disappearing messages."
13
 
 
26
  messages = re.split(pattern2, data)[1:]
27
  dates = re.findall(pattern2, data)
28
 
29
+ if re.search(pattern3, data):
30
+ result.append(data)
31
+ messages = re.split(pattern3, data)[1:]
32
+ dates = re.findall(pattern3, data)
33
+
34
+
35
  if not result:
36
  result = None
37
 
 
43
 
44
  # dates = re.findall(patt, data)
45
 
46
+ df = pd.DataFrame({'date': dates, 'messageog': messages})
47
 
48
  df['date'] = df['date'].astype(str).str.replace('[', '', regex=False)
49
  df['date'] = df['date'].astype(str).str.replace(']', '', regex=False)
50
  df['date'] = df['date'].astype(str).str.replace('-', '', regex=False)
51
 
 
 
 
 
 
 
 
 
52
  df['date'] = pd.to_datetime(df['date'])
53
  # df['date'][0]
54
  df = df[1:]
55
 
56
+ df['messageog'] = df['messageog'].astype(str) # Convert column to string type
57
+ df['messageog'] = df['messageog'].fillna('')
58
 
59
+ df[['sender', 'message']] = df['messageog'].str.split(':', n=1, expand=True)
 
 
60
 
 
 
61
  df = df[['date', 'sender', 'message']]
62
 
63
  df['year'] = df['date'].dt.year
 
79
 
80
  df = df[['date', 'sender', 'message']]
81
 
 
 
 
82
  return df