Spaces:
Sleeping
Sleeping
Commit
·
e08d181
1
Parent(s):
9d0e415
Update preprocess.py
Browse files- preprocess.py +11 -19
preprocess.py
CHANGED
|
@@ -7,6 +7,7 @@ def preprocess(data):
|
|
| 7 |
|
| 8 |
pattern1 = r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[ap]m -'
|
| 9 |
pattern2 = r'\[\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]'
|
|
|
|
| 10 |
|
| 11 |
# data = "30/06/21, 1:46pm - Gauri Mami turned off disappearing messages."
|
| 12 |
|
|
@@ -25,6 +26,12 @@ def preprocess(data):
|
|
| 25 |
messages = re.split(pattern2, data)[1:]
|
| 26 |
dates = re.findall(pattern2, data)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
if not result:
|
| 29 |
result = None
|
| 30 |
|
|
@@ -36,33 +43,21 @@ def preprocess(data):
|
|
| 36 |
|
| 37 |
# dates = re.findall(patt, data)
|
| 38 |
|
| 39 |
-
df = pd.DataFrame({'date': dates, 'message': messages})
|
| 40 |
|
| 41 |
df['date'] = df['date'].astype(str).str.replace('[', '', regex=False)
|
| 42 |
df['date'] = df['date'].astype(str).str.replace(']', '', regex=False)
|
| 43 |
df['date'] = df['date'].astype(str).str.replace('-', '', regex=False)
|
| 44 |
|
| 45 |
-
|
| 46 |
-
# group_name = df.loc[df['message'].str.contains('added you')]['message'].str.split(':').str[0]
|
| 47 |
-
# group_name = df.loc[df['message'].str.contains('added you') | df['message'].str.contains('Messages and calls are end-to-end encrypted')]['message'].str.split(':').str[0]
|
| 48 |
-
|
| 49 |
-
# group_name = group_name.values[0]
|
| 50 |
-
# group_name
|
| 51 |
-
|
| 52 |
-
# df['date'] = pd.to_datetime(df['date'], format = '%d/%m/%y, %H:%M:%S ')
|
| 53 |
df['date'] = pd.to_datetime(df['date'])
|
| 54 |
# df['date'][0]
|
| 55 |
df = df[1:]
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
|
| 60 |
-
|
| 61 |
-
df['message'] = df['message'].astype(str) # Convert column to string type
|
| 62 |
-
df[['sender', 'message']] = df['message'].str.split(':', n=1, expand=True)
|
| 63 |
|
| 64 |
-
|
| 65 |
-
# df['message'] = df['message'].str.encode('utf-8')
|
| 66 |
df = df[['date', 'sender', 'message']]
|
| 67 |
|
| 68 |
df['year'] = df['date'].dt.year
|
|
@@ -84,7 +79,4 @@ def preprocess(data):
|
|
| 84 |
|
| 85 |
df = df[['date', 'sender', 'message']]
|
| 86 |
|
| 87 |
-
# df = df.loc[df['sender'] != '\u202a+91\xa095456\xa017572\u202c']
|
| 88 |
-
# df = df.loc[df['sender'] != 'Haldi Mehendi -Dance prep']
|
| 89 |
-
|
| 90 |
return df
|
|
|
|
| 7 |
|
| 8 |
pattern1 = r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[ap]m -'
|
| 9 |
pattern2 = r'\[\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]'
|
| 10 |
+
pattern3 = r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} -'
|
| 11 |
|
| 12 |
# data = "30/06/21, 1:46pm - Gauri Mami turned off disappearing messages."
|
| 13 |
|
|
|
|
| 26 |
messages = re.split(pattern2, data)[1:]
|
| 27 |
dates = re.findall(pattern2, data)
|
| 28 |
|
| 29 |
+
if re.search(pattern3, data):
|
| 30 |
+
result.append(data)
|
| 31 |
+
messages = re.split(pattern3, data)[1:]
|
| 32 |
+
dates = re.findall(pattern3, data)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
if not result:
|
| 36 |
result = None
|
| 37 |
|
|
|
|
| 43 |
|
| 44 |
# dates = re.findall(patt, data)
|
| 45 |
|
| 46 |
+
df = pd.DataFrame({'date': dates, 'messageog': messages})
|
| 47 |
|
| 48 |
df['date'] = df['date'].astype(str).str.replace('[', '', regex=False)
|
| 49 |
df['date'] = df['date'].astype(str).str.replace(']', '', regex=False)
|
| 50 |
df['date'] = df['date'].astype(str).str.replace('-', '', regex=False)
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
df['date'] = pd.to_datetime(df['date'])
|
| 53 |
# df['date'][0]
|
| 54 |
df = df[1:]
|
| 55 |
|
| 56 |
+
df['messageog'] = df['messageog'].astype(str) # Convert column to string type
|
| 57 |
+
df['messageog'] = df['messageog'].fillna('')
|
| 58 |
|
| 59 |
+
df[['sender', 'message']] = df['messageog'].str.split(':', n=1, expand=True)
|
|
|
|
|
|
|
| 60 |
|
|
|
|
|
|
|
| 61 |
df = df[['date', 'sender', 'message']]
|
| 62 |
|
| 63 |
df['year'] = df['date'].dt.year
|
|
|
|
| 79 |
|
| 80 |
df = df[['date', 'sender', 'message']]
|
| 81 |
|
|
|
|
|
|
|
|
|
|
| 82 |
return df
|