File size: 3,286 Bytes
613a736
 
 
 
bbaf876
 
613a736
bbaf876
 
e08d181
08b0576
0d1237a
2e6e05f
613a736
bbaf876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e08d181
 
 
 
 
 
08b0576
 
 
 
 
0d1237a
 
 
 
2e6e05f
 
 
 
 
0d1237a
bbaf876
 
 
 
 
 
 
 
 
 
613a736
e08d181
613a736
d4e8bc1
 
 
613a736
bbaf876
613a736
 
 
e08d181
 
1bd404f
613a736
e08d181
613a736
 
 
 
 
 
 
 
 
bbaf876
 
 
 
 
 
 
 
 
4e83b01
dee2562
 
 
bbaf876
613a736
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import re
import pandas as pd

def preprocess(data, *, dayfirst=False):
    """Split an exported chat transcript into a DataFrame of messages.

    The text is cut at every timestamp prefix; each piece between two
    timestamps becomes one row.  Six timestamp layouts are recognised (see
    ``timestamp_patterns``).  The layout that matches the most entries is
    used for the whole transcript, so one stray look-alike match cannot
    override the layout the transcript is really written in.  On a tie the
    later pattern in the list wins.

    NOTE(review): the layouts and the service-message keywords look like
    WhatsApp exports, but nothing in this module states that -- confirm.

    Args:
        data: Full transcript text.
        dayfirst: Forwarded to ``pandas.to_datetime``.  Pass ``True`` for
            day/month/year exports whose dates are ambiguous (for example
            ``01/06/21``).  The default keeps the previous parsing.

    Returns:
        A DataFrame with columns ``date`` (parsed timestamp), ``sender``
        (text before the first ``:`` of the entry) and ``message`` (text
        after it, missing when the entry has no ``:``).  No whitespace is
        trimmed from either text column.  The first entry of the transcript
        is always dropped, and entries whose sender text contains a
        service-message keyword are removed.  An empty frame with the same
        three columns is returned when no timestamp is found or nothing
        remains after dropping the first entry.
    """
    # U+202F (narrow no-break space) is removed so that e.g. "1:46\u202fpm"
    # becomes "1:46pm", which the am/pm patterns below can match.
    data = data.replace('\u202f', '')

    timestamp_patterns = (
        r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[ap]m -',
        r'\[\d{2}/\d{2}/\d{2}, \d{2}:\d{2}:\d{2}\]',
        r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} -',
        r'\d{1,2}/\d{1,2}/\d{2}, \d{2}:\d{2} -',
        r'\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}[AP]M -',
        r'\d{1,2}/\d{1,2}/\d{2}, \d{1,2}:\d{2} [APM]{2} -',
    )

    no_messages = pd.DataFrame({
        'date': pd.Series(dtype='datetime64[ns]'),
        'sender': pd.Series(dtype='object'),
        'message': pd.Series(dtype='object'),
    })

    chosen_pattern = None
    stamps = []
    for candidate in timestamp_patterns:
        found = re.findall(candidate, data)
        # ">=" lets a later pattern win a tie, as the old sequential ifs did.
        if found and len(found) >= len(stamps):
            chosen_pattern, stamps = candidate, found

    if chosen_pattern is None:
        return no_messages

    # The patterns have no capture groups, so split() yields exactly one
    # piece per timestamp after the leading (pre-first-timestamp) piece.
    bodies = re.split(chosen_pattern, data)[1:]

    # Remove the bracket / dash decoration so only the date-time text is parsed.
    decoration = str.maketrans('', '', '[]-')
    frame = pd.DataFrame({
        'date': [stamp.translate(decoration) for stamp in stamps],
        'messageog': bodies,
    })
    # Parse before dropping the first row so format inference sees the same
    # leading value as before.
    frame['date'] = pd.to_datetime(frame['date'], dayfirst=dayfirst)

    # The first entry is discarded unconditionally (existing behaviour).
    # copy() makes the column assignments below act on an independent frame.
    frame = frame.iloc[1:].copy()
    if frame.empty:
        return no_messages

    # Split on the first ':' only.  reindex() guarantees two columns even
    # when no remaining entry contains a ':' at all.
    parts = frame['messageog'].astype(str).str.split(':', n=1, expand=True)
    parts = parts.reindex(columns=[0, 1])
    frame['sender'] = parts[0]
    frame['message'] = parts[1]

    # NOTE(review): sender text keeps any whitespace that follows the
    # timestamp, so this exact comparison only matches entries with no such
    # whitespace -- confirm the intent of this hard-coded value.
    ignored_sender = '\u202a+91\xa095456\xa017572\u202c'

    # Entries whose "sender" text contains one of these substrings are
    # treated as service notices rather than user messages.
    service_keywords = (
        'message',
        'added',
        'created group',
        'left',
        'removed',
        'admin',
        'changed',
        'encrypted',
    )
    is_service = frame['sender'].str.contains('|'.join(service_keywords))
    keep = (frame['sender'] != ignored_sender) & ~is_service

    return frame.loc[keep, ['date', 'sender', 'message']]