File size: 2,820 Bytes
c93d3d2
 
 
 
 
d5a85b5
 
 
c93d3d2
 
 
 
 
 
 
 
 
 
d5a85b5
 
 
 
c93d3d2
 
 
 
 
 
 
 
d5a85b5
 
 
 
 
 
c93d3d2
 
 
 
 
 
 
d5a85b5
 
 
 
 
 
c93d3d2
361ccdf
c93d3d2
 
 
 
 
 
 
eb4e9b5
 
 
 
c93d3d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
import pandas as pd


def checker(data):
    """Report whether *data* contains at least one chat timestamp prefix.

    A timestamp prefix looks like ``12/05/21, 10:30 pm - `` (12-hour clock
    with a lowercase ``am``/``pm`` marker) or ``12/05/2021, 22:30 - ``
    (24-hour clock). The match may occur anywhere in the text.

    Args:
        data: Text to inspect.

    Returns:
        ``True`` if a timestamp prefix is found, ``False`` otherwise.
    """
    # Raw string: ``\d`` and ``\s`` in a normal literal are invalid escape
    # sequences. The am/pm marker is optional so one pattern covers both
    # clock styles; ``[ap]m`` (not ``[a,m,p]+``) keeps commas and stray
    # letter runs from being accepted as a marker.
    timestamp = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{1,2}(?:\s[ap]m)?\s-\s'
    return re.search(timestamp, data) is not None


def preprocess(data):
    """Turn an exported chat transcript into a per-message DataFrame.

    The text is cut at every timestamp prefix (``d/m/y, h:mm am - `` or
    ``d/m/y, H:MM - ``). If the text contains at least one 12-hour prefix,
    the whole text is treated as 12-hour; otherwise the 24-hour pattern is
    used. Dates are parsed day-first, trying a two-digit year before a
    four-digit one.

    Args:
        data: Full transcript text.

    Returns:
        A DataFrame with one row per message and the columns ``date``,
        ``user``, ``message``, ``only_date``, ``year``, ``month``, ``day``,
        ``day_name``, ``hour``, ``minute`` and ``period``. Entries without a
        ``sender: `` prefix get the user ``'group_notification'``.

    Raises:
        ValueError: If the timestamps match neither the two-digit-year nor
            the four-digit-year format.
    """
    stamp_12h = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{1,2}\s[ap]m\s-\s'
    stamp_24h = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{1,2}\s-\s'

    if re.search(stamp_12h, data):
        stamp = stamp_12h
        formats = ('%d/%m/%y, %I:%M %p - ', '%d/%m/%Y, %I:%M %p - ')
    else:
        stamp = stamp_24h
        formats = ('%d/%m/%y, %H:%M - ', '%d/%m/%Y, %H:%M - ')

    # Text before the first timestamp is not a message, hence the [1:].
    raw_messages = re.split(stamp, data)[1:]
    df = pd.DataFrame({'date': re.findall(stamp, data)})
    df['date'] = _parse_timestamps(df['date'], formats)

    parsed = [_split_sender(raw) for raw in raw_messages]
    df['user'] = [sender for sender, _ in parsed]
    df['message'] = [text for _, text in parsed]

    timestamps = df['date'].dt
    df['only_date'] = timestamps.date
    df['year'] = timestamps.year
    df['month'] = timestamps.month_name()
    df['day'] = timestamps.day
    df['day_name'] = timestamps.day_name()
    df['hour'] = timestamps.hour
    df['minute'] = timestamps.minute
    df['period'] = [_period_label(hour) for hour in df['hour']]

    return df


def _parse_timestamps(stamps, formats):
    """Parse *stamps* with the first strptime format in *formats* that fits."""
    for fmt in formats[:-1]:
        try:
            return pd.to_datetime(stamps, format=fmt)
        except ValueError:
            continue
    # Let the last candidate raise if it does not fit either.
    return pd.to_datetime(stamps, format=formats[-1])


def _split_sender(raw_message):
    """Split one raw entry into ``(user, message)``."""
    # maxsplit=1: only the first "sender: " separates user from text. Without
    # it, a body containing another colon followed by whitespace is split
    # again and the message text comes back empty.
    entry = re.split(r'([\w\W]+?):\s', raw_message, maxsplit=1)
    if not entry[1:]:
        return 'group_notification', entry[0]

    sender = entry[1]
    # Senders that look like a phone number are prefixed with 'M:'.
    if re.search(r'\d{1,2} \d{5} \d{5}', sender):
        sender = 'M:' + str(sender)
    return sender, entry[2]


def _period_label(hour):
    """Return the one-hour bucket label for *hour*, e.g. ``'13 - 14'``."""
    if hour == 23:
        return '23 - 00'
    if hour == 0:
        # Existing label format: only the midnight start is zero-padded.
        return '00 - 1'
    return str(hour) + ' - ' + str(hour + 1)