File size: 5,056 Bytes
c69597c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import re
import pandas as pd
from urllib.parse import urlparse

# Matches http/https URLs up to the next whitespace character.
URL_PATTERN = r'https?://\S+'
# Loose phone-number match: optional '+', then 8+ digits possibly
# separated by dashes, parentheses, or whitespace.
PHONE_PATTERN = r'(\+?\d[\d\-\(\)\s]{6,}\d)'


# Known link-shortening service domains; a URL containing any of these is
# treated as shortened (case-insensitive substring match).
SHORTENING_PATTERN = re.compile(
    r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|'
    r'cli\.gs|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|snipurl\.com|'
    r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|'
    r'loopt\.us|doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|'
    r'lnkd\.in|db\.tt|qr\.ae|adf\.ly|bitly\.com|cur\.lv|tinyurl\.com|ity\.im|q\.gs|'
    r'po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
    r'prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
    r'tr\.im|link\.zip\.net',
    re.IGNORECASE
)

# Keywords, brand names, executable/script extensions, and abuse-prone TLDs
# commonly seen in phishing URLs (case-insensitive substring match).
SUSPICIOUS_PATTERN = re.compile(
    r'PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr'
    r'|verify|secure|password|support|alert|warning|confirm|suspend|action-required'
    r'|activity|limited|access-restricted|authentication|recover|reset'
    r'|invoice|payment|billing|purchase|transaction|refund'
    r'|microsoft|google|amazon|apple|netflix|fedex|dhl|ups'
    r'|redirect|cgi-bin|admin|\.exe|\.zip|\.rar|\.js|\.scr|\.bat|\.php'
    r'|\.xyz|\.top|\.icu|\.biz|\.info|\.live|\.link|\.click',
    re.IGNORECASE
)



def extract_urls_from_body(row):
    """Return every URL found in the email body, joined by ' [NEXT] '.

    A missing or non-string body yields an empty string.
    """
    body = row.get('body', '')
    if not isinstance(body, str):
        return ""
    matches = re.findall(URL_PATTERN, body)
    return " [NEXT] ".join(matches) if matches else ""


#       --- 2. URL COUNT ---


def count_urls(cell):
    """Count the URLs in a ' [NEXT] '-joined string from extract_urls_from_body.

    Robustness fix: the original raised AttributeError when the cell was not a
    string (e.g. NaN from pandas); such cells now count as zero URLs.
    """
    if not isinstance(cell, str) or cell == "":
        return 0
    # One URL per separator, plus the final (or only) URL.
    return cell.count(" [NEXT]") + 1

#--- 3. ATTACHMENT EXTRACTION ---


def extract_attachment_names(body):
    """Collect MIME attachment filenames (filename="...") as newline-joined text."""
    if not isinstance(body, str):
        return ""
    names = re.findall(r'filename="([^"]+)"', body)
    return "\n".join(names)


# --- 4. CLEANED TEXT BUILDING ---


def create_combined_text(row):
    """Build one tagged training string from an email's subject and body.

    URLs are replaced by the token "[LINK]" and phone numbers by "[PHONE]",
    then the parts are wrapped as "[SSUB] ... [ESUB] [SBODY] ... [EBODY]".
    """
    subject = "" if pd.isnull(row['subject']) else str(row['subject'])
    body = "" if pd.isnull(row['body']) else str(row['body'])

    # Mask volatile content so the model sees placeholders, not raw values.
    body = re.sub(URL_PATTERN, "[LINK]", body)
    body = re.sub(PHONE_PATTERN, "[PHONE]", body)

    return f"[SSUB] {subject.strip()} [ESUB] [SBODY] {body.strip()} [EBODY]"


#--- 5. IP ADDRESS DETECTION ---


def having_ip_address(url):
    """Return 1 if *url* contains a literal IP address, else 0.

    Detects dotted-decimal IPv4, hex-octet IPv4 (0xNN.0xNN.0xNN.0xNN),
    and fully-expanded 8-group IPv6 literals.
    """
    ipv4 = (r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
            r'([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5]))')
    hex_ipv4 = (r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.'
                r'(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2}))')
    ipv6 = r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}'
    return 1 if re.search('|'.join((ipv4, hex_ipv4, ipv6)), url) else 0

#     --- 6. BASIC URL FEATURES (LENGTH / SUBDOMAIN) ---


def split_urls(sample):
    """Split a '[NEXT]'-delimited URL string into a list of stripped URLs.

    NaN / non-string input yields an empty list.
    """
    if pd.isna(sample) or not isinstance(sample, str):
        return []
    pieces = sample.split('[NEXT]')
    return [piece.strip() for piece in pieces if piece.strip()]

def url_length(url): return len(url)

def subdomain_count(url):
    """Number of subdomain labels in *url*'s hostname.

    "a.b.example.com" -> 2; a bare "example.com" (or anything with at most
    two labels) -> 0. URLs with no hostname, malformed URLs, and non-string
    input all count as 0.

    Fix: the original used a bare `except:`, which also swallows
    KeyboardInterrupt/SystemExit; now only the exceptions urlparse actually
    raises on bad input are caught.
    """
    try:
        hostname = urlparse(url).hostname
    except (AttributeError, TypeError, ValueError):
        # e.g. non-string input or malformed IPv6 bracket literals.
        return 0
    if hostname is None:
        return 0
    labels = hostname.split('.')
    return max(len(labels) - 2, 0)

def extract_basic_url_stats(sample):
    """Max/average URL length and subdomain count for one '[NEXT]'-joined sample."""
    columns = ['url_length_max', 'url_length_avg', 'url_subdom_max', 'url_subdom_avg']
    urls = split_urls(sample)
    if not urls:
        return pd.Series([0, 0, 0, 0], index=columns)

    n = len(urls)
    lengths = [len(url) for url in urls]
    subdoms = [subdomain_count(url) for url in urls]
    stats = [max(lengths), sum(lengths) / n, max(subdoms), sum(subdoms) / n]
    return pd.Series(stats, index=columns)

#        --- 7. SHORT URL COUNT ---


def count_shortened_urls(sample):
    """Number of URLs in *sample* hosted on a known link-shortening domain."""
    total = 0
    for url in split_urls(sample):
        if SHORTENING_PATTERN.search(url):
            total += 1
    return total

#        --- 8. SUSPICIOUS KEYWORD COUNT ---


def suspicious_words_count(sample):
    """Number of URLs in *sample* containing a suspicious keyword/extension/TLD."""
    flagged = [url for url in split_urls(sample) if SUSPICIOUS_PATTERN.search(url)]
    return len(flagged)


#     --- 9. DOT COUNT FEATURES ---


def dot_count(url): return url.count('.')

def extract_dot_features(sample):
    """Max/average dot count over the URLs in one '[NEXT]'-joined sample."""
    columns = ['dot_count_max', 'dot_count_avg']
    urls = split_urls(sample)
    if not urls:
        return pd.Series([0, 0], index=columns)

    counts = [dot_count(url) for url in urls]
    return pd.Series([max(counts), sum(counts) / len(counts)], index=columns)


# 10. GENERIC CHARACTER-COUNT FEATURES ---


def char_count(url, char):
    """Occurrences of *char* in *url*; falsy (empty/None) URLs count as 0."""
    if not url:
        return 0
    return url.count(char)

def extract_char_features(sample, char, name):
    """Max/average count of *char* over the URLs in one sample.

    Output Series is indexed '{name}_max' / '{name}_avg'.
    """
    columns = [f'{name}_max', f'{name}_avg']
    urls = split_urls(sample)
    if not urls:
        return pd.Series([0, 0], index=columns)

    counts = [char_count(url, char) for url in urls]
    return pd.Series([max(counts), sum(counts) / len(counts)], index=columns)