Spaces:
Sleeping
Sleeping
File size: 5,056 Bytes
c69597c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import re
import pandas as pd
from urllib.parse import urlparse
# Any http(s) URL: scheme followed by a run of non-whitespace characters.
URL_PATTERN = r'https?://\S+'
# Loose phone-number shape: optional '+', a digit, then 6+ digits/dashes/parens/spaces,
# ending on a digit (8+ characters total).
PHONE_PATTERN = r'(\+?\d[\d\-\(\)\s]{6,}\d)'
# Known URL-shortener domains; a case-insensitive match anywhere in a URL
# flags it as shortened.  (Duplicate 'tr\.im' alternative removed — it was
# listed twice; 'tinyurl' already subsumes 'tinyurl\.com'.)
SHORTENING_PATTERN = re.compile(
    r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|'
    r'cli\.gs|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|snipurl\.com|'
    r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|'
    r'loopt\.us|doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|'
    r'lnkd\.in|db\.tt|qr\.ae|adf\.ly|bitly\.com|cur\.lv|tinyurl\.com|ity\.im|q\.gs|'
    r'po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
    r'prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
    r'link\.zip\.net',
    re.IGNORECASE
)
# Tokens commonly seen in phishing URLs, matched case-insensitively anywhere
# in the URL: credential/payment keywords, impersonated brand names,
# risky file extensions, and TLDs frequently abused for spam.
SUSPICIOUS_PATTERN = re.compile(
    r'PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr'
    r'|verify|secure|password|support|alert|warning|confirm|suspend|action-required'
    r'|activity|limited|access-restricted|authentication|recover|reset'
    r'|invoice|payment|billing|purchase|transaction|refund'
    r'|microsoft|google|amazon|apple|netflix|fedex|dhl|ups'
    r'|redirect|cgi-bin|admin|\.exe|\.zip|\.rar|\.js|\.scr|\.bat|\.php'
    r'|\.xyz|\.top|\.icu|\.biz|\.info|\.live|\.link|\.click',
    re.IGNORECASE
)
def extract_urls_from_body(row):
    """Return every URL found in row['body'], joined by " [NEXT] ".

    Non-string bodies (NaN, None) and bodies without URLs yield "".
    """
    body = row.get('body', '')
    if not isinstance(body, str):
        return ""
    matches = re.findall(URL_PATTERN, body)
    if not matches:
        return ""
    return " [NEXT] ".join(matches)
# --- 2. URL COUNT ---
def count_urls(cell):
    """Count URLs in a " [NEXT] "-joined string (as built by extract_urls_from_body).

    An empty cell holds no URLs; otherwise there is one more URL than separators.
    """
    if cell == "":
        return 0
    return cell.count(" [NEXT]") + 1
#--- 3. ATTACHMENT EXTRACTION ---
def extract_attachment_names(body):
    """Collect attachment names from MIME-style 'filename="..."' markers.

    Returns the names newline-joined, or "" when *body* is not a string
    or contains no such marker.
    """
    if not isinstance(body, str):
        return ""
    names = re.findall(r'filename="([^"]+)"', body)
    return "\n".join(names)
# --- 4. CLEANED TEXT BUILDING ---
def create_combined_text(row):
    """Build the tagged model-input string from a row's subject and body.

    URLs become "[LINK]" and phone-like digit runs become "[PHONE]" so the
    text model sees stable placeholders instead of raw values.
    """
    subject = "" if pd.isnull(row['subject']) else str(row['subject'])
    body = "" if pd.isnull(row['body']) else str(row['body'])
    masked = re.sub(URL_PATTERN, "[LINK]", body)
    masked = re.sub(PHONE_PATTERN, "[PHONE]", masked)
    return f"[SSUB] {subject.strip()} [ESUB] [SBODY] {masked.strip()} [EBODY]"
#--- 5. IP ADDRESS DETECTION ---
def having_ip_address(url):
    """Return 1 if *url* contains a literal IP address, else 0.

    Detects dotted-decimal IPv4, dotted-hex IPv4 (0x7f.0x00...), and
    full eight-group IPv6 addresses.
    """
    octet = r'([01]?\d\d?|2[0-4]\d|25[0-5])'
    hex_octet = r'(0x[0-9a-fA-F]{1,2})'
    pattern = (
        rf'(({octet}\.){{3}}{octet})|'        # dotted-decimal IPv4
        rf'(({hex_octet}\.){{3}}{hex_octet})|' # dotted-hex IPv4
        r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}'  # full-form IPv6
    )
    return 1 if re.search(pattern, url) else 0
# --- 6. BASIC URL FEATURES (LENGTH / SUBDOMAIN) ---
def split_urls(sample):
    """Split a "[NEXT]"-joined URL cell into a list of trimmed URLs.

    Missing values (NaN/None) and any non-string input yield an empty list.
    """
    if not isinstance(sample, str) or pd.isna(sample):
        return []
    pieces = (piece.strip() for piece in sample.split('[NEXT]'))
    return [piece for piece in pieces if piece]
def url_length(url):
    """Length of the URL string, in characters."""
    return len(url)
def subdomain_count(url):
    """Count subdomain labels in *url*'s hostname.

    A host of ``a.b.example.com`` has 2 subdomain labels (the last two labels
    are treated as domain + TLD). Returns 0 for URLs with no hostname, hosts
    with two or fewer labels, or unparseable input.
    """
    try:
        hostname = urlparse(url).hostname
    # urlparse raises ValueError on malformed netlocs (e.g. bad IPv6 brackets)
    # and AttributeError/TypeError on non-string input; the original bare
    # `except:` also swallowed these, but caught everything else too.
    except (ValueError, AttributeError, TypeError):
        return 0
    if hostname is None:
        return 0
    # labels - 2 == dots - 1; clamp so bare hosts ("localhost") give 0.
    return max(hostname.count('.') - 1, 0)
def extract_basic_url_stats(sample):
    """Per-email URL stats: max/avg length and max/avg subdomain count.

    Returns a pandas Series with a fixed four-column index; all zeros when
    the cell holds no URLs.
    """
    columns = ['url_length_max', 'url_length_avg', 'url_subdom_max', 'url_subdom_avg']
    urls = split_urls(sample)
    if not urls:
        return pd.Series([0, 0, 0, 0], index=columns)
    lengths = [len(url) for url in urls]
    subdomains = [subdomain_count(url) for url in urls]
    n = len(urls)
    values = [max(lengths), sum(lengths) / n, max(subdomains), sum(subdomains) / n]
    return pd.Series(values, index=columns)
# --- 7. SHORT URL COUNT ---
def count_shortened_urls(sample):
    """Number of URLs in the cell that hit a known link-shortener domain."""
    total = 0
    for url in split_urls(sample):
        if SHORTENING_PATTERN.search(url):
            total += 1
    return total
# --- 8. SUSPICIOUS KEYWORD COUNT ---
def suspicious_words_count(sample):
    """Number of URLs in the cell containing a phishing-associated token."""
    flagged = [url for url in split_urls(sample) if SUSPICIOUS_PATTERN.search(url)]
    return len(flagged)
# --- 9. DOT COUNT FEATURES ---
def dot_count(url):
    """Number of '.' characters in the URL."""
    return url.count('.')
def extract_dot_features(sample):
    """Max/avg dot count across the cell's URLs, as a two-column pandas Series."""
    columns = ['dot_count_max', 'dot_count_avg']
    urls = split_urls(sample)
    if not urls:
        return pd.Series([0, 0], index=columns)
    counts = [dot_count(url) for url in urls]
    values = [max(counts), sum(counts) / len(counts)]
    return pd.Series(values, index=columns)
# --- 10. GENERIC CHARACTER-COUNT FEATURES ---
def char_count(url, char):
    """Occurrences of *char* in *url*; 0 for an empty or falsy url."""
    if not url:
        return 0
    return url.count(char)
def extract_char_features(sample, char, name):
    """Max/avg count of *char* across the cell's URLs.

    The returned pandas Series is indexed '{name}_max' / '{name}_avg';
    zeros when the cell holds no URLs.
    """
    columns = [f'{name}_max', f'{name}_avg']
    urls = split_urls(sample)
    if not urls:
        return pd.Series([0, 0], index=columns)
    counts = [char_count(url, char) for url in urls]
    values = [max(counts), sum(counts) / len(counts)]
    return pd.Series(values, index=columns)
|