File size: 1,978 Bytes
9b5992c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1127d65
9b5992c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02c4993
 
 
9b5992c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d576279
 
 
02c4993
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import re
from urllib.parse import urlparse

# Function to extract all the features from the given URL

def extract_features(url):
    """Extract lexical features from a URL string.

    The URL is parsed with ``urlparse`` and its network location (``netloc``)
    is treated as the domain.

    Args:
        url: The URL to analyse, as a string.

    Returns:
        A dict with the following keys, in this insertion order:

        - ``URLLength``: number of characters in ``url``.
        - ``Domain``: the ``netloc`` component of the parsed URL.
        - ``DomainLength``: number of characters in ``Domain``.
        - ``TLD``: trailing run of lowercase ASCII letters after the last dot
          of the domain (without the dot), or ``''`` if there is none.
        - ``CharContinuationRate``: share of ASCII letters in the first label
          of the domain (after a leading ``www.`` is removed); 0 when that
          label is empty.
        - ``TLDLength``: number of characters in ``TLD``.
        - ``NoOfSubDomain``: number of dot-separated labels in the domain
          (leading ``www.`` removed), excluding the last label.
        - ``DegitRatioInURL``: share of ASCII digits in ``url``.
        - ``SpacialCharRatioInURL``: share of characters of ``url`` that
          belong to the set ``!@#$%^&*(),.?":{}|<>``.
        - ``IsHTTPS``: 1 if the scheme is exactly ``https``, otherwise 0.
    """
    # Parse the URL.
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    # NOTE(review): netloc is used verbatim, so a port or credentials present
    # in the URL end up in the domain-based features -- confirm this is wanted.

    # Strip only a *leading* "www."; str.replace would also delete later
    # occurrences (e.g. "www.awww.com" -> "acom") and corrupt the TLD and
    # subdomain features.
    www_prefix = 'www.'
    if domain.startswith(www_prefix):
        domain_no_www = domain[len(www_prefix):]
    else:
        domain_no_www = domain

    # The domain body is the first label once the "www." prefix is gone.
    # An unanchored search for "www." could match inside another label
    # (e.g. "1www.example.com") and return the wrong label.
    domain_body = domain_no_www.split('.')[0]

    url_length = len(url)

    # Initialise the feature mapping; insertion order is kept stable for
    # callers that rely on it.
    features = {}

    # URLLength
    features['URLLength'] = url_length

    # Domain
    features['Domain'] = domain

    # DomainLength
    features['DomainLength'] = len(domain)

    # TLD
    tld_match = re.search(r'\.[a-z]+$', domain_no_www)
    features['TLD'] = tld_match.group(0)[1:] if tld_match else ''

    # CharContinuationRate
    letter_count = sum(len(seq) for seq in re.findall(r'[a-zA-Z]+', domain_body))
    features['CharContinuationRate'] = letter_count / len(domain_body) if domain_body else 0

    # TLDLength
    features['TLDLength'] = len(features['TLD'])

    # NoOfSubDomain: every label except the last one.
    features['NoOfSubDomain'] = len(domain_no_www.split('.')[:-1])

    # DegitRatioInURL (key spelling kept as-is: it is part of the interface)
    digit_count = len(re.findall(r'[0-9]', url))
    features['DegitRatioInURL'] = digit_count / url_length if url_length else 0

    # SpacialCharRatioInURL (key spelling kept as-is: it is part of the interface)
    special_count = len(re.findall(r'[!@#$%^&*(),.?":{}|<>]', url))
    features['SpacialCharRatioInURL'] = special_count / url_length if url_length else 0

    # IsHTTPS
    features['IsHTTPS'] = 1 if parsed_url.scheme == 'https' else 0

    return features

# url_example = "https://www.southbankmosaics.com"
# url_example = "https://www.ooty.ind.in"
# features = extract_features(url_example)
# print(features)
# for key, value in features.items():
#     print(f"{key}: {value}")