albin committed on
Commit
9b5992c
·
1 Parent(s): 4cd2ca2

add model and features extraction files

Browse files
extraction_features.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from urllib.parse import urlparse
3
+
4
+ # Function to extract all the features from the given URL
5
+
6
def extract_features(url: str) -> dict:
    """Extract lexical features from a URL string.

    The URL is parsed with ``urllib.parse.urlparse``; no network access is
    performed. Host-derived features (TLD, subdomain count, character
    continuation rate) are computed from the parsed hostname, i.e. with any
    ``user:password@`` prefix and ``:port`` suffix removed and with the host
    lower-cased, so those decorations do not distort the result.

    Args:
        url: The URL to analyse. A URL without a ``scheme://`` prefix has an
            empty network location, so its host-derived features are empty/0.

    Returns:
        A dict with the following keys, in this order:
        ``URL``, ``URLLength``, ``Domain`` (the raw network location),
        ``DomainLength``, ``TLD``, ``TLDLength``, ``CharContinuationRate``,
        ``NoOfSubDomain``, ``DegitRatioInURL``, ``SpacialCharRatioInURL``
        and ``IsHTTPS`` (1 or 0).
    """
    # NOTE(review): the misspelled keys ('DegitRatioInURL',
    # 'SpacialCharRatioInURL') are kept verbatim; presumably they match the
    # column names the model was trained on -- confirm before renaming.
    parsed_url = urlparse(url)
    domain = parsed_url.netloc

    # ``hostname`` is the netloc without userinfo/port, lower-cased; it is
    # None when the URL has no network location.
    host = parsed_url.hostname or ''

    # Strip only a *leading* "www." -- str.replace() would also remove
    # occurrences inside the name (e.g. "www.awww.com" -> "acom").
    host_no_www = host[4:] if host.startswith('www.') else host
    labels = host_no_www.split('.')

    # First label of the host once the "www." prefix is gone.
    domain_body = labels[0]

    features = {}

    # Raw URL and its length.
    features['URL'] = url
    features['URLLength'] = len(url)

    # Network location exactly as it appears in the URL.
    features['Domain'] = domain
    features['DomainLength'] = len(domain)

    # TLD: trailing alphabetic label of the host ('' for e.g. IP addresses).
    tld_match = re.search(r'\.[a-z]+$', host_no_www)
    features['TLD'] = tld_match.group(0)[1:] if tld_match else ''
    features['TLDLength'] = len(features['TLD'])

    # CharContinuationRate: share of the domain body made of letters.
    char_sequences = re.findall(r'[a-zA-Z]+', domain_body)
    total_chars = sum(len(seq) for seq in char_sequences)
    features['CharContinuationRate'] = (
        total_chars / len(domain_body) if domain_body else 0
    )

    # NoOfSubDomain: every host label except the last one (the TLD).
    features['NoOfSubDomain'] = len(labels[:-1])

    # Ratios over the whole URL; guarded against an empty URL.
    digits = re.findall(r'[0-9]', url)
    features['DegitRatioInURL'] = len(digits) / len(url) if url else 0

    special_chars = re.findall(r'[!@#$%^&*(),.?":{}|<>]', url)
    features['SpacialCharRatioInURL'] = (
        len(special_chars) / len(url) if url else 0
    )

    # IsHTTPS: urlparse lower-cases the scheme, so a plain comparison works.
    features['IsHTTPS'] = 1 if parsed_url.scheme == 'https' else 0

    return features
57
+
58
# Demo run: extract the features of a sample URL and print one
# "name: value" line per feature.
# Alternative sample URL: "https://www.southbankmosaics.com"
url_example = "https://www.ooty.ind.in"
features = extract_features(url_example)

for key in features:
    value = features[key]
    print(f"{key}: {value}")
logistic_regression_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3af2a4ab355b48bb17a05c8859a3ce0fb7afaa76eb69f7b0ec85671b82c4d21d
3
+ size 1627