Spaces:
Sleeping
Sleeping
File size: 1,978 Bytes
9b5992c 1127d65 9b5992c 02c4993 9b5992c d576279 02c4993 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import re
from urllib.parse import urlparse
# Function to extract all the features from the given URL
def extract_features(url: str) -> dict:
    """Extract lexical features from a URL string.

    The URL is parsed with ``urllib.parse.urlparse``; most features are
    derived from its network location (``netloc``), the rest from the raw
    URL text.

    Args:
        url: The URL to analyse. An empty string is accepted and produces
            zero/empty values for every feature.

    Returns:
        A dict with the following keys, in this insertion order:
        ``URLLength``, ``Domain``, ``DomainLength``, ``TLD``,
        ``CharContinuationRate``, ``TLDLength``, ``NoOfSubDomain``,
        ``DegitRatioInURL``, ``SpacialCharRatioInURL``, ``IsHTTPS``.
        (The misspelled key names are kept as-is because callers look
        them up by these exact strings.)
    """
    # Parse the URL
    parsed_url = urlparse(url)
    domain = parsed_url.netloc

    # Strip only a *leading* "www." label. str.replace() would also remove
    # later occurrences (e.g. "www.mywww.com" -> "mycom") and corrupt the
    # TLD and sub-domain features computed below.
    www_prefix = 'www.'
    if domain.startswith(www_prefix):
        domain_no_www = domain[len(www_prefix):]
    else:
        domain_no_www = domain

    # The domain body is the first label once the leading "www." is gone.
    # Deriving it from domain_no_www keeps it consistent with the prefix
    # rule above (an unanchored "www\." search could match mid-label).
    labels = domain_no_www.split('.')
    domain_body = labels[0]

    # Initialise the feature mapping
    features = {}

    # The raw URL itself is not stored (assignment left disabled):
    # features['URL'] = url

    url_length = len(url)

    # URLLength
    features['URLLength'] = url_length

    # Domain (the netloc exactly as parsed, including any "www.")
    features['Domain'] = domain

    # DomainLength
    features['DomainLength'] = len(domain)

    # TLD: trailing ".<lowercase letters>" of the www-less domain, without
    # the dot; empty when the domain does not end that way.
    tld_match = re.search(r'\.[a-z]+$', domain_no_www)
    features['TLD'] = tld_match.group(0)[1:] if tld_match else ''

    # CharContinuationRate: share of ASCII letters in the domain body
    char_sequences = re.findall(r'[a-zA-Z]+', domain_body)
    total_chars = sum(len(seq) for seq in char_sequences)
    features['CharContinuationRate'] = (
        total_chars / len(domain_body) if len(domain_body) > 0 else 0
    )

    # TLDLength
    features['TLDLength'] = len(features['TLD'])

    # NoOfSubDomain: every dot-separated label except the last one
    features['NoOfSubDomain'] = len(labels[:-1])

    # DegitRatioInURL: fraction of ASCII digits in the whole URL
    digits = re.findall(r'[0-9]', url)
    features['DegitRatioInURL'] = len(digits) / url_length if url_length > 0 else 0

    # SpacialCharRatioInURL: fraction of the listed punctuation characters
    special_chars = re.findall(r'[!@#$%^&*(),.?":{}|<>]', url)
    features['SpacialCharRatioInURL'] = (
        len(special_chars) / url_length if url_length > 0 else 0
    )

    # IsHTTPS
    features['IsHTTPS'] = 1 if parsed_url.scheme == 'https' else 0

    return features
# url_example = "https://www.southbankmosaics.com"
# url_example = "https://www.ooty.ind.in"
# features = extract_features(url_example)
# print(features)
# for key, value in features.items():
# print(f"{key}: {value}") |