import re
from urllib.parse import urlparse


def extract_features(url):
    """Extract lexical features from a URL string.

    ``Domain`` and ``DomainLength`` describe the raw network location
    (``netloc``) exactly as it appears in the URL.  The remaining host-based
    features (``TLD``, ``TLDLength``, ``CharContinuationRate``,
    ``NoOfSubDomain``) are computed from the normalized host name: lower-cased,
    without userinfo or port, and with a single leading ``www.`` removed.

    Note: a URL without a ``//`` authority part (e.g. ``"www.example.com"``)
    has an empty ``netloc``, so every host-based feature is empty/zero.

    Args:
        url: The URL to analyse.

    Returns:
        A dict with the keys ``URL``, ``URLLength``, ``Domain``,
        ``DomainLength``, ``TLD``, ``CharContinuationRate``, ``TLDLength``,
        ``NoOfSubDomain``, ``DegitRatioInURL``, ``SpacialCharRatioInURL`` and
        ``IsHTTPS``.
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc

    # ``hostname`` is the netloc lower-cased and stripped of userinfo/port, so
    # a port or upper-case letters no longer break the TLD / "www." detection.
    host = parsed_url.hostname or ''

    # Strip only ONE leading "www." label; str.replace would also delete any
    # later "www." occurrence inside the host name.
    host_no_www = host[len('www.'):] if host.startswith('www.') else host

    labels = host_no_www.split('.')
    # First label after the optional "www." prefix.
    domain_body = labels[0]

    features = {}

    features['URL'] = url
    features['URLLength'] = len(url)

    features['Domain'] = domain
    features['DomainLength'] = len(domain)

    # TLD: the last label, only when it consists solely of letters
    # (so IP addresses and single-label hosts yield '').
    tld_match = re.search(r'\.([a-z]+)$', host_no_www)
    features['TLD'] = tld_match.group(1) if tld_match else ''

    # CharContinuationRate: share of ASCII letters in the domain body.
    letter_runs = re.findall(r'[a-zA-Z]+', domain_body)
    letter_count = sum(len(run) for run in letter_runs)
    features['CharContinuationRate'] = (
        letter_count / len(domain_body) if domain_body else 0
    )

    features['TLDLength'] = len(features['TLD'])

    # NoOfSubDomain: number of labels in front of the last one.
    features['NoOfSubDomain'] = len(labels[:-1])

    # DegitRatioInURL: share of decimal digits in the whole URL
    # (key spelling kept as-is; it is part of the output schema).
    digits = re.findall(r'[0-9]', url)
    features['DegitRatioInURL'] = len(digits) / len(url) if url else 0

    # SpacialCharRatioInURL: share of the listed punctuation characters
    # in the whole URL (note that '/' is not in the set).
    special_chars = re.findall(r'[!@#$%^&*(),.?":{}|<>]', url)
    features['SpacialCharRatioInURL'] = (
        len(special_chars) / len(url) if url else 0
    )

    features['IsHTTPS'] = 1 if parsed_url.scheme == 'https' else 0

    return features


# Example usage:
# url_example = "https://www.southbankmosaics.com"
# url_example = "https://www.ooty.ind.in"
# features = extract_features(url_example)
# print(features)
# for key, value in features.items():
#     print(f"{key}: {value}")