Spaces:
Sleeping
Sleeping
| import re | |
| from urllib.parse import urlparse | |
| # Function to extract all the features from the given URL | |
def extract_features(url):
    """Extract lexical features from a URL string.

    Args:
        url: The URL to analyse. A URL given without a scheme
            (e.g. ``"www.example.com/path"``) is still split into a
            domain and a path.

    Returns:
        A dict with these keys, in insertion order:

        * ``URLLength`` - number of characters in ``url``.
        * ``Domain`` - network location (``netloc``) of the URL.
        * ``DomainLength`` - number of characters in ``Domain``.
        * ``TLD`` - trailing run of lowercase ASCII letters after the last
          dot of the domain, or ``''`` when there is none.
        * ``CharContinuationRate`` - share of ASCII letters in the first
          label of the domain (after a leading ``www.``); 0 if it is empty.
        * ``TLDLength`` - number of characters in ``TLD``.
        * ``NoOfSubDomain`` - number of dot-separated labels before the
          last one, not counting a leading ``www.``.
        * ``DegitRatioInURL`` - share of ASCII digits in ``url``.
        * ``SpacialCharRatioInURL`` - share of characters of ``url`` that
          belong to the set ``!@#$%^&*(),.?":{}|<>``.
        * ``IsHTTPS`` - 1 when the scheme is ``https``, else 0.

        The key spellings ("Degit", "Spacial") are kept exactly as they
        were; presumably downstream consumers look features up by these
        names - verify before renaming.
    """
    # Parse the URL.
    parsed_url = urlparse(url)
    if not parsed_url.scheme and not parsed_url.netloc:
        # urlparse() only recognises a network location introduced by "//".
        # Retry with that prefix so scheme-less input still yields a domain.
        try:
            parsed_url = urlparse('//' + url)
        except ValueError:
            # The retry can reject malformed authorities (e.g. an unclosed
            # "["); keep the first parse rather than failing on such input.
            pass

    domain = parsed_url.netloc
    # Strip only a *leading* "www."; str.replace() would also delete later
    # occurrences (e.g. "www.awww.com" -> "acom").
    domain_no_www = domain[4:] if domain.startswith('www.') else domain
    # First label of the host once the "www." prefix is gone.
    domain_body = domain_no_www.split('.')[0]

    # Initialise the feature mapping.
    features = {}

    # URLLength
    features['URLLength'] = len(url)

    # Domain
    features['Domain'] = domain

    # DomainLength
    features['DomainLength'] = len(domain)

    # TLD
    tld_match = re.search(r'\.[a-z]+$', domain_no_www)
    features['TLD'] = tld_match.group(0)[1:] if tld_match else ''

    # CharContinuationRate
    letter_runs = re.findall(r'[a-zA-Z]+', domain_body)
    letter_count = sum(len(run) for run in letter_runs)
    features['CharContinuationRate'] = (
        letter_count / len(domain_body) if domain_body else 0
    )

    # TLDLength
    features['TLDLength'] = len(features['TLD'])

    # NoOfSubDomain
    features['NoOfSubDomain'] = len(domain_no_www.split('.')[:-1])

    # DegitRatioInURL
    digits = re.findall(r'[0-9]', url)
    features['DegitRatioInURL'] = len(digits) / len(url) if url else 0

    # SpacialCharRatioInURL
    special_chars = re.findall(r'[!@#$%^&*(),.?":{}|<>]', url)
    features['SpacialCharRatioInURL'] = (
        len(special_chars) / len(url) if url else 0
    )

    # IsHTTPS
    features['IsHTTPS'] = 1 if parsed_url.scheme == 'https' else 0

    return features
| # url_example = "https://www.southbankmosaics.com" | |
| # url_example = "https://www.ooty.ind.in" | |
| # features = extract_features(url_example) | |
| # print(features) | |
| # for key, value in features.items(): | |
| # print(f"{key}: {value}") |