Spaces:

binoubinks
/

ADSP_finalProjectBack

Sleeping

ADSP_finalProjectBack / extraction_features.py

albin

remove url parameter

1127d65 about 1 year ago

1.98 kB

	import re
	from urllib.parse import urlparse

	# Function to extract all the features from the given URL

	def extract_features(url):
	    """Extract lexical features from a URL string.

	    Args:
	        url: The URL to analyse, as a string.

	    Returns:
	        A dict with the following keys:

	        - ``URLLength``: number of characters in ``url``.
	        - ``Domain``: the network location (``netloc``) reported by
	          ``urlparse``; empty when the URL has no ``//`` authority part.
	        - ``DomainLength``: number of characters in ``Domain``.
	        - ``TLD``: trailing run of lowercase ASCII letters after the last
	          dot of the domain, or ``''`` when there is none.
	        - ``CharContinuationRate``: share of ASCII letters in the first
	          label of the domain (after a leading ``www.`` is removed).
	        - ``TLDLength``: number of characters in ``TLD``.
	        - ``NoOfSubDomain``: number of dot-separated labels before the
	          last one, not counting a leading ``www.``.
	        - ``DegitRatioInURL``: share of digit characters in ``url``.
	        - ``SpacialCharRatioInURL``: share of special characters in ``url``.
	        - ``IsHTTPS``: 1 when the scheme is exactly ``https``, else 0.

	        All ratios are 0 when their denominator is empty.
	    """
	    # Parse the URL
	    parsed_url = urlparse(url)
	    domain = parsed_url.netloc

	    # Strip only a *leading* "www." -- str.replace() would also delete
	    # occurrences inside other labels (e.g. "www.awww.com" -> "acom").
	    www_prefix = 'www.'
	    if domain.startswith(www_prefix):
	        domain_no_www = domain[len(www_prefix):]
	    else:
	        domain_no_www = domain

	    # The domain body is the first label once the "www." prefix is gone.
	    # Taking it from the stripped host avoids matching "www." in the middle
	    # of a label (e.g. "a1www.foo.com" must yield "a1www", not "foo").
	    labels = domain_no_www.split('.')
	    domain_body = labels[0]

	    url_length = len(url)

	    # Initialise the feature mapping.
	    # The raw URL itself is deliberately not stored as a feature.
	    features = {}

	    # URLLength
	    features['URLLength'] = url_length

	    # Domain
	    features['Domain'] = domain

	    # DomainLength
	    features['DomainLength'] = len(domain)

	    # TLD (leading dot removed)
	    tld_match = re.search(r'\.[a-z]+$', domain_no_www)
	    features['TLD'] = tld_match.group(0)[1:] if tld_match else ''

	    # CharContinuationRate
	    char_sequences = re.findall(r'[a-zA-Z]+', domain_body)
	    total_chars = sum(len(seq) for seq in char_sequences)
	    features['CharContinuationRate'] = total_chars / len(domain_body) if domain_body else 0

	    # TLDLength
	    features['TLDLength'] = len(features['TLD'])

	    # NoOfSubDomain: every label except the last one
	    features['NoOfSubDomain'] = len(labels[:-1])

	    # DegitRatioInURL
	    digits = re.findall(r'[0-9]', url)
	    features['DegitRatioInURL'] = len(digits) / url_length if url_length > 0 else 0

	    # SpacialCharRatioInURL
	    special_chars = re.findall(r'[!@#$%^&*(),.?":{}\|<>]', url)
	    features['SpacialCharRatioInURL'] = len(special_chars) / url_length if url_length > 0 else 0

	    # IsHTTPS
	    features['IsHTTPS'] = 1 if parsed_url.scheme == 'https' else 0

	    return features

	# url_example = "https://www.southbankmosaics.com"
	# url_example = "https://www.ooty.ind.in"
	# features = extract_features(url_example)
	# print(features)
	# for key, value in features.items():
	# print(f"{key}: {value}")