Spaces:

binoubinks
/

ADSP_finalProjectBack

Sleeping

App Files Files Community

albin committed on Nov 28, 2024

Commit

9b5992c

1 Parent(s): 4cd2ca2

add model and features extraction files

Browse files

Files changed (2) hide show

extraction_features.py +63 -0
logistic_regression_model.pkl +3 -0

extraction_features.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import re
+from urllib.parse import urlparse
+# Function to extract all the features from the given URL
def extract_features(url: str) -> dict:
    """Extract lexical features from a URL string.

    Args:
        url: The URL to analyse. A scheme-less value such as
            ``"www.example.com/login"`` is accepted; its host is still
            recognised, and ``IsHTTPS`` is 0 for it.

    Returns:
        A dict with these keys, in insertion order:

        * ``URL`` - the input string, unchanged.
        * ``URLLength`` - ``len(url)``.
        * ``Domain`` - the lower-cased host name, without credentials
          or port.
        * ``DomainLength`` - ``len(Domain)``.
        * ``TLD`` - the trailing all-letters label of the host, or ``''``
          when the host does not end in one (e.g. an IPv4 address).
        * ``TLDLength`` - ``len(TLD)``.
        * ``CharContinuationRate`` - share of ASCII letters in the first
          label of the host (after a leading ``www.`` is dropped).
        * ``NoOfSubDomain`` - number of labels before the last one, after
          a leading ``www.`` is dropped.
        * ``DegitRatioInURL`` - share of ASCII digits in ``url``.
        * ``SpacialCharRatioInURL`` - share of the characters
          ``!@#$%^&*(),.?":{}|<>`` in ``url``.
        * ``IsHTTPS`` - 1 when the scheme is ``https``, else 0.

        The key spellings (``Degit``, ``Spacial``) are kept verbatim;
        presumably downstream consumers look features up by these exact
        names - verify before renaming.

    Raises:
        ValueError: If ``urlparse`` rejects ``url`` (e.g. a malformed
            bracketed IPv6 host after a scheme).
    """
    # Parse the URL.
    parsed_url = urlparse(url)
    if not parsed_url.scheme and not parsed_url.netloc:
        # Without a scheme, urlparse treats "www.example.com/x" as a bare
        # path and reports no host. Re-parse it as a network-path
        # reference ("//host/path") so the host is still found.
        try:
            parsed_url = urlparse('//' + url)
        except ValueError:
            # Keep the original parse rather than fail on input the plain
            # parse accepted.
            pass

    # ``hostname`` drops "user:password@" and ":port" and lower-cases the
    # host, so those parts no longer distort the domain-based features.
    domain = parsed_url.hostname or ''

    # Strip only a *leading* "www."; str.replace would also delete later
    # occurrences (e.g. "www.mywww.com" -> "mycom").
    domain_no_www = domain[4:] if domain.startswith('www.') else domain
    # First label of the host once the "www." prefix is gone.
    domain_body = domain_no_www.partition('.')[0]

    features = {}
    features['URL'] = url
    features['URLLength'] = len(url)
    features['Domain'] = domain
    features['DomainLength'] = len(domain)

    # TLD: trailing label made only of letters (host is already lower-case).
    tld_match = re.search(r'\.([a-z]+)$', domain_no_www)
    features['TLD'] = tld_match.group(1) if tld_match else ''
    features['TLDLength'] = len(features['TLD'])

    # CharContinuationRate: total length of all letter runs in the domain
    # body divided by the body's length.
    letter_runs = re.findall(r'[a-zA-Z]+', domain_body)
    letter_count = sum(len(run) for run in letter_runs)
    features['CharContinuationRate'] = (
        letter_count / len(domain_body) if domain_body else 0
    )

    # NoOfSubDomain: every label except the last one.
    features['NoOfSubDomain'] = len(domain_no_www.split('.')[:-1])

    # Character-class ratios over the whole URL; 0 for an empty URL to
    # avoid dividing by zero.
    digit_count = len(re.findall(r'[0-9]', url))
    features['DegitRatioInURL'] = digit_count / len(url) if url else 0
    special_count = len(re.findall(r'[!@#$%^&*(),.?":{}|<>]', url))
    features['SpacialCharRatioInURL'] = special_count / len(url) if url else 0

    features['IsHTTPS'] = 1 if parsed_url.scheme == 'https' else 0
    return features
# Demo: extract the features of one sample URL and print them one per line.
# NOTE(review): this runs whenever the module is imported; consider moving it
# under an `if __name__ == "__main__":` guard.
# Alternative sample: "https://www.southbankmosaics.com"
url_example = "https://www.ooty.ind.in"
features = extract_features(url_example)
print("\n".join(f"{key}: {value}" for key, value in features.items()))

logistic_regression_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3af2a4ab355b48bb17a05c8859a3ce0fb7afaa76eb69f7b0ec85671b82c4d21d
+size 1627