Spaces:
Sleeping
Sleeping
FINAL NA TALAGA TO
Browse files
app.py
CHANGED
|
@@ -126,7 +126,7 @@ def extract_features(url):
|
|
| 126 |
"nb_tilde": url.count('~'),
|
| 127 |
"nb_percent": url.count('%'),
|
| 128 |
"nb_slash": url.count('/'),
|
| 129 |
-
"nb_star":
|
| 130 |
"nb_colon": url.count(':'),
|
| 131 |
"nb_comma": url.count(','),
|
| 132 |
"nb_semicolumn": url.count(';'),
|
|
@@ -135,13 +135,13 @@ def extract_features(url):
|
|
| 135 |
"nb_www": 1 if "www" in url else 0,
|
| 136 |
"nb_com": 1 if ".com" in url else 0,
|
| 137 |
"nb_dslash": url.count('//'),
|
| 138 |
-
"http_in_path": 1 if "http"
|
| 139 |
"https_token": 1 if "https" in url else 0,
|
| 140 |
"ratio_digits_url": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
|
| 141 |
"ratio_digits_host": sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
|
| 142 |
"punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
|
| 143 |
"port": parsed.port if parsed.port else 0,
|
| 144 |
-
"
|
| 145 |
"tld_in_subdomain": 1 if any(tld in hostname for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
|
| 146 |
"abnormal_subdomain": 1 if len(hostname.split('.')) > 3 else 0,
|
| 147 |
"nb_subdomains": len(hostname.split('.')) - 1,
|
|
@@ -151,12 +151,12 @@ def extract_features(url):
|
|
| 151 |
"length_words_raw": len(url.split()),
|
| 152 |
"char_repeat": len(set(url)),
|
| 153 |
"shortest_words_raw": min(len(word) for word in url.split()) if url.split() else 0,
|
| 154 |
-
"longest_words_raw": max(len(word) for
|
| 155 |
"shortest_word_host": min(len(word) for word in hostname.split('.')) if hostname else 0,
|
| 156 |
"longest_word_host": max(len(word) for word in hostname.split('.')) if hostname else 0,
|
| 157 |
"shortest_word_path": min(len(word) for word in path.split('/')) if path else 0,
|
| 158 |
-
"longest_word_path": max(len(word) for word in path
|
| 159 |
-
"avg_words_raw": np.mean([
|
| 160 |
"avg_word_host": np.mean([len(word) for word in hostname.split('.')]) if hostname else 0,
|
| 161 |
"avg_word_path": np.mean([len(word) for word in path.split('/')]) if path else 0,
|
| 162 |
"phish_hints": 1 if any(kw in url.lower() for kw in ['login', 'secure', 'verify', 'account']) else 0,
|
|
|
|
| 126 |
"nb_tilde": url.count('~'),
|
| 127 |
"nb_percent": url.count('%'),
|
| 128 |
"nb_slash": url.count('/'),
|
| 129 |
+
"nb_star": url.count('*'),
|
| 130 |
"nb_colon": url.count(':'),
|
| 131 |
"nb_comma": url.count(','),
|
| 132 |
"nb_semicolumn": url.count(';'),
|
|
|
|
| 135 |
"nb_www": 1 if "www" in url else 0,
|
| 136 |
"nb_com": 1 if ".com" in url else 0,
|
| 137 |
"nb_dslash": url.count('//'),
|
| 138 |
+
"http_in_path": 1 if "http" in path else 0,
|
| 139 |
"https_token": 1 if "https" in url else 0,
|
| 140 |
"ratio_digits_url": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
|
| 141 |
"ratio_digits_host": sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
|
| 142 |
"punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
|
| 143 |
"port": parsed.port if parsed.port else 0,
|
| 144 |
+
"tld_in_path": 1 if any(tld in path for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
|
| 145 |
"tld_in_subdomain": 1 if any(tld in hostname for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
|
| 146 |
"abnormal_subdomain": 1 if len(hostname.split('.')) > 3 else 0,
|
| 147 |
"nb_subdomains": len(hostname.split('.')) - 1,
|
|
|
|
| 151 |
"length_words_raw": len(url.split()),
|
| 152 |
"char_repeat": len(set(url)),
|
| 153 |
"shortest_words_raw": min(len(word) for word in url.split()) if url.split() else 0,
|
| 154 |
+
"longest_words_raw": max(len(word) for word in url.split()) if url.split() else 0,
|
| 155 |
"shortest_word_host": min(len(word) for word in hostname.split('.')) if hostname else 0,
|
| 156 |
"longest_word_host": max(len(word) for word in hostname.split('.')) if hostname else 0,
|
| 157 |
"shortest_word_path": min(len(word) for word in path.split('/')) if path else 0,
|
| 158 |
+
"longest_word_path": max(len(word) for word in path.split('/')) if path else 0,
|
| 159 |
+
"avg_words_raw": np.mean([len(word) for word in url.split()]) if url.split() else 0,
|
| 160 |
"avg_word_host": np.mean([len(word) for word in hostname.split('.')]) if hostname else 0,
|
| 161 |
"avg_word_path": np.mean([len(word) for word in path.split('/')]) if path else 0,
|
| 162 |
"phish_hints": 1 if any(kw in url.lower() for kw in ['login', 'secure', 'verify', 'account']) else 0,
|