genee123 committed on
Commit
915113d
·
verified ·
1 Parent(s): 7a5154e

FINAL NA TALAGA TO

Browse files
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -126,7 +126,7 @@ def extract_features(url):
126
  "nb_tilde": url.count('~'),
127
  "nb_percent": url.count('%'),
128
  "nb_slash": url.count('/'),
129
- "nb_star":
130
  "nb_colon": url.count(':'),
131
  "nb_comma": url.count(','),
132
  "nb_semicolumn": url.count(';'),
@@ -135,13 +135,13 @@ def extract_features(url):
135
  "nb_www": 1 if "www" in url else 0,
136
  "nb_com": 1 if ".com" in url else 0,
137
  "nb_dslash": url.count('//'),
138
- "http_in_path": 1 if "http"
139
  "https_token": 1 if "https" in url else 0,
140
  "ratio_digits_url": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
141
  "ratio_digits_host": sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
142
  "punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
143
  "port": parsed.port if parsed.port else 0,
144
- "t极
145
  "tld_in_subdomain": 1 if any(tld in hostname for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
146
  "abnormal_subdomain": 1 if len(hostname.split('.')) > 3 else 0,
147
  "nb_subdomains": len(hostname.split('.')) - 1,
@@ -151,12 +151,12 @@ def extract_features(url):
151
  "length_words_raw": len(url.split()),
152
  "char_repeat": len(set(url)),
153
  "shortest_words_raw": min(len(word) for word in url.split()) if url.split() else 0,
154
- "longest_words_raw": max(len(word) for
155
  "shortest_word_host": min(len(word) for word in hostname.split('.')) if hostname else 0,
156
  "longest_word_host": max(len(word) for word in hostname.split('.')) if hostname else 0,
157
  "shortest_word_path": min(len(word) for word in path.split('/')) if path else 0,
158
- "longest_word_path": max(len(word) for word in path
159
- "avg_words_raw": np.mean([
160
  "avg_word_host": np.mean([len(word) for word in hostname.split('.')]) if hostname else 0,
161
  "avg_word_path": np.mean([len(word) for word in path.split('/')]) if path else 0,
162
  "phish_hints": 1 if any(kw in url.lower() for kw in ['login', 'secure', 'verify', 'account']) else 0,
 
126
  "nb_tilde": url.count('~'),
127
  "nb_percent": url.count('%'),
128
  "nb_slash": url.count('/'),
129
+ "nb_star": url.count('*'),
130
  "nb_colon": url.count(':'),
131
  "nb_comma": url.count(','),
132
  "nb_semicolumn": url.count(';'),
 
135
  "nb_www": 1 if "www" in url else 0,
136
  "nb_com": 1 if ".com" in url else 0,
137
  "nb_dslash": url.count('//'),
138
+ "http_in_path": 1 if "http" in path else 0,
139
  "https_token": 1 if "https" in url else 0,
140
  "ratio_digits_url": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
141
  "ratio_digits_host": sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
142
  "punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
143
  "port": parsed.port if parsed.port else 0,
144
+ "tld_in_path": 1 if any(tld in path for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
145
  "tld_in_subdomain": 1 if any(tld in hostname for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
146
  "abnormal_subdomain": 1 if len(hostname.split('.')) > 3 else 0,
147
  "nb_subdomains": len(hostname.split('.')) - 1,
 
151
  "length_words_raw": len(url.split()),
152
  "char_repeat": len(set(url)),
153
  "shortest_words_raw": min(len(word) for word in url.split()) if url.split() else 0,
154
+ "longest_words_raw": max(len(word) for word in url.split()) if url.split() else 0,
155
  "shortest_word_host": min(len(word) for word in hostname.split('.')) if hostname else 0,
156
  "longest_word_host": max(len(word) for word in hostname.split('.')) if hostname else 0,
157
  "shortest_word_path": min(len(word) for word in path.split('/')) if path else 0,
158
+ "longest_word_path": max(len(word) for word in path.split('/')) if path else 0,
159
+ "avg_words_raw": np.mean([len(word) for word in url.split()]) if url.split() else 0,
160
  "avg_word_host": np.mean([len(word) for word in hostname.split('.')]) if hostname else 0,
161
  "avg_word_path": np.mean([len(word) for word in path.split('/')]) if path else 0,
162
  "phish_hints": 1 if any(kw in url.lower() for kw in ['login', 'secure', 'verify', 'account']) else 0,