Spaces:

genee123
/

my_xgb_api

Sleeping

App Files Files Community

genee123 committed on Aug 24, 2025

Commit

915113d

verified ·

1 Parent(s): 7a5154e

FINAL NA TALAGA TO

Browse files

Files changed (1) hide show

app.py +6 -6

app.py CHANGED Viewed

@@ -126,7 +126,7 @@ def extract_features(url):
         "nb_tilde": url.count('~'),
         "nb_percent": url.count('%'),
         "nb_slash": url.count('/'),
-        "nb_star":极
         "nb_colon": url.count(':'),
         "nb_comma": url.count(','),
         "nb_semicolumn": url.count(';'),
@@ -135,13 +135,13 @@ def extract_features(url):
         "nb_www": 1 if "www" in url else 0,
         "nb_com": 1 if ".com" in url else 0,
         "nb_dslash": url.count('//'),
-        "http_in_path": 1 if "http"极
         "https_token": 1 if "https" in url else 0,
         "ratio_digits_url": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
         "ratio_digits_host": sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
         "punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
         "port": parsed.port if parsed.port else 0,
-        "t极
         "tld_in_subdomain": 1 if any(tld in hostname for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
         "abnormal_subdomain": 1 if len(hostname.split('.')) > 3 else 0,
         "nb_subdomains": len(hostname.split('.')) - 1,
@@ -151,12 +151,12 @@ def extract_features(url):
         "length_words_raw": len(url.split()),
         "char_repeat": len(set(url)),
         "shortest_words_raw": min(len(word) for word in url.split()) if url.split() else 0,
-        "longest_words_raw": max(len(word) for极
         "shortest_word_host": min(len(word) for word in hostname.split('.')) if hostname else 0,
         "longest_word_host": max(len(word) for word in hostname.split('.')) if hostname else 0,
         "shortest_word_path": min(len(word) for word in path.split('/')) if path else 0,
-        "longest_word_path": max(len(word) for word in path极
-        "avg_words_raw": np.mean([极
         "avg_word_host": np.mean([len(word) for word in hostname.split('.')]) if hostname else 0,
         "avg_word_path": np.mean([len(word) for word in path.split('/')]) if path else 0,
         "phish_hints": 1 if any(kw in url.lower() for kw in ['login', 'secure', 'verify', 'account']) else 0,

         "nb_tilde": url.count('~'),
         "nb_percent": url.count('%'),
         "nb_slash": url.count('/'),
+        "nb_star": url.count('*'),
         "nb_colon": url.count(':'),
         "nb_comma": url.count(','),
         "nb_semicolumn": url.count(';'),
         "nb_www": 1 if "www" in url else 0,
         "nb_com": 1 if ".com" in url else 0,
         "nb_dslash": url.count('//'),
+        "http_in_path": 1 if "http" in path else 0,
         "https_token": 1 if "https" in url else 0,
         "ratio_digits_url": sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0,
         "ratio_digits_host": sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0,
         "punycode": 1 if re.search(r'xn--', url, re.IGNORECASE) else 0,
         "port": parsed.port if parsed.port else 0,
+        "tld_in_path": 1 if any(tld in path for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
         "tld_in_subdomain": 1 if any(tld in hostname for tld in ['.com', '.net', '.org', '.gov', '.edu']) else 0,
         "abnormal_subdomain": 1 if len(hostname.split('.')) > 3 else 0,
         "nb_subdomains": len(hostname.split('.')) - 1,
         "length_words_raw": len(url.split()),
         "char_repeat": len(set(url)),
         "shortest_words_raw": min(len(word) for word in url.split()) if url.split() else 0,
+        "longest_words_raw": max(len(word) for word in url.split()) if url.split() else 0,
         "shortest_word_host": min(len(word) for word in hostname.split('.')) if hostname else 0,
         "longest_word_host": max(len(word) for word in hostname.split('.')) if hostname else 0,
         "shortest_word_path": min(len(word) for word in path.split('/')) if path else 0,
+        "longest_word_path": max(len(word) for word in path.split('/')) if path else 0,
+        "avg_words_raw": np.mean([len(word) for word in url.split()]) if url.split() else 0,
         "avg_word_host": np.mean([len(word) for word in hostname.split('.')]) if hostname else 0,
         "avg_word_path": np.mean([len(word) for word in path.split('/')]) if path else 0,
         "phish_hints": 1 if any(kw in url.lower() for kw in ['login', 'secure', 'verify', 'account']) else 0,