Spaces:
Sleeping
Sleeping
| from urllib.parse import urlparse, urlencode | |
| import ipaddress | |
| import re | |
| from bs4 import BeautifulSoup | |
| import whois | |
| import urllib | |
| import urllib.request | |
| from datetime import datetime | |
| import requests | |
| import pickle | |
| import gradio as gr | |
# Trained XGBoost phishing classifier, deserialized at import time.
# NOTE(review): pickle.load executes arbitrary code from the file — this
# .dat must ship with the app and be trusted; never load a user upload here.
loaded_model = pickle.load(open("XGBoostClassifier1.pickle.dat", "rb"))
# Alternation regex of known URL-shortening services, used by tinyURL().
# NOTE(review): several entries (bit.ly, goo.gl, ow.ly, t.co, is.gd, x.co,
# tr.im, tinyurl) appear more than once — harmless for matching, just redundant.
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"
def getDomain(url):
    """Return the hostname of *url* with a leading "www." label stripped.

    Bug fixed: the original used ``re.match(r"^www.", ...)`` — the
    unescaped ``.`` matched any character (so "www2.x.com" matched) —
    and then ``str.replace("www.", "")``, which removed EVERY "www."
    occurrence (e.g. the inner one in "sub.www.example.com"), not just
    the prefix. An anchored, escaped substitution strips only the
    leading label.
    """
    domain = urlparse(url).netloc
    return re.sub(r"^www\.", "", domain)
def havingIP(url):
    """Feature: the string is a bare IP address (phishing signal).

    Returns 1 when *url* parses as an IPv4/IPv6 address, else 0.
    NOTE(review): this tests the whole string, not the hostname part,
    so "http://1.2.3.4/" still yields 0 — kept as-is to match the
    feature the model was trained on.
    """
    try:
        ipaddress.ip_address(url)
        return 1
    except ValueError:
        # Narrowed from a bare `except:` — ip_address raises ValueError
        # for anything that is not a valid address.
        return 0
def haveAtSign(url):
    """Feature: '@' present anywhere in the URL (phishing signal).

    Returns 1 when the URL contains an at-sign, else 0.
    """
    return 1 if "@" in url else 0
def getLength(url):
    """Feature: long URL (54 characters or more is a phishing signal).

    Returns 0 for URLs shorter than 54 characters, 1 otherwise.
    """
    return int(len(url) >= 54)
def getDepth(url):
    """Feature: number of non-empty path segments in the URL.

    Splits the parsed path on '/' and counts the segments that are not
    empty strings.
    """
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
def redirection(url):
    """Feature: '//' appears again after the scheme (redirection trick).

    The rightmost '//' of a normal "http://" URL sits at index 5 and of
    "https://" at index 6; anything past index 7 means an extra '//'
    later in the URL. (The original nested `pos > 6` / `pos > 7` checks
    reduce to the single `pos > 7` test.)
    """
    last_double_slash = url.rfind('//')
    return 1 if last_double_slash > 7 else 0
def httpDomain(url):
    """Feature: the literal text 'https' appears inside the DOMAIN part.

    Phishers embed "https" in hostnames (e.g. "https-login.evil.com")
    to look secure. Note this deliberately inspects the netloc, not the
    scheme, so a normal "https://example.com" returns 0.
    """
    host = urlparse(url).netloc
    return 1 if 'https' in host else 0
def tinyURL(url):
    """Feature: URL comes from a known link-shortening service.

    Searches the URL against the module-level `shortening_services`
    alternation pattern. Returns 1 on a match (phishing signal), else 0.
    """
    return 1 if re.search(shortening_services, url) else 0
def prefixSuffix(url):
    """Feature: hyphen in the domain name.

    Hyphenated hostnames (e.g. "paypal-login.com") are a phishing
    signal: returns 1 when the netloc contains '-', else 0 (legitimate).
    """
    host = urlparse(url).netloc
    return 1 if "-" in host else 0
def web_traffic(url):
    """Feature: web-traffic rank — permanently disabled.

    This feature originally queried the Alexa ranking API, which has
    been retired, so the dead commented-out lookup code was removed and
    the feature is pinned to 0 for every URL. The parameter is kept
    (and ignored) for interface compatibility with featureExtraction.
    NOTE(review): retraining the model without this constant column
    would be cleaner.
    """
    return 0
def domainAge(domain_name):
    """Feature: registered lifetime of the domain from its WHOIS record.

    domain_name: a WHOIS record object exposing `creation_date` and
    `expiration_date`. python-whois is inconsistent: each field may be a
    datetime, a "%Y-%m-%d" string, a list of dates, or None depending on
    the registrar.

    Returns 1 (phishing signal) when the dates are missing, unparseable,
    or ambiguous, or when the registered lifetime is under ~6 months;
    returns 0 otherwise.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    # Some registrars return dates as plain strings; try to parse both.
    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (TypeError, ValueError):
            # Narrowed from a bare `except:` — strptime raises TypeError
            # for non-strings and ValueError for bad formats.
            return 1
    if (expiration_date is None) or (creation_date is None):
        return 1
    if (type(expiration_date) is list) or (type(creation_date) is list):
        # Multiple dates in the record — ambiguous, treat as suspicious.
        return 1
    ageofdomain = abs((expiration_date - creation_date).days)
    # Under 6 months (~30-day months) of lifetime is a phishing indicator.
    return 1 if (ageofdomain / 30) < 6 else 0
def domainEnd(domain_name):
    """Feature: time remaining until the domain registration expires.

    domain_name: a WHOIS record object exposing `expiration_date`,
    which may be a datetime, a "%Y-%m-%d" string, a list, or None.

    Returns 1 when the date is missing/unparseable/ambiguous OR when
    more than ~6 months remain, and 0 when under ~6 months remain.
    NOTE(review): this polarity is inverted relative to domainAge (long
    remaining registration scores 1) — preserved as-is because the
    model was trained on it.
    """
    expiration_date = domain_name.expiration_date
    # Some registrars return the date as a plain string; try to parse.
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (TypeError, ValueError):
            # Narrowed from a bare `except:` — parsing failures only.
            return 1
    if expiration_date is None:
        return 1
    if type(expiration_date) is list:
        # Multiple dates in the record — ambiguous, treat as suspicious.
        return 1
    today = datetime.now()
    remaining_days = abs((expiration_date - today).days)
    return 0 if (remaining_days / 30) < 6 else 1
def iframe(response):
    """Feature: page embeds an iframe / frameBorder (phishing trick).

    response: a requests.Response, or "" when the page fetch failed.
    Returns 0 when an iframe marker is found in the page text, 1
    otherwise (and 1 when the page could not be fetched).

    Bug fixed: the original pattern r"[<iframe>|<frameBorder>]" was a
    character CLASS — it matched any single one of the characters
    `<iframe>|Bd`, so it fired on virtually every non-empty page. The
    pattern now matches the actual tag/attribute names.
    NOTE(review): if the deployed model was trained with the buggy
    feature values, retrain it (or revert) so features stay consistent.
    """
    if response == "":
        return 1
    if re.findall(r"<iframe|frameBorder", response.text):
        return 0
    return 1
def mouseOver(response):
    """Feature: script hooks onmouseover (status-bar spoofing trick).

    response: a requests.Response, or "" when the page fetch failed.
    Returns 1 when a <script>...onmouseover...</script> block is found
    (or the page could not be fetched), else 0.
    """
    if response == "":
        return 1
    found = re.findall("<script>.+onmouseover.+</script>", response.text)
    return 1 if found else 0
def rightClick(response):
    """Feature: page disables right-click (event.button == 2 handler).

    response: a requests.Response, or "" when the page fetch failed.
    Returns 0 when a right-click handler is found, 1 otherwise (and 1
    when the page could not be fetched).
    """
    if response == "":
        return 1
    found = re.findall(r"event.button ?== ?2", response.text)
    return 0 if found else 1
def forwarding(response):
    """Feature: request went through many redirects.

    response: a requests.Response, or "" when the page fetch failed.
    Returns 1 when the redirect history has more than two hops (or the
    page could not be fetched), else 0.
    """
    if response == "":
        return 1
    return 1 if len(response.history) > 2 else 0
def featureExtraction(url):
    """Build the 16-element feature vector for `url`.

    Order matters — it must match the columns the XGBoost model was
    trained on: 8 URL-string features, 4 domain/WHOIS features, then
    4 HTML/JavaScript features. Network failures (WHOIS or HTTP) are
    absorbed and encoded as the "suspicious" value rather than raised.
    """
    features = []
    # URL-string features (8). The domain itself is not a feature:
    # features.append(getDomain(url))
    features.append(havingIP(url))
    features.append(haveAtSign(url))
    features.append(getLength(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(tinyURL(url))
    features.append(prefixSuffix(url))
    #Domain based features (4)
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except:
        # WHOIS lookup failed — record it as the dns feature; the bare
        # except is deliberate best-effort here (network/parse errors).
        dns = 1
    features.append(dns)
    features.append(web_traffic(url))
    # When WHOIS failed, domain_name is unbound — substitute 1 directly.
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))
    # HTML & Javascript based features (4)
    try:
        response = requests.get(url)
    except:
        # Fetch failed — "" is the sentinel each HTML feature checks for.
        response = ""
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))
    return features
def index(url):
    """Gradio handler: classify `url` and return "Safe" or "Unsafe".

    Extracts the 16-element feature vector, feeds it to the pickled
    XGBoost model, and maps predicted class 0 -> "Safe", anything
    else -> "Unsafe".
    """
    features = featureExtraction(url)
    prediction = loaded_model.predict([features])
    print(features)  # debug: feature vector sent to the model
    print(prediction)  # debug: raw model output
    if(prediction[0] == 0):
        return "Safe"
    else:
        return "Unsafe"
# --- Gradio UI wiring (runs at import time and blocks on launch) ---
# Single textbox for the URL to classify.
inputs_image_url = [
    gr.Textbox(type="text", label="URL"),
]
# Single textbox showing the "Safe" / "Unsafe" verdict.
outputs_result_dict = [
    gr.Textbox(type="text", label="Result Dictionary"),
]
interface_image_url = gr.Interface(
    fn=index,
    inputs=inputs_image_url,
    outputs=outputs_result_dict,
    title="URL Detection",
    cache_examples=False,
)
# One-tab app; .queue() serializes requests, .launch() starts the server.
gr.TabbedInterface(
    [interface_image_url],
    tab_names=['URL inference']
).queue().launch()
# 0 -> Real (legitimate)
# 1 -> Phishing