"""Scrape city-data.com demographics and predict a city's equality score."""

import re
from random import randint
from time import sleep
from urllib.request import urlopen

import gradio as gr
import joblib
import numpy as np
import pandas as pd
import requests
import xgboost  # presumably needed so joblib can unpickle an XGBoost model -- confirm
from bs4 import BeautifulSoup
from lxml import etree

# NOTE(review): randint, sleep, urlopen, np and etree are not used by the code
# visible here; they are kept in case other code relies on them.

# Every known city with a combined "<name>-<state>" label.
uscities = pd.read_csv("final_cities_list.csv")
uscities["city-state"] = uscities["name"] + "-" + uscities["state"]


def _first_float(pattern, text):
    """Return the first match of *pattern* in *text* converted to ``float``.

    Raises ``IndexError`` when nothing matches and ``ValueError`` when the
    matched text is not a valid number.
    """
    return float(re.findall(pattern, text)[0])


def _empty_city_frame():
    """Return the one-row placeholder frame used when scraping fails.

    "City-State" is the string "0" and every numeric column is 0.
    """
    return pd.DataFrame(
        {
            "City-State": "0",
            "PercentageMales": 0,
            "MedianAge": 0,
            "Latitude": 0,
            "Longitude": 0,
            "PercentageHighSchoolGrads": 0,
            "PercentagePHDs": 0,
            "PercentageBelowPovertyLevel": 0,
            "Population": 0,
            "PercentageNoReligion": 0,
            "PercentageEvangelicals": 0,
            "gays": 0,
            "lesbians": 0,
        },
        index=[0],
    )


def scrape_city_data(town_name, *, timeout=30):
    """Scrape demographic features for one city from city-data.com.

    Args:
        town_name: City identifier inserted into the city-data.com URLs.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A one-row ``DataFrame`` whose first column is "City-State" followed by
        the twelve numeric feature columns. If a request or any parsing step
        fails, a placeholder row is returned instead, with "City-State" set to
        "0" and all features set to 0.
    """
    try:
        page = requests.get(
            f"https://www.city-data.com/city/{town_name}.html",
            timeout=timeout,
        ).text
        page2 = requests.get(
            f"https://www.city-data.com/poverty/poverty-{town_name}.html",
            timeout=timeout,
        ).text
        doc = BeautifulSoup(page, "html.parser")
        doc2 = BeautifulSoup(page2, "html.parser")

        # Exactly two "(xx.x%)" figures are expected; only the first is used.
        sex_population = str(doc.find(id="population-by-sex"))
        males, _females = [
            float(x)
            for x in re.findall(r"(?<=\()[0-9]+\.[0-9]+(?=\%\))", sex_population)
        ]

        age_population = str(doc.find(id="median-age"))
        medianage = float(
            re.search(
                r"Median resident age:.*\>([0-9]*\.[0-9]*).*median age",
                age_population,
            ).groups()[0]
        )

        # NOTE(review): in the patterns below the "." between the digit groups
        # is unescaped, so it matches any character. Left as-is so the scraped
        # values stay exactly what they were.
        coordinates = str(doc.find(id="coordinates"))
        latitude = _first_float(r"(?<=Latitude: )[0-9]*.[0-9]*", coordinates)
        longitude = _first_float(r"(?<=Longitude: )[0-9]*.[0-9]*", coordinates)

        education_level = str(doc.find(id="education-info"))
        highschoolgrads = _first_float(
            r"(?<=High school or higher:<\/b> )[0-9]*.[0-9]*", education_level
        )
        phds = _first_float(
            r"(?<=professional degree:<\/b> )[0-9]*.[0-9]*", education_level
        )

        poverty_level = str(doc2.find(id="rt"))
        below_poverty_level = _first_float(r"[0-9]*\.[0-9]*", poverty_level)

        # Population is written with thousands separators, e.g. "12,345".
        total_population = str(doc.find(id="city-population"))
        residents = float(
            re.findall(r"(?<= )(?:[0-9]*\,*)*", total_population)[0].replace(",", "")
        )

        # Religion table: one row per group, adherent counts in column two.
        data = []
        for row in doc.find(id="religion").find_all("tr"):
            columns = row.find_all("td")
            if columns:
                religion = columns[0].get_text(strip=True)
                number = (
                    columns[1].get_text(strip=True).replace(",", "").replace("-", "0")
                )
                data.append([religion, int(number)])
        df = pd.DataFrame(data, columns=["religion", "number"])
        df["percentage"] = (df["number"] / df["number"].sum()) * 100
        atheist = df[df.religion == "None"].iloc[0]["percentage"]
        evangelicals = df[df.religion == "Evangelical Protestant"].iloc[0]["percentage"]

        homosexual_households = str(doc.find(id="households-stats"))
        # NOTE(review): this pattern ends in a single [0-9] (no "*"), unlike
        # the gay-men pattern below -- confirm whether that is intentional.
        lesbians = _first_float(
            r"(?<=Lesbian couples:<\/b> )[0-9]*.[0-9]", homosexual_households
        )
        gays = _first_float(r"(?<=Gay men:<\/b> )[0-9]*.[0-9]*", homosexual_households)

        return pd.DataFrame(
            {
                "City-State": town_name,
                "PercentageMales": males,
                "MedianAge": medianage,
                "Latitude": latitude,
                "Longitude": longitude,
                "PercentageHighSchoolGrads": highschoolgrads,
                "PercentagePHDs": phds,
                "PercentageBelowPovertyLevel": below_poverty_level,
                "Population": residents,
                "PercentageNoReligion": atheist,
                "PercentageEvangelicals": evangelicals,
                "gays": gays,
                "lesbians": lesbians,
            },
            index=[0],
        )
    except Exception as err:
        # Best effort: any network or parsing failure yields the placeholder
        # row. KeyboardInterrupt and SystemExit are no longer swallowed.
        print("INFORMATION IS NOT AVAILABLE")
        print(f"scrape_city_data({town_name!r}) failed: {err!r}")
        return _empty_city_frame()


def predict_city_score(clas_model, reg_model, input_data):
    """Predict the score for the first row of *input_data*.

    Args:
        clas_model: Model whose ``predict`` output is compared against 1.
        reg_model: Model whose ``predict`` output is used as the score when
            the classifier does not predict 1.
        input_data: Feature rows passed unchanged to both ``predict`` calls.

    Returns:
        ``100`` when the classifier's first prediction equals 1, otherwise
        the regressor's first prediction.
    """
    class_result = clas_model.predict(input_data)
    # Index explicitly instead of relying on the truth value of an array.
    if class_result[0] == 1:
        return 100
    return reg_model.predict(input_data)[0]


cityfinalscores = pd.read_csv("cities.csv")
# NOTE(review): joblib.load unpickles arbitrary objects; only load model files
# from a trusted source.
clas_model = joblib.load('classifier_model.sav')
reg_model = joblib.load('regressor_model.sav')
def _cities_in_time_zones(*zones):
    """Return the unique "city-state" labels whose "Time Zone" is in *zones*."""
    in_zone = uscities["Time Zone"].isin(zones)
    return uscities.loc[in_zone, "city-state"].drop_duplicates().to_list()


def Finalresult(Pacific, Mountain, Central, Eastern, Other):
    """Return the score message for the city chosen in one of the drop-downs.

    Args:
        Pacific, Mountain, Central, Eastern, Other: Value of each time-zone
            drop-down; an empty string or ``None`` means nothing was chosen
            in that list.

    Returns:
        A ``(message, score)`` tuple when a score is available, taken from
        ``cityfinalscores`` if the city is listed there and predicted from
        freshly scraped data otherwise. A plain message string is returned
        when no single city was selected or the city could not be scraped.
    """
    selections = [
        choice for choice in (Pacific, Mountain, Central, Eastern, Other) if choice
    ]
    if not selections:
        return "NO INFORMATION AVAILABLE, PLEASE TRY WITH A DIFFERENT CITY"
    if len(selections) > 1:
        # Concatenating several city names would only produce an unknown city.
        return "PLEASE SELECT A CITY FROM ONLY ONE DROP DOWN LIST"
    city_input = selections[0]

    # Use the stored score when there is one; this avoids any network access.
    is_known_city = cityfinalscores["City-State"] == city_input
    if is_known_city.any():
        return (
            "the city " + city_input + " has a score of: ",
            cityfinalscores.loc[is_known_city, "Final-Score"].item(),
        )

    # Otherwise scrape the city's data and predict its score.
    data = scrape_city_data(city_input)
    if data["City-State"].item() == "0":
        return "NO INFORMATION AVAILABLE, PLEASE TRY WITH A DIFFERENT CITY"
    # The first column is the city label; the remaining columns are features.
    predicted_score = predict_city_score(
        clas_model, reg_model, input_data=data.iloc[:, 1:]
    )
    return ("the city " + city_input + " has a score of: ", predicted_score)


# Drop-down choices, grouped by time zone.
pst = _cities_in_time_zones("PST")
mst = _cities_in_time_zones("MST")
cst = _cities_in_time_zones("CST")
est = _cities_in_time_zones("EST")
other = _cities_in_time_zones("AKST", "HST")

# Top-level component class, consistent with gr.Dropdown below.
outputs = gr.Textbox()
Pacific = gr.Dropdown(choices=pst, value="")
Mountain = gr.Dropdown(choices=mst, value="")
Central = gr.Dropdown(choices=cst, value="")
Eastern = gr.Dropdown(choices=est, value="")
Other = gr.Dropdown(choices=other, value="")

app = gr.Interface(
    fn=Finalresult,
    inputs=[Pacific, Mountain, Central, Eastern, Other],
    outputs=outputs,
    description="From the drop down list select the city and state you're interested in to find out its projected LGBTQIA2+ Equality Index Score. Scores range from 0 to 100, a perfect score.",
)
app.launch()