import joblib
import xgboost
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re
from lxml import etree
from urllib.request import urlopen
import gradio as gr

# Table of selectable cities. The derived "city-state" column joins the
# "name" and "state" columns with a hyphen and is what the time-zone
# dropdowns further down offer as choices.
uscities = pd.read_csv("final_cities_list.csv")
uscities["city-state"] = uscities["name"] + "-" + uscities["state"]
# Unsigned number with an optional fractional part, e.g. "38" or "38.5".
_NUMBER = r"[0-9]+(?:\.[0-9]+)?"


def _number_after(label, text):
    """Return the first number that directly follows the literal *label* in *text*.

    Raises:
        ValueError: if *label* is absent or is not followed by a number.
    """
    found = re.search(rf"(?<={re.escape(label)}){_NUMBER}", text)
    if found is None:
        raise ValueError(f"no number found after {label!r}")
    return float(found.group())


def scrape_city_data(town_name, timeout=10):
    """Scrape the model's input features for one city from city-data.com.

    Two pages are downloaded (the city page and its poverty page) and the
    individual figures are pulled out of specific HTML elements by id.

    Args:
        town_name: City key exactly as it appears in the site's URLs
            (the rest of this file builds it as "<name>-<state>").
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A one-row DataFrame with the columns City-State, PercentageMales,
        MedianAge, Latitude, Longitude, PercentageHighSchoolGrads,
        PercentagePHDs, PercentageBelowPovertyLevel, Population,
        PercentageNoReligion, PercentageEvangelicals, gays, lesbians.
        If anything goes wrong (network error, missing element, unparsable
        number) a one-row frame with City-State == "0" and every other
        column 0 is returned instead; no exception escapes.
    """
    try:
        # Without a timeout a stalled connection would block the caller forever.
        page = requests.get(f"https://www.city-data.com/city/{town_name}.html", timeout=timeout).text
        page2 = requests.get(f"https://www.city-data.com/poverty/poverty-{town_name}.html", timeout=timeout).text
        doc = BeautifulSoup(page, "html.parser")
        doc2 = BeautifulSoup(page2, "html.parser")

        # Exactly two "(NN.N%)" figures must be present; they are unpacked as
        # (males, females) and only the first is used as a feature.
        sex_population = str(doc.find(id="population-by-sex"))
        males, _females = [float(x) for x in re.findall(r"(?<=\()[0-9]+\.[0-9]+(?=\%\))", sex_population)]

        age_population = str(doc.find(id="median-age"))
        medianage = float(
            re.search(r"Median resident age:.*\>([0-9]*\.[0-9]*).*median age", age_population).group(1)
        )

        coordinates = str(doc.find(id="coordinates"))
        latitude = _number_after("Latitude:</b> ", coordinates)
        longitude = _number_after("Longitude:</b> ", coordinates)

        education_level = str(doc.find(id="education-info"))
        highschoolgrads = _number_after("High school or higher:</b> ", education_level)
        phds = _number_after("professional degree:</b> ", education_level)

        # First "digits.digits" token found anywhere in the element.
        poverty_level = str(doc2.find(id="rt"))
        below_poverty_level = float(re.findall(r"[0-9]*\.[0-9]*", poverty_level)[0])

        # Population is written with thousands separators, e.g. "1,234,567".
        total_population = str(doc.find(id="city-population"))
        residents = float(re.findall(r"(?<=</b> )(?:[0-9]*\,*)*", total_population)[0].replace(",", ""))

        # Religion table: one (name, adherent count) pair per row that has cells.
        religion_rows = []
        for row in doc.find(id="religion").find_all("tr"):
            cells = row.find_all("td")
            if cells:
                name = cells[0].get_text(strip=True)
                # A "-" in the count column is read as 0 (presumably the
                # site's "no data" marker).
                count = cells[1].get_text(strip=True).replace(",", "").replace("-", "0")
                religion_rows.append([name, int(count)])
        df = pd.DataFrame(religion_rows, columns=["religion", "number"])
        df["percentage"] = (df["number"] / df["number"].sum()) * 100
        # .iloc[0] raises IndexError when the row is missing, which sends the
        # whole city down the "not available" path below.
        atheist = df[df.religion == "None"].iloc[0]["percentage"]
        evangelicals = df[df.religion == "Evangelical Protestant"].iloc[0]["percentage"]

        homosexual_households = str(doc.find(id="households-stats"))
        lesbians = _number_after("Lesbian couples:</b> ", homosexual_households)
        gays = _number_after("Gay men:</b> ", homosexual_households)

        return pd.DataFrame(
            {"City-State": town_name, "PercentageMales": males, "MedianAge": medianage, "Latitude": latitude,
             "Longitude": longitude, "PercentageHighSchoolGrads": highschoolgrads, "PercentagePHDs": phds,
             "PercentageBelowPovertyLevel": below_poverty_level, "Population": residents,
             "PercentageNoReligion": atheist, "PercentageEvangelicals": evangelicals, "gays": gays,
             "lesbians": lesbians}, index=[0])
    except Exception:
        # Deliberate best-effort: any scraping failure becomes the sentinel
        # row. Unlike a bare "except:", Ctrl-C and SystemExit still propagate.
        print("INFORMATION IS NOT AVAILABLE")
        return pd.DataFrame({"City-State": "0", "PercentageMales": 0, "MedianAge": 0, "Latitude": 0, "Longitude": 0,
                             "PercentageHighSchoolGrads": 0, "PercentagePHDs": 0, "PercentageBelowPovertyLevel": 0,
                             "Population": 0, "PercentageNoReligion": 0, "PercentageEvangelicals": 0, "gays": 0,
                             "lesbians": 0}, index=[0])


def predict_city_score(clas_model, reg_model, input_data):
    """Return the score for the first row of *input_data*.

    The classifier is consulted first: if it predicts class 1 the score is
    a flat 100, otherwise the regressor's prediction is returned.

    Args:
        clas_model: Object with a ``predict(input_data)`` method returning
            one label per row.
        reg_model: Object with a ``predict(input_data)`` method returning
            one numeric score per row.
        input_data: Feature rows accepted by both models.

    Returns:
        ``100`` when the first predicted label equals 1, else the first
        element of the regressor's output.
    """
    # predict() yields one label per row; inspect only the first, mirroring
    # the [0] applied to the regressor output. Comparing the whole array
    # would raise "truth value is ambiguous" for multi-row input.
    first_label = np.ravel(clas_model.predict(input_data))[0]

    if first_label == 1:
        return 100
    return reg_model.predict(input_data)[0]


# Scores already known for some cities; Finalresult returns the
# "Final-Score" from this table when the selected "City-State" is listed.
cityfinalscores = pd.read_csv("cities.csv")

# Classifier and regressor handed to predict_city_score.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
clas_model = joblib.load("classifier_model.sav")
reg_model = joblib.load("regressor_model.sav")

# # 1. Ask the user for an input
# city_input=input("Enter the name of the city and state [city]-[state]: ")

def Finalresult(Pacific, Mountain, Central, Eastern, Other):
    """Gradio callback: look up or predict the score of the selected city.

    The five arguments are the current values of the five time-zone
    dropdowns. They are concatenated into a single city key, so the result
    is only meaningful when exactly one dropdown holds a selection.

    Returns:
        ``("the city <key> has a score of: ", score)`` when the city is
        listed in ``cityfinalscores`` or could be scraped and scored, or
        the plain string ``"NO INFORMATION AVAILABLE, PLEASE TRY WITH A
        DIFFERENT CITY"`` otherwise.
        NOTE(review): the success case is a tuple while the failure case
        is a string; kept as-is for compatibility, but a single formatted
        string would probably display better in the output textbox.
    """
    # Treat a None selection like the empty string instead of crashing on
    # str + None (NOTE(review): confirm what gradio passes for a cleared
    # dropdown).
    city_input = "".join(
        choice or "" for choice in (Pacific, Mountain, Central, Eastern, Other)
    )
    if not city_input:
        # Nothing selected: same answer the scraper would give, minus the
        # pointless network round-trips.
        return("NO INFORMATION AVAILABLE, PLEASE TRY WITH A DIFFERENT CITY")

    # 1. Prefer a score that is already on file; no scraping needed then.
    known_score = cityfinalscores.loc[cityfinalscores["City-State"] == city_input, "Final-Score"]
    if not known_score.empty:
        return ("the city " + city_input + " has a score of: ", known_score.item())

    # 2. Otherwise scrape the data for that city and run the models.
    data = scrape_city_data(city_input)
    if data["City-State"].item() == "0":
        # "0" is the scraper's sentinel for "could not collect the data".
        return("NO INFORMATION AVAILABLE, PLEASE TRY WITH A DIFFERENT CITY")

    # Column 0 is the City-State label; the models only take the features.
    predicted_score = predict_city_score(clas_model, reg_model, input_data=data.iloc[:, 1:])
    return("the city "+city_input+" has a score of: ", predicted_score)


def _cities_in(*time_zones):
    """Return the distinct "city-state" labels whose "Time Zone" is one of *time_zones*."""
    in_zone = uscities["Time Zone"].isin(time_zones)
    return uscities.loc[in_zone, "city-state"].drop_duplicates().to_list()


# Dropdown choices, one list per time-zone group.
pst = _cities_in("PST")
mst = _cities_in("MST")
cst = _cities_in("CST")
est = _cities_in("EST")
other = _cities_in("AKST", "HST")

# gr.Textbox is the top-level component API already used for the dropdowns
# below; the legacy gr.outputs namespace is deprecated and absent from
# newer gradio releases.
outputs = gr.Textbox()
Pacific = gr.Dropdown(choices=pst, value="")
Mountain = gr.Dropdown(choices=mst, value="")
Central = gr.Dropdown(choices=cst, value="")
Eastern = gr.Dropdown(choices=est, value="")
Other = gr.Dropdown(choices=other, value="")

# The five dropdown values are passed positionally to Finalresult.
app = gr.Interface(
    fn=Finalresult,
    inputs=[Pacific, Mountain, Central, Eastern, Other],
    outputs=outputs,
    description="From the drop down list select the city and state you're interested in to find out its projected LGBTQIA2+ Equality Index Score. Scores range from 0 to 100, a perfect score.",
)
app.launch()