Spaces:

Macau-Weather-Forecast
/

DecisionTree-RandomForest

Runtime error

File size: 8,532 Bytes

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot as plt
import seaborn as sns
import gradio as gr
import numpy as np
import joblib

# Load the weather export; cells holding "--" are parsed as NaN.
missing_values = ["--"]
data = pd.read_csv('./macau_weather.csv', na_values=missing_values)
# The 'num' column is discarded before any further processing.
data = data.drop(columns=['num'])

# Row counts on either side of the NaN filter feed the availability pie chart.
before_rows = len(data)
data = data.dropna()
after_rows = len(data)

# Percentage of rows kept (adp) and percentage of rows dropped (edp).
adp = after_rows/before_rows*100
edp = (before_rows - after_rows)/before_rows *100

def make_ad_plot():
    """Draw a pie chart of the kept vs. dropped row percentages.

    Reads the module-level ``adp`` and ``edp`` values.

    Returns:
        The matplotlib figure holding the pie chart.
    """
    fig = plt.figure()
    plt.title("Percentage of available data")
    palette = sns.color_palette("Paired")
    # Both wedges are offset from the centre by the same amount.
    plt.pie(
        [adp, edp],
        colors=palette,
        autopct='%0.0f%%',
        explode=[0.2, 0.2],
    )
    return fig

def make_ra_table(data):
    """Binarise the rain accumulation column and build a before/after table.

    Args:
        data: DataFrame with a ``rain_accum`` column. It is not modified.

    Returns:
        A ``(table_data, clean_data)`` tuple. ``table_data`` pairs the original
        ``rain_accum`` values with their 0/1 conversion; ``clean_data`` is a
        copy of ``data`` whose ``rain_accum`` column holds the 0/1 values.
    """
    clean_data = data.copy()
    # 1 where the accumulation is strictly greater than 1, otherwise 0.
    rained = clean_data['rain_accum'].gt(1) * 1
    clean_data['rain_accum'] = rained

    table_data = pd.DataFrame({
        'Before Rain Accumulation': data['rain_accum'],
        'Convered Rain Accumulation': rained,
    })
    return table_data, clean_data

def make_rc_plot():
    """Render a heat map of the pairwise column correlations.

    Reads the module-level ``clean_data`` frame, which is created in the
    ``__main__`` section of this file.

    Returns:
        The matplotlib figure holding the annotated heat map.
    """
    # Restrict to numeric/bool columns first: DataFrame.corr() raises on
    # non-numeric columns in pandas >= 2.0, whereas older versions dropped
    # them silently. For an all-numeric frame this selection is a no-op.
    numeric = clean_data.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()
    fig = plt.figure(figsize=(8,8))
    sns.heatmap(corr,annot=True,cmap='crest',linewidths=0.2)
    return fig

def _whole_number(value):
    """Return ``value`` as an ``int`` when it is a float with no fraction."""
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value


def make_r_plot(libraries, sd,cd,dd,ld,rc,rd,rl,rn,rf):
    """Cross-validate the selected models and plot their per-fold scores.

    Reads the module-level ``X_train`` / ``y_train`` split created in the
    ``__main__`` section of this file.

    Args:
        libraries: Model names to evaluate and draw; each entry must be
            ``"DecisionTree"`` or ``"RandomForest"``.
        sd: ``splitter`` for the decision tree.
        cd: ``criterion`` for the decision tree.
        dd: ``max_depth`` for the decision tree.
        ld: ``min_samples_leaf`` for the decision tree.
        rc: ``criterion`` for the random forest.
        rd: ``max_depth`` for the random forest.
        rl: ``min_samples_leaf`` for the random forest.
        rn: ``n_estimators`` for the random forest.
        rf: ``max_features`` for the random forest.

    Returns:
        The matplotlib figure with one line of 10 fold scores per model.

    Raises:
        KeyError: If ``libraries`` contains an unknown model name.
    """
    sns.set_theme(style="darkgrid")

    # UI sliders may deliver whole numbers as floats. scikit-learn reads a
    # float min_samples_leaf / max_features as a fraction and expects integer
    # depths and estimator counts, so integral floats are converted to int.
    dd, ld, rd, rl, rn, rf = (_whole_number(v) for v in (dd, ld, rd, rl, rn, rf))
    # An integer max_features cannot exceed the number of feature columns.
    if isinstance(rf, int):
        rf = min(rf, X_train.shape[1])

    builders = {
        'RandomForest': lambda: RandomForestClassifier(
            random_state=25, n_estimators=rn, criterion=rc, max_depth=rd,
            min_samples_leaf=rl, max_features=rf),
        'DecisionTree': lambda: DecisionTreeClassifier(
            random_state=25, splitter=sd, criterion=cd, max_depth=dd,
            min_samples_leaf=ld),
    }

    # Only the selected models are cross-validated; scores are computed
    # before the figure is created so a failure leaves no empty figure behind.
    results = [
        (lib, cross_val_score(builders[lib](), X_train, y_train, cv=10))
        for lib in libraries
    ]

    fig = plt.figure(figsize=(10,5))
    for lib, fold_scores in results:
        # Label each line with its own model so the legend always matches
        # what is drawn, whatever the selection or its order.
        plt.plot(fold_scores, marker = 'o', label=lib + ' Score')
    if results:
        plt.legend()

    plt.title("Final Score ")
    plt.ylabel("Socre")
    plt.xlabel("No. Cross Validation")
    return fig

def make_clf_t_plot():
    """Draw a bar chart of the two test scores written in this function.

    Returns:
        The matplotlib figure with one bar per model.
    """
    tick_labels = ['DecisioTree Score','RandomForest Score']
    test_scores = [0.860465,0.883721]
    bar_width = 0.2

    fig = plt.figure(figsize=(16,8))
    # Bar positions, sliced so there is exactly one per score.
    positions = (np.arange(0, 2 * 0.2, 0.2) * 2.5)[0:2]
    plt.bar(positions, test_scores, bar_width, label="Testing Score", color="crimson")
    plt.xticks(positions, tick_labels)
    plt.yticks(np.arange(0, 1, 0.05))
    plt.xlabel("Model")
    plt.ylabel("Test score")
    return fig

def download_clf():
    """Return the relative path of the decision-tree model file.

    Nothing is written here; the function only hands back the path.
    """
    model_path = "./dtc_model.m"
    return model_path

def download_rfc():
    """Return the relative path of the random-forest model file.

    Nothing is written here; the function only hands back the path.
    """
    model_path = "./rfc_model.m"
    return model_path



if __name__ == '__main__':
    # Script entry point: prepares the train/test split, scores the two saved
    # models on the held-out rows and assembles the Gradio page.

    # Binarised rain label plus the before/after preview table shown in the UI.
    table_data, clean_data = make_ra_table(data)

    morning_features = ['air_pressure', 'aver_tem', 'humidity',
        'sunlight_time', 'wind_direction', 'wind_speed']
    feature=clean_data[morning_features].copy()
    label = clean_data['rain_accum'].copy()
    # 10% of the rows are held out; X_train / y_train are module-level names
    # that make_r_plot reads when the "Train" button is pressed.
    X_train,X_test,y_train,y_test = train_test_split(feature,label,test_size=0.1,random_state=324)
    # NOTE(review): joblib.load unpickles the file, so both model files must
    # come from a trusted source. Presumably they were fitted offline with
    # DecisionTreeClassifier(random_state=25) and
    # RandomForestClassifier(random_state=25, n_estimators=11) -- confirm.
    clf = joblib.load("dtc_model.m")
    rfc = joblib.load("rfc_model.m")
    clf_score = clf.score(X_test, y_test)
    rfc_score = rfc.score(X_test, y_test)
    score = pd.DataFrame([[clf_score,rfc_score],['DecisioTree Score','RandomForest Score']],columns=['DecisioTree Score','RandomForest Score'])

    with gr.Blocks() as demo:
        gr.Markdown("""
        ## Data Collection
        We first collect two years (2020-2021) data from [SMG](https://www.smg.gov.mo/zh/subpage/345/embed-path/p/query-weather-c_panel).
        Below table is sample of the data we collocted
        """
        )

        gr.Dataframe(value = data.head(), overflow_row_behaviour='show_ends')
        gr.Markdown("""
                        ## Data pre-procesing: 
                        """)
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                            ### Data Cleaning: 
                            We mark the **"NaN"** data as **"--"** by pandas, 
                            and we find that there is 623 data is avaliable, anther 3 have NaN column.
                            However, it is just 0.3% in whole data, we decide to delete these data
                            """)
                # Plots registered through demo.load are drawn when the page loads.
                demo.load(fn=make_ad_plot, inputs=None, outputs=gr.Plot(label = "Pie Plot"))
            with gr.Column():
                gr.Markdown("""
                            ### Data Type Conversion: 
                            Because Decision Tree only accept discrete feature, we conver the rain accumulation. If Rain Accumulation > 1, we think the weather had rained, else it dosen't.\\
                            Below talbe is our convered data:
                            """)
                gr.Dataframe(value = table_data.head(9), overflow_row_behaviour='show_ends')

        gr.Markdown("""
                        ### Feature Selection: 
                        We choose the columns which have high correlation conefficient as feature, rain accumulation as label.
                        """)
        with gr.Row():
            demo.load(fn=make_rc_plot, inputs=None, outputs=gr.Plot(label = "Feature correlation conefficient"))
            gr.Dataframe(value = feature.head(10))

        gr.Markdown("""
                        ## Model Training: 
                        We use DecisionTree and RandomForest to train our data
                        ### Adjust Hyper-parameter
                        """)
        with gr.Box():
            libraries = gr.CheckboxGroup(choices=["DecisionTree","RandomForest"], label="Select Model to display", value=["DecisionTree","RandomForest"])
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""#### DecisionTree""")
                    sd = gr.Radio(['best','random'],value="best",label="splitter of DecisionTree")
                    cd=gr.Radio(['gini', 'entropy'],value="entropy",label="criterion of Decisiontree")
                    dd = gr.Slider(label="max_depth of Decisiontree", value=4, minimum=1, maximum=10, step=1)
                    ld = gr.Slider(label="min_samples_leaf of Decisiontree", value=1, minimum=1, maximum=50, step=5)
                with gr.Column():
                    gr.Markdown("""#### RandomForest""")

                    rc=gr.Radio(['gini', 'entropy'],value="entropy",label="criterion of RandomForest")
                    rd = gr.Slider(label="max_depth of RandomForest", value=4, minimum=1, maximum=10, step=1)
                    rl=gr.Slider(label="min_samples_leaf of RandomForest", value=10, minimum=1, maximum=50, step=5)
                    rn = gr.Slider(label="n_estimators of RandomForest", value=11, minimum=5, maximum=15, step=1)
                    # The lower bound was previously passed under a misspelled
                    # keyword and never applied. An integer max_features
                    # cannot exceed the number of feature columns, so the
                    # range and default are tied to the feature list.
                    rf = gr.Slider(label="max_features of RandomForest", value=len(morning_features), minimum=1, maximum=len(morning_features), step=1)
            with gr.Row():
                train = gr.Button(value="Train")
            # The input order must match make_r_plot's parameter order.
            train.click(fn=make_r_plot, inputs=[libraries,sd,cd,dd,ld,rc,rd,rl,rn,rf], outputs=gr.Plot(label = "Vaildation Score Plot"))
        gr.Markdown("""
                    ## Testing: 
                    There are the final testing scores
                    """)
        with gr.Row():
            demo.load(fn=make_clf_t_plot, inputs=None, outputs=gr.Plot(label = "Final Score"))
        gr.Markdown("""
                    ## Download Model: 
                    """)
        with gr.Row():
            with gr.Column():
                clf_model = gr.Button(value="Download DecisionTree Model")
                clf_model.click(fn=download_clf, inputs=None, outputs=gr.File(label="DecisionTree Model"))
            with gr.Column():
                rfc_model = gr.Button(value="Download RandomForest Model")
                rfc_model.click(fn=download_rfc, inputs=None, outputs=gr.File(label="RandomForest Model"))
    demo.launch()