File size: 8,532 Bytes
ba0505f
 
 
 
 
 
 
 
 
 
 
bdc6991
ba0505f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f05d85e
 
 
 
 
 
 
 
 
 
 
 
bdc6991
a4b03e0
bdc6991
 
 
a4b03e0
bdc6991
 
a4b03e0
ba0505f
 
a4b03e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba0505f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdc6991
 
ba0505f
bdc6991
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot as plt
import seaborn as sns
import gradio as gr
import numpy as np
import joblib

# Load the weather export, treating "--" cells as missing, and discard the
# 'num' column.
missing_values = ["--"]
data = pd.read_csv('./macau_weather.csv', na_values=missing_values).drop(columns=['num'])

# Drop every row that contains a missing value and remember the row counts
# on either side so the share of kept/removed rows can be reported.
before_rows = len(data)
data = data.dropna()
after_rows = len(data)

# Percentage of rows kept (adp) and percentage of rows removed (edp).
adp = after_rows / before_rows * 100
edp = (before_rows - after_rows) / before_rows * 100

def make_ad_plot():
    """Build a pie chart of the kept versus removed row percentages.

    Reads the module-level ``adp`` and ``edp`` values.

    Returns:
        The newly created matplotlib figure.
    """
    fig = plt.figure()
    plt.title("Percentage of available data")
    palette = sns.color_palette("Paired")
    shares = [adp, edp]
    # Both wedges are pulled out from the centre by the same offset.
    plt.pie(shares, colors=palette, autopct='%0.0f%%', explode=[0.2, 0.2])
    return fig

def make_ra_table(data):
    """Binarise the ``rain_accum`` column and build a before/after table.

    Args:
        data: DataFrame with a ``rain_accum`` column. It is not modified.

    Returns:
        A ``(table_data, clean_data)`` tuple. ``clean_data`` is a copy of
        ``data`` whose ``rain_accum`` is 1 where the original value is
        greater than 1 and 0 otherwise. ``table_data`` holds the original
        and converted ``rain_accum`` values side by side.
    """
    original_accum = data['rain_accum']

    clean_data = data.copy()
    # Multiplying the boolean mask by 1 turns True/False into 1/0.
    clean_data['rain_accum'] = original_accum.gt(1) * 1

    table_data = pd.DataFrame({
        'Before Rain Accumulation': original_accum,
        'Convered Rain Accumulation': clean_data['rain_accum'],
    })
    return table_data, clean_data

def make_rc_plot():
    """Build an annotated heatmap of the correlations in ``clean_data``.

    Reads the module-level ``clean_data`` frame.

    Returns:
        The newly created 8x8 matplotlib figure.
    """
    correlations = clean_data.corr()
    fig = plt.figure(figsize=(8, 8))
    sns.heatmap(correlations, annot=True, cmap='crest', linewidths=0.2)
    return fig

def _whole_number(value):
    """Return ``value`` as an ``int`` when it is a float holding a whole number.

    Any other value (ints, fractional floats, strings, ``None``) is returned
    unchanged.
    """
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value


def make_r_plot(libraries, sd,cd,dd,ld,rc,rd,rl,rn,rf):
    """Cross-validate the selected models and plot their per-fold scores.

    Uses the module-level ``X_train`` / ``y_train`` split with 10-fold
    cross-validation.

    Args:
        libraries: Names of the models to draw; each entry must be
            ``"DecisionTree"`` or ``"RandomForest"``.
        sd: ``splitter`` for the decision tree.
        cd: ``criterion`` for the decision tree.
        dd: ``max_depth`` for the decision tree.
        ld: ``min_samples_leaf`` for the decision tree.
        rc: ``criterion`` for the random forest.
        rd: ``max_depth`` for the random forest.
        rl: ``min_samples_leaf`` for the random forest.
        rn: ``n_estimators`` for the random forest.
        rf: ``max_features`` for the random forest.

    Returns:
        The matplotlib figure with one line per selected model.

    Raises:
        KeyError: If ``libraries`` contains an unknown model name.
    """
    sns.set_theme(style="darkgrid")

    # UI sliders may deliver whole numbers as floats; scikit-learn treats
    # floats as fractions (or rejects them) for these parameters.
    dd, ld, rd, rl, rn, rf = (_whole_number(v) for v in (dd, ld, rd, rl, rn, rf))

    # An integer max_features larger than the number of training columns is
    # meaningless, so cap it at the column count.
    if isinstance(rf, (int, np.integer)) and not isinstance(rf, bool):
        rf = min(rf, X_train.shape[1])

    estimators = {
        'RandomForest': RandomForestClassifier(
            random_state=25,
            n_estimators=rn,
            criterion=rc,
            max_depth=rd,
            min_samples_leaf=rl,
            max_features=rf,
        ),
        'DecisionTree': DecisionTreeClassifier(
            random_state=25,
            splitter=sd,
            criterion=cd,
            max_depth=dd,
            min_samples_leaf=ld,
        ),
    }

    # Only cross-validate the models that will actually be drawn, and do it
    # before the figure exists so a failure does not leave a figure behind.
    fold_scores = {
        lib: cross_val_score(estimators[lib], X_train, y_train, cv=10)
        for lib in libraries
    }

    # Draw on an explicit Axes instead of pyplot's implicit current figure.
    fig, ax = plt.subplots(figsize=(10, 5))
    for lib in libraries:
        # Label each line with its own model name so the legend is correct
        # for any selection and ordering.
        ax.plot(fold_scores[lib], marker='o', label=f"{lib} Score")
    if libraries:
        ax.legend()

    ax.set_title("Final Score ")
    ax.set_ylabel("Socre")
    ax.set_xlabel("No. Cross Validation")
    return fig

def make_clf_t_plot():
    """Build a bar chart of the two fixed test scores.

    The plotted scores are constants written into this function; nothing is
    computed from a model here.

    Returns:
        The newly created 16x8 matplotlib figure.
    """
    model_names = ['DecisioTree Score', 'RandomForest Score']
    test_scores = [0.860465, 0.883721]
    bar_width = 0.2

    fig = plt.figure(figsize=(16, 8))
    # Two bar positions, spaced 2.5 bar-widths apart.
    positions = np.arange(0, 2 * bar_width, bar_width)[:2] * 2.5
    plt.bar(positions, test_scores, bar_width, label="Testing Score", color="crimson")
    plt.xticks(positions, model_names)
    plt.yticks(np.arange(0, 1, 0.05))
    plt.xlabel("Model")
    plt.ylabel("Test score")
    return fig

def download_clf():
    """Return the relative path of the decision-tree model file."""
    model_path = "./dtc_model.m"
    return model_path

def download_rfc():
    """Return the relative path of the random-forest model file."""
    model_path = "./rfc_model.m"
    return model_path



if __name__ == '__main__':
    # Script entry point: prepare the train/test split, load the saved
    # models, then build and launch the Gradio page. The names assigned here
    # (clean_data, X_train, y_train, ...) are module globals that the plot
    # callbacks above read.

    table_data, clean_data = make_ra_table(data)

    morning_features = ['air_pressure', 'aver_tem', 'humidity',
        'sunlight_time', 'wind_direction', 'wind_speed']
    feature = clean_data[morning_features].copy()
    label = clean_data['rain_accum'].copy()
    X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.1, random_state=324)

    # NOTE: joblib.load unpickles the file, so these must be trusted artifacts.
    clf = joblib.load("dtc_model.m")  # DecisionTreeClassifier(random_state=25)
    rfc = joblib.load("rfc_model.m")  # RandomForestClassifier(random_state=25, n_estimators=11)
    clf_score = clf.score(X_test, y_test)
    rfc_score = rfc.score(X_test, y_test)
    # Not referenced by any of the visible UI code below.
    score = pd.DataFrame([[clf_score,rfc_score],['DecisioTree Score','RandomForest Score']],columns=['DecisioTree Score','RandomForest Score'])

    with gr.Blocks() as demo:
        gr.Markdown("""
        ## Data Collection
        We first collect two years (2020-2021) data from [SMG](https://www.smg.gov.mo/zh/subpage/345/embed-path/p/query-weather-c_panel).
        Below table is sample of the data we collocted
        """
        )

        gr.Dataframe(value = data.head(), overflow_row_behaviour='show_ends')
        gr.Markdown("""
                        ## Data pre-procesing: 
                        """)
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                            ### Data Cleaning: 
                            We mark the **"NaN"** data as **"--"** by pandas, 
                            and we find that there is 623 data is avaliable, anther 3 have NaN column.
                            However, it is just 0.3% in whole data, we decide to delete these data
                            """)
                demo.load(fn=make_ad_plot, inputs=None, outputs=gr.Plot(label = "Pie Plot"))
            with gr.Column():
                gr.Markdown("""
                            ### Data Type Conversion: 
                            Because Decision Tree only accept discrete feature, we conver the rain accumulation. If Rain Accumulation > 1, we think the weather had rained, else it dosen't.\\
                            Below talbe is our convered data:
                            """)
                gr.Dataframe(value = table_data.head(9), overflow_row_behaviour='show_ends')

        gr.Markdown("""
                        ### Feature Selection: 
                        We choose the columns which have high correlation conefficient as feature, rain accumulation as label.
                        """)
        with gr.Row():
            demo.load(fn=make_rc_plot, inputs=None, outputs=gr.Plot(label = "Feature correlation conefficient"))
            gr.Dataframe(value = feature.head(10))

        gr.Markdown("""
                        ## Model Training: 
                        We use DecisionTree and RandomForest to train our data
                        ### Adjust Hyper-parameter
                        """)
        with gr.Box():
            libraries = gr.CheckboxGroup(choices=["DecisionTree","RandomForest"], label="Select Model to display", value=["DecisionTree","RandomForest"])
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""#### DecisionTree""")
                    sd = gr.Radio(['best','random'],value="best",label="splitter of DecisionTree")
                    cd = gr.Radio(['gini', 'entropy'],value="entropy",label="criterion of Decisiontree")
                    dd = gr.Slider(label="max_depth of Decisiontree", value=4, minimum=1, maximum=10, step=1)
                    ld = gr.Slider(label="min_samples_leaf of Decisiontree", value=1, minimum=1, maximum=50, step=5)
                with gr.Column():
                    gr.Markdown("""#### RandomForest""")

                    rc = gr.Radio(['gini', 'entropy'],value="entropy",label="criterion of RandomForest")
                    rd = gr.Slider(label="max_depth of RandomForest", value=4, minimum=1, maximum=10, step=1)
                    rl = gr.Slider(label="min_samples_leaf of RandomForest", value=10, minimum=1, maximum=50, step=5)
                    rn = gr.Slider(label="n_estimators of RandomForest", value=11, minimum=5, maximum=15, step=1)
                    # The keyword was previously misspelled as `minimun`, and
                    # the range went up to 30 although only
                    # len(morning_features) columns are used for training.
                    # Bound the slider by the real column count instead.
                    rf = gr.Slider(label="max_features of RandomForest", value=len(morning_features), minimum=1, maximum=len(morning_features), step=1)
            with gr.Row():
                train = gr.Button(value="Train")
            train.click(fn=make_r_plot, inputs=[libraries,sd,cd,dd,ld,rc,rd,rl,rn,rf], outputs=gr.Plot(label = "Vaildation Score Plot"))
        gr.Markdown("""
                    ## Testing: 
                    There are the final testing scores
                    """)
        with gr.Row():
            demo.load(fn=make_clf_t_plot, inputs=None, outputs=gr.Plot(label = "Final Score"))
        gr.Markdown("""
                    ## Download Model: 
                    """)    
        with gr.Row():
            with gr.Column():
                clf_model = gr.Button(value="Download DecisionTree Model")
                clf_model.click(fn=download_clf, inputs=None, outputs=gr.File(label="DecisionTree Model"))
            with gr.Column():
                rfc_model = gr.Button(value="Download RandomForest Model")
                rfc_model.click(fn=download_rfc, inputs=None, outputs=gr.File(label="RandomForest Model"))
    demo.launch()