File size: 5,836 Bytes
500cc34
 
 
 
 
d2dea50
 
 
 
 
 
 
 
 
9f7f620
d2dea50
 
 
 
 
 
 
 
 
500cc34
 
 
 
 
d2dea50
 
 
 
 
60f7930
d2dea50
 
 
 
9f7f620
d2dea50
 
 
500cc34
 
 
 
d2dea50
 
500cc34
 
d2dea50
 
 
500cc34
 
 
 
 
 
 
ccf7836
 
500cc34
 
 
d2dea50
6ec63bb
d2dea50
9446416
 
 
314bdac
9446416
d2dea50
 
500cc34
 
 
ccf7836
 
500cc34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48c3e65
 
5948e98
 
48c3e65
 
5948e98
 
500cc34
 
 
d2dea50
500cc34
 
b581d09
aa3a0c8
500cc34
f1a3640
500cc34
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import numpy as np
import gradio as gr

import pandas as pd

def _scaled_table(scaled_values, feature_columns, stat_rows):
    """Build the display table for one dataset.

    The table has an ``Instance`` column (0-based row index), one column per
    scaled feature, and the rows of ``stat_rows`` appended at the bottom.
    Feature columns are rounded to 3 decimals.
    """
    table = pd.DataFrame(scaled_values, columns=feature_columns)
    table.insert(0, "Instance", list(range(len(table))))
    table = pd.concat([table, stat_rows], ignore_index=True)
    table[feature_columns] = table[feature_columns].round(3)
    return table


def apply_scaling(method, train_df, test_df):
    """Scale train and test features with statistics fitted on the training data.

    The scaler is fitted on ``train_df`` only and then applied to both tables,
    so the test set is transformed with the training statistics.

    Args:
        method: ``"Min-Max normalization"`` selects ``MinMaxScaler``; any other
            value falls back to ``StandardScaler``.
        train_df: Numeric table (rows = instances, columns = features) used to
            fit the scaler.
        test_df: Numeric table with the same number of feature columns.

    Returns:
        A tuple ``(scaled_train, scaled_test)`` of DataFrames with columns
        ``"Instance"``, ``"Normalized Feature 1"``, ``"Normalized Feature 2"``,
        ... (one per input feature). Each table ends with two extra rows
        showing the statistics used for scaling: min/max for min-max
        normalization, mean/std for standardization. All feature values,
        including the statistics, are rounded to 3 decimals.
    """
    if method == "Min-Max normalization":
        from sklearn.preprocessing import MinMaxScaler

        scaler = MinMaxScaler().fit(train_df)  # learns per-column min and max
        stat_labels = ["Min used for scaling", "Max used for scaling"]
        stat_values = [scaler.data_min_, scaler.data_max_]
    else:
        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler().fit(train_df)  # learns per-column mean and variance
        stat_labels = ["Mean used for scaling", "Std used for scaling"]
        # Report the standard deviation as the square root of the fitted variance.
        stat_values = [scaler.mean_, np.sqrt(scaler.var_)]

    scaled_train = scaler.transform(train_df)
    scaled_test = scaler.transform(test_df)

    # One output column per input feature instead of a hard-coded pair.
    n_features = scaled_train.shape[1]
    feature_columns = [f"Normalized Feature {i}" for i in range(1, n_features + 1)]

    # The same two statistic rows are appended to both output tables.
    stat_rows = pd.DataFrame(stat_values, columns=feature_columns)
    stat_rows.insert(0, "Instance", stat_labels)

    return (
        _scaled_table(scaled_train, feature_columns, stat_rows),
        _scaled_table(scaled_test, feature_columns, stat_rows),
    )


def _fixed_size_table(label, headers, datatype, n_rows):
    """Create a gr.Dataframe with a fixed row count and one fixed column per header."""
    return gr.Dataframe(
        label=label,
        headers=headers,
        datatype=datatype,
        row_count=(n_rows, "fixed"),
        col_count=(len(headers), "fixed"),
    )


# Editable 4x2 table for the training instances.
input_df1 = _fixed_size_table(
    'Training dataset',
    ["Feature 1", "Feature 2"],
    ["number", "number"],
    4,
)

# Scaling method selector (a dropdown rather than a checkbox group or text box).
# "Min-Max normalization" is the string apply_scaling tests for; any other
# choice takes its standardization branch.
input_method = gr.Dropdown(
    ["Min-Max normalization", "Standardization"],
    label="Feature Scaling Method",
    value="Min-Max normalization",
)

# Editable 4x2 table for the test instances.
input_df2 = _fixed_size_table(
    'Test dataset',
    ["Feature 1", "Feature 2"],
    ["number", "number"],
    4,
)

# Output tables have 6 rows: the 4 scaled instances plus the 2 statistic rows
# that apply_scaling appends.
output_df1 = _fixed_size_table(
    'Normalized Training dataset',
    ["Instance", "Feature 1", "Feature 2"],
    ["str", "number", "number"],
    6,
)

output_df2 = _fixed_size_table(
    'Normalized Test dataset',
    ["Instance", "Feature 1", "Feature 2"],
    ["str", "number", "number"],
    6,
)



def _two_feature_frame(rows):
    """Wrap a list of [feature 1, feature 2] rows in a labelled DataFrame."""
    return pd.DataFrame(np.array(rows), columns=["Feature 1", "Feature 2"])


# Example dataset pair 1.
train_df1 = _two_feature_frame([[3, 10], [6, 20], [3, 5], [8, 12]])
test_df1 = _two_feature_frame([[2, 7], [1, 9], [3, 6], [4, 12]])

# Example dataset pair 2.
train_df2 = _two_feature_frame([[1, 10], [3, 20], [4, 5], [5, 12]])
test_df2 = _two_feature_frame([[2, 11], [3, 12], [4, 7], [2, 18]])

# Example dataset pair 3.
train_df3 = _two_feature_frame([[2, 10], [4, 20], [4, 5], [5, 12]])
test_df3 = _two_feature_frame([[3, 11], [1, 12], [4, 7], [2, 18]])

### Configure and start the Gradio app
_APP_TITLE = "CSCI4750/5750(hw01-PartI): Mathematics for KNN (Question 5: Feature Scaling)"
_APP_DESCRIPTION = "Click examples below for a quick demo, or change input values by clicking cells"

# Inputs are listed in the order of apply_scaling's parameters
# (method, train_df, test_df); the two outputs receive the pair of tables it
# returns (training table first, then test table).
interface = gr.Interface(
    fn=apply_scaling,
    inputs=[input_method, input_df1, input_df2],
    outputs=[output_df1, output_df2],
    examples=[["Min-Max normalization", train_df2, test_df2]],
    title=_APP_TITLE,
    description=_APP_DESCRIPTION,
    theme='huggingface',
)

interface.launch(debug=True)