anly656 committed on
Commit
932986d
·
verified ·
1 Parent(s): d112001

Upload IntervalDecisionTree_Template.py

Browse files
Decision_Tree/IntervalDecisionTree_Template.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Updated on Oct 2, 2025
@purpose: Decision Tree Example for Interval Targets
@data: Fracking Oil Production in Texas, n=4752 with 13 features (2 Nominal)
@author: eJones
@email: eJones@tamu.edu
"""
# ANSI 256-color escape sequences used to colorize console output below.
# NOTE(review): plain ANSI codes need no extra package on most terminals;
# colorama is presumably only needed for older Windows consoles -- confirm.
RED = "\033[38;5;197m"     # warnings / highlighted metric values
GOLD = "\033[38;5;185m"    # section banners and labels
TEAL = "\033[38;5;50m"     # normal emphasis text
GREEN = "\033[38;5;82m"    # boundary banner label line
RESET = "\033[0m"          # restore the terminal's default color
17
+ import pandas as pd
18
+ import numpy as np
19
+ from AdvancedAnalytics.ReplaceImputeEncode import DT, ReplaceImputeEncode
20
+ from AdvancedAnalytics.Tree import tree_regressor
21
+ from sklearn.tree import DecisionTreeRegressor
22
+ from sklearn.model_selection import train_test_split, cross_validate
23
+ from sklearn.metrics import mean_squared_error
24
+ from copy import deepcopy
25
+
26
# Data map consumed by ReplaceImputeEncode: column name -> [type, valid values].
# For DT.Interval the tuple is the (low, high) plausible range; for DT.Nominal
# it is the tuple of admissible level codes.
data_map = {
    "Log_Cum_Production": [DT.Interval, (8, 15)],   # interval target (see Step 2)
    "Log_Proppant_LB": [DT.Interval, (6, 18)],
    "Log_Carbonate": [DT.Interval, (-4, 4)],
    "Log_Frac_Fluid_GL": [DT.Interval, (7, 18)],
    "Log_GrossPerforatedInterval": [DT.Interval, (4, 9)],
    "Log_LowerPerforation_xy": [DT.Interval, (8, 10)],
    "Log_UpperPerforation_xy": [DT.Interval, (8, 10)],
    "Log_TotalDepth": [DT.Interval, (8, 10)],
    "N_Stages": [DT.Interval, (2, 14)],
    "X_Well": [DT.Interval, (-100, -95)],           # longitude range (Texas)
    "Y_Well": [DT.Interval, (30, 35)],              # latitude range (Texas)
    "Operator": [DT.Nominal, tuple(range(1, 29))],  # 28 operator codes
    "County": [DT.Nominal, tuple(range(1, 15))]     # 14 county codes
}
42
def print_boundary(lbl, b_width=60):
    """Print a colored three-line banner with lbl centered between '*' fills.

    Parameters
    ----------
    lbl : str
        Section label shown on the middle line.
    b_width : int, optional (default 60)
        Total width of the '=' rules; added parameter is backward compatible
        with the original hard-coded width of 60.
    """
    print("")
    # -2 accounts for the two spaces print() inserts around lbl below.
    margin = b_width - len(lbl) - 2
    # Split the '*' padding evenly; any odd leftover star goes on the left
    # (replaces the original int(margin/2) + manual "+1" remainder patch).
    rmargin = margin // 2
    lmargin = margin - rmargin
    print(f"{TEAL}", "="*b_width, f"{RESET}")
    print(f"{GREEN}", lmargin*"*", lbl, rmargin*"*"+f"{RESET}")
    print(f"{TEAL}", "="*b_width, f"{RESET}")
54
# Echo the data map so the analyst can verify every column's declared
# type and valid-value specification before preprocessing.
print(f"{GOLD}")
print(15*"=", "DATA MAP", 15*"=")
# Dot-fill field width: one more than the longest column name.
lk = max(len(name) for name in data_map) + 1
# Columns typed ID/Ignore are not model attributes; count them up front.
ignored = sum(1 for role, _ in data_map.values()
              if role.name in ("ID", "Ignore"))
for col, (dt_type, valid_values) in data_map.items():
    print(f" {TEAL}{col:.<{lk}s} {GOLD}{dt_type.name:9s}{GREEN}{valid_values}")
print(f"{GOLD} === Data Map has{RED}", len(data_map)-ignored,
      f"{GOLD}attribute columns", 3*"=",f"{RESET}")
65
# ---- Step 1: load the raw oil-production data --------------------------
lbl = "Step 1: Read Data"
print_boundary(lbl)
# Read the oil production file using pandas.
df = pd.read_csv("../data/OilProduction.csv")
print(f"Read {df.shape[0]} observations with {df.shape[1]} attributes\n")
71
lbl = "Step 2: ReplaceImputeEncode (RIE) Processing"
print_boundary(lbl)

# Interval target predicted by the regression tree in Steps 3-4.
target = "Log_Cum_Production"
print(f"{GOLD}")
# Apply ReplaceImputeEncode preprocessing (full one-hot encoding retained).
rie = ReplaceImputeEncode(data_map=data_map,
                          interval_scale=None, # No standardization of interval features
                          no_impute=[target], # Do not impute target variable
                          binary_encoding="one-hot",
                          nominal_encoding="one-hot",
                          drop=False, # Keep ALL one-hot columns (nothing dropped)
                          display=True)
# Transform the data
encoded_df = rie.fit_transform(df)

# Second encoding that DROPS one column from each one-hot set, suitable for
# stepwise/regression models that must avoid the dummy-variable trap.
rie = ReplaceImputeEncode(data_map=data_map,
                          interval_scale=None,
                          no_impute=[target],
                          binary_encoding="one-hot",
                          nominal_encoding="one-hot",
                          drop=True, # Drop one column from each encoded nominal set
                          display=False)
encoded_drp_df = rie.fit_transform(df)

print(f"{RESET}")
print(f"\n{RED}encoded_drp_df{RESET}:",
      f"{encoded_drp_df.shape[0]} cases and",
      f"{encoded_drp_df.shape[1]} columns,\n",
      " including targets, excludes last one-hot columns.")

print(f"\n{RED}encoded_df {RESET}:",
      f"{encoded_df.shape[0]} cases and",
      f"{encoded_df.shape[1]} columns,\n",
      " including targets.")
print(f"{RESET}")
109
#***************************************************************************
#*************** Decision Tree Hyperparameter Grid Search ******************
# NOTE(review): the original banner said "Logistic Regression" -- a leftover
# from the template this script was copied from; this step fits trees.
lbl = " STEP 3: Decision Tree Hyperparameter Optimization"
print_boundary(lbl)
# X/y from the full (undropped) encoding; trees are not harmed by the
# redundant one-hot columns.
y = encoded_df[target]
X = encoded_df.drop(target, axis=1)

# Complexity grid: None means unlimited depth.
candidate_depths = [5, 6, 7, 8, 9, 10, 11, 12, 15, None]
candidate_leafs = [25, 30, 35, 47]
best_metric = np.inf
metric = 'neg_mean_squared_error' # In Sklearn this is -ASE (reused in Step 4)

# 70/30 train/validation split with a fixed seed for reproducibility.
Xt, Xv, yt, yv = train_test_split(X, y, train_size=0.7, random_state=31415)
# Hyperparameter optimization over the depth x leaf grid.
for depth in candidate_depths:
    for leaf in candidate_leafs:
        # Pair min_samples_split to the leaf size: a split needs two leafs.
        split = 2*leaf
        dt = DecisionTreeRegressor(max_depth=depth,
                                   min_samples_split=split,
                                   min_samples_leaf=leaf,
                                   random_state=31415)
        dt = dt.fit(Xt,yt)
        train_pred = dt.predict(Xt)
        train_ase = mean_squared_error(yt, train_pred)
        val_pred = dt.predict(Xv)
        val_ase = mean_squared_error(yv, val_pred)
        # Overfitting indicator: validation error relative to training error.
        ratio = val_ase/train_ase

        # Flag in red any combination whose validation ASE is >=20% above train.
        if ratio >= 1.2:
            color = RED
        else:
            color = TEAL
        print(f"{TEAL}")
        print("Maximum Depth=", f"{GOLD}{depth}{TEAL}",
              "Min Leaf Size=", f"{GOLD}{leaf}{TEAL}")
        print(f"Train ASE:{train_ase:7.4f} Validation ASE:{RED}{val_ase:7.4f}",
              f"{TEAL}Ratio:{color}{ratio:7.4f}{RESET}")
        # Model selection criterion: lowest validation ASE.
        if val_ase < best_metric:
            best_metric = val_ase
            best_depth = depth
            best_leaf = leaf
            best_ratio = ratio
            best_tree = deepcopy(dt)

print(f"{GOLD}")
tree_regressor.display_split_metrics(best_tree, Xt, yt, Xv, yv)
if best_ratio >= 1.2:
    color = RED
else:
    color = TEAL
print(f"\nOverfitting Ratio Val_ase/Train_ase: {color}{best_ratio:7.4f}{TEAL}")
tree_regressor.display_importance(best_tree, X.columns, top=10, plot=True)
163
# Validate the hyperparameters chosen in Step 3 with K-fold cross-validation,
# scanning K = 2..10 for the fold count with the lowest mean validation ASE.
lbl = " STEP 4: Decision Tree K-Fold Cross Validation"
print_boundary(lbl)

best_metric = np.inf
best_split = 2*best_leaf   # loop-invariant: hoisted out of the K scan
for k in range(2, 11):
    dt = DecisionTreeRegressor(max_depth=best_depth,
                               min_samples_split=best_split,
                               min_samples_leaf=best_leaf,
                               random_state=31415)
    scores = cross_validate(dt, X, y,
                            scoring=metric,
                            cv=k, return_train_score=True )
    print(f"\n{GOLD}Decision Tree K-Fold CV with K={k}")
    print("{:.<18s}{:>6s}{:>13s}".format("Metric", "Mean", "Std. Dev."))
    # Sklearn returns negated MSE, so negate again to recover ASE.
    mean = -scores["test_score"].mean()
    std = scores["test_score"].std()
    print("{:.<18s}{:>7.4f}{:>10.4f}".format("ASE", mean, std))
    if mean < best_metric:
        best_fold = k
        best_metric = mean
        best_std = std
        train_mean = -scores["train_score"].mean()
        train_std = scores["train_score"].std()
        best_ratio = best_metric/train_mean
        # NOTE(review): cross_validate fits clones, so this dt is unfitted;
        # kept only to record the selected hyperparameter configuration.
        best_tree = deepcopy(dt)

print(f"{TEAL}")
if best_ratio >= 1.2:
    color = RED
else:
    color = TEAL
print("Maximum Depth=", f"{GOLD}{best_depth}{TEAL}",
      "Min Leaf Size=", f"{GOLD}{best_leaf}{TEAL}",
      "Best Fold=", f"{GOLD}{best_fold}{TEAL}")
# BUG FIX: the original reprinted train_ase/val_ase/ratio, stale values left
# over from the LAST iteration of the Step 3 grid search.  Report the
# cross-validation statistics of the selected fold count instead.
print(f"Train ASE:{train_mean:7.4f} Validation ASE:{RED}{best_metric:7.4f}",
      f"{TEAL}Ratio:{color}{best_ratio:7.4f}{RESET}")
# Refit the selected configuration on ALL data for the final report.
dt = DecisionTreeRegressor(max_depth=best_depth,
                           min_samples_leaf=best_leaf,
                           min_samples_split=2*best_leaf,
                           random_state=31415)
dt = dt.fit(X,y)
print(f"{GOLD}")
tree_regressor.display_metrics(dt, X, y)
tree_regressor.display_importance(dt, X.columns, top=10)
print(f"{RESET}")