prakharg24 commited on
Commit
95bf9d2
·
verified ·
1 Parent(s): bfc5666

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +177 -86
utils.py CHANGED
@@ -67,123 +67,214 @@ def add_red_text(text_to_display):
67
 
68
  # Define pipeline stages
69
  pipeline_data = {
70
- "Data Collection": {
71
- "explain_text": "**Data Collection:** Decisions about what data to collect and how.",
72
- "Data Sources": {
73
- "explain_text": "**Data Sources:** What data sources will be used to collect data?",
74
- "sub_decisions": ["Collect existing dataset or new sensor data?", "Public datasets or Private datasets?", "Design Web Scraping or use APIs?"]
 
 
 
 
75
  },
76
- "Data Usage": {
77
- "explain_text": "**Data Usage:** How should the data be used, given any license or permission constraints?",
78
- "sub_decisions": ["Ethical concerns to be addressed?", "Commercial use policies?", "Geographic limits?"]
 
 
 
 
79
  },
80
- "Data Quality": {
81
- "explain_text": "**Data Quality:** What kind of quality checks are done to decide data collection?",
82
- "sub_decisions": ["Missing value checks to see if critical field are affected?", "Potential duplicates?", "Format consistency and encoding issues?"]
 
 
 
 
83
  },
84
- "Data Sampling": {
85
- "explain_text": "**Data Sampling:** How to sample from a potentially bigger data source?",
86
- "sub_decisions": ["Random sampling/stratified sampling/cluster sampling?", "Sample size?", "Potential imbalance?", "Additional synthetic data?"]
 
 
 
 
 
87
  },
88
- "Data Storage": {
89
- "explain_text": "**Data Storage:** How and where to store the data?",
90
- "sub_decisions": ["Backup frequency?", "File format choice?"]
 
 
 
91
  },
92
  },
93
 
94
- "Data Processing": {
95
- "explain_text": "**Data Processing:** Decisions about how to process and prepare the data.",
96
- "Data Cleaning": {
97
- "explain_text": "**Data Cleaning:** How should raw data be cleaned and standardized?",
98
- "sub_decisions": ["How to handle missing values?", "How to detect/remove duplicates?", "How to fix formatting errors?"]
 
 
 
 
99
  },
100
- "Feature Selection": {
101
- "explain_text": "**Feature Selection:** Which features should be included in the model?",
102
- "sub_decisions": ["Manual vs automated selection?", "How to check for data leakage?", "Should dimensionality reduction be applied?"]
 
 
 
 
103
  },
104
- "Feature Engineering": {
105
- "explain_text": "**Feature Engineering:** How to create or transform features for better performance?",
106
- "sub_decisions": ["What new features should be created?", "How to combine existing features?", "How to encode categorical variables?"]
 
 
 
 
107
  },
108
- "Outlier Handling": {
109
- "explain_text": "**Outlier Handling:** How to deal with unusual or extreme data points?",
110
- "sub_decisions": ["Which detection method to use (Z-score, IQR, clustering)?", "Remove, cap, or keep outliers?"]
 
 
 
111
  },
112
- "Data Scaling": {
113
- "explain_text": "**Data Scaling:** How to scale or transform features before modeling?",
114
- "sub_decisions": ["Should Min-Max or Standard scaling be applied?", "Is log or Box-Cox transformation needed?"]
 
 
 
115
  }
116
  },
117
 
118
- "Model Selection": {
119
- "explain_text": "**Model Selection:** Decisions about which model to train and the hyperparameter choices.",
120
- "Model Architecture": {
121
- "explain_text": "**Model Architecture:** Which type of model is best suited to the problem?",
122
- "sub_decisions": ["Linear vs tree-based vs neural networks?", "How interpretable should the model be?", "What are computational constraints?"]
 
 
 
 
123
  },
124
- "Baseline Model": {
125
- "explain_text": "**Baseline Model:** What simple models can set a performance baseline?",
126
- "sub_decisions": ["Should a logistic regression or decision tree be used?", "What baseline metric is most relevant?"]
 
 
 
127
  },
128
- "Pre-trained Models": {
129
- "explain_text": "**Pre-trained Models:** Can existing models be leveraged?",
130
- "sub_decisions": ["Which pre-trained models are relevant (image, NLP, tabular)?", "Fine-tune or use as feature extractors?"]
 
 
 
131
  },
132
- "Hyperparameters": {
133
- "explain_text": "**Hyperparameters:** How to optimize model hyperparameters?",
134
- "sub_decisions": ["Grid search vs random search vs Bayesian?", "How many trials and folds to run?", "What budget or time limit applies?"]
 
 
 
 
135
  },
136
- "Model Complexity": {
137
- "explain_text": "**Model Complexity:** Is the model efficient enough for deployment?",
138
- "sub_decisions": ["How many parameters and FLOPs?", "What is memory usage and latency?", "Are there deployment constraints (edge vs cloud)?"]
 
 
 
 
139
  }
140
  },
141
 
142
- "Model Training": {
143
- "explain_text": "**Model Training:** Decisions about the training algorithm used.",
144
- "Data Splitting": {
145
- "explain_text": "**Data Splitting:** How should data be divided for training and testing?",
146
- "sub_decisions": ["Train-test split ratio?", "Cross-validation vs stratified split?"]
 
 
 
147
  },
148
- "Loss Function": {
149
- "explain_text": "**Loss Function:** Which loss function aligns with the task?",
150
- "sub_decisions": ["MSE vs MAE vs cross-entropy?", "Is robustness to outliers needed?", "Does it align with evaluation metrics?"]
 
 
 
 
151
  },
152
- "Optimization Method": {
153
- "explain_text": "**Optimization Method:** Which optimization algorithm should be used?",
154
- "sub_decisions": ["SGD vs Adam vs RMSProp?", "What learning rate schedule?", "What batch size?"]
 
 
 
 
155
  },
156
- "Regularization": {
157
- "explain_text": "**Regularization:** How to prevent overfitting?",
158
- "sub_decisions": ["L1 vs L2 regularization?", "Dropout rate?", "Should early stopping be applied?"]
 
 
 
 
159
  },
160
- "Training Monitoring": {
161
- "explain_text": "**Training Monitoring:** How to track and manage training progress?",
162
- "sub_decisions": ["Which metrics should be monitored?", "How often to checkpoint models?"]
 
 
 
163
  }
164
  },
165
 
166
- "Model Evaluation": {
167
- "explain_text": "**Model Evaluation:** Decisions about the evaluation criteria.",
168
- "Evaluation Metric": {
169
- "explain_text": "**Evaluation Metric:** Which metrics best reflect model performance?",
170
- "sub_decisions": ["Accuracy vs Precision/Recall/F1?", "How to handle class imbalance?", "Including probabilistic metrics (AUC, log loss)?"]
 
 
 
 
171
  },
172
- "Test Data": {
173
- "explain_text": "**Test Data:** How should testing be performed?",
174
- "sub_decisions": ["Hold-out set vs cross-validation?", "An external test dataset?"]
 
 
 
175
  },
176
- "Fairness": {
177
- "explain_text": "**Fairness:** How to ensure fairness across groups?",
178
- "sub_decisions": ["Which fairness metric to use (demographic parity, equalized odds)?", "How to detect bias in predictions?"]
 
 
 
179
  },
180
- "Robustness": {
181
- "explain_text": "**Robustness:** How reliable is the model under stress?",
182
- "sub_decisions": ["How does the model handle noisy inputs?", "How to test against distribution shifts?"]
 
 
 
183
  },
184
- "Interpretability": {
185
- "explain_text": "**Interpretability:** How understandable are the model predictions?",
186
- "sub_decisions": ["Which methods to use (feature importance, SHAP, LIME)?", "How stable are explanations?", "Are explanations actionable for stakeholders?"]
 
 
 
 
187
  }
188
  }
189
- }
 
67
 
68
  # Define pipeline stages
69
  pipeline_data = {
70
+ "πŸ“₯ Data Collection": {
71
+ "explain_text": "**πŸ“₯ Data Collection:** Decisions about what data to collect and how.",
72
+ "πŸ“Š Data Sources": {
73
+ "explain_text": "**πŸ“Š Data Sources:** What data sources will be used to collect data?",
74
+ "sub_decisions": [
75
+ "Collect existing dataset or new sensor data?",
76
+ "Public datasets or Private datasets?",
77
+ "Design Web Scraping or use APIs?"
78
+ ]
79
  },
80
+ "πŸ“œ Data Usage": {
81
+ "explain_text": "**πŸ“œ Data Usage:** How should the data be used, given any license or permission constraints?",
82
+ "sub_decisions": [
83
+ "Ethical concerns to be addressed?",
84
+ "Commercial use policies?",
85
+ "Geographic limits?"
86
+ ]
87
  },
88
+ "🧹 Data Quality": {
89
+ "explain_text": "**🧹 Data Quality:** What kind of quality checks are done to decide data collection?",
90
+ "sub_decisions": [
91
+ "Missing value checks to see if critical field are affected?",
92
+ "Potential duplicates?",
93
+ "Format consistency and encoding issues?"
94
+ ]
95
  },
96
+ "🎲 Data Sampling": {
97
+ "explain_text": "**🎲 Data Sampling:** How to sample from a potentially bigger data source?",
98
+ "sub_decisions": [
99
+ "Random sampling/stratified sampling/cluster sampling?",
100
+ "Sample size?",
101
+ "Potential imbalance?",
102
+ "Additional synthetic data?"
103
+ ]
104
  },
105
+ "πŸ’Ύ Data Storage": {
106
+ "explain_text": "**πŸ’Ύ Data Storage:** How and where to store the data?",
107
+ "sub_decisions": [
108
+ "Backup frequency?",
109
+ "File format choice?"
110
+ ]
111
  },
112
  },
113
 
114
+ "βš™οΈ Data Processing": {
115
+ "explain_text": "**βš™οΈ Data Processing:** Decisions about how to process and prepare the data.",
116
+ "🧽 Data Cleaning": {
117
+ "explain_text": "**🧽 Data Cleaning:** How should raw data be cleaned and standardized?",
118
+ "sub_decisions": [
119
+ "How to handle missing values?",
120
+ "How to detect/remove duplicates?",
121
+ "How to fix formatting errors?"
122
+ ]
123
  },
124
+ "🎯 Feature Selection": {
125
+ "explain_text": "**🎯 Feature Selection:** Which features should be included in the model?",
126
+ "sub_decisions": [
127
+ "Manual vs automated selection?",
128
+ "How to check for data leakage?",
129
+ "Should dimensionality reduction be applied?"
130
+ ]
131
  },
132
+ "πŸ”§ Feature Engineering": {
133
+ "explain_text": "**πŸ”§ Feature Engineering:** How to create or transform features for better performance?",
134
+ "sub_decisions": [
135
+ "What new features should be created?",
136
+ "How to combine existing features?",
137
+ "How to encode categorical variables?"
138
+ ]
139
  },
140
+ "🚨 Outlier Handling": {
141
+ "explain_text": "**🚨 Outlier Handling:** How to deal with unusual or extreme data points?",
142
+ "sub_decisions": [
143
+ "Which detection method to use (Z-score, IQR, clustering)?",
144
+ "Remove, cap, or keep outliers?"
145
+ ]
146
  },
147
+ "πŸ“ Data Scaling": {
148
+ "explain_text": "**πŸ“ Data Scaling:** How to scale or transform features before modeling?",
149
+ "sub_decisions": [
150
+ "Should Min-Max or Standard scaling be applied?",
151
+ "Is log or Box-Cox transformation needed?"
152
+ ]
153
  }
154
  },
155
 
156
+ "πŸ€– Model Selection": {
157
+ "explain_text": "**πŸ€– Model Selection:** Decisions about which model to train and the hyperparameter choices.",
158
+ "πŸ—οΈ Model Architecture": {
159
+ "explain_text": "**πŸ—οΈ Model Architecture:** Which type of model is best suited to the problem?",
160
+ "sub_decisions": [
161
+ "Linear vs tree-based vs neural networks?",
162
+ "How interpretable should the model be?",
163
+ "What are computational constraints?"
164
+ ]
165
  },
166
+ "πŸ“‰ Baseline Model": {
167
+ "explain_text": "**πŸ“‰ Baseline Model:** What simple models can set a performance baseline?",
168
+ "sub_decisions": [
169
+ "Should a logistic regression or decision tree be used?",
170
+ "What baseline metric is most relevant?"
171
+ ]
172
  },
173
+ "🧠 Pre-trained Models": {
174
+ "explain_text": "**🧠 Pre-trained Models:** Can existing models be leveraged?",
175
+ "sub_decisions": [
176
+ "Which pre-trained models are relevant (image, NLP, tabular)?",
177
+ "Fine-tune or use as feature extractors?"
178
+ ]
179
  },
180
+ "⚑ Hyperparameters": {
181
+ "explain_text": "**⚑ Hyperparameters:** How to optimize model hyperparameters?",
182
+ "sub_decisions": [
183
+ "Grid search vs random search vs Bayesian?",
184
+ "How many trials and folds to run?",
185
+ "What budget or time limit applies?"
186
+ ]
187
  },
188
+ "πŸ“¦ Model Complexity": {
189
+ "explain_text": "**πŸ“¦ Model Complexity:** Is the model efficient enough for deployment?",
190
+ "sub_decisions": [
191
+ "How many parameters and FLOPs?",
192
+ "What is memory usage and latency?",
193
+ "Are there deployment constraints (edge vs cloud)?"
194
+ ]
195
  }
196
  },
197
 
198
+ "πŸ‹οΈ Model Training": {
199
+ "explain_text": "**πŸ‹οΈ Model Training:** Decisions about the training algorithm used.",
200
+ "βœ‚οΈ Data Splitting": {
201
+ "explain_text": "**βœ‚οΈ Data Splitting:** How should data be divided for training and testing?",
202
+ "sub_decisions": [
203
+ "Train-test split ratio?",
204
+ "Cross-validation vs stratified split?"
205
+ ]
206
  },
207
+ "βš–οΈ Loss Function": {
208
+ "explain_text": "**βš–οΈ Loss Function:** Which loss function aligns with the task?",
209
+ "sub_decisions": [
210
+ "MSE vs MAE vs cross-entropy?",
211
+ "Is robustness to outliers needed?",
212
+ "Does it align with evaluation metrics?"
213
+ ]
214
  },
215
+ "πŸš€ Optimization Method": {
216
+ "explain_text": "**πŸš€ Optimization Method:** Which optimization algorithm should be used?",
217
+ "sub_decisions": [
218
+ "SGD vs Adam vs RMSProp?",
219
+ "What learning rate schedule?",
220
+ "What batch size?"
221
+ ]
222
  },
223
+ "πŸ›‘οΈ Regularization": {
224
+ "explain_text": "**πŸ›‘οΈ Regularization:** How to prevent overfitting?",
225
+ "sub_decisions": [
226
+ "L1 vs L2 regularization?",
227
+ "Dropout rate?",
228
+ "Should early stopping be applied?"
229
+ ]
230
  },
231
+ "πŸ“Š Training Monitoring": {
232
+ "explain_text": "**πŸ“Š Training Monitoring:** How to track and manage training progress?",
233
+ "sub_decisions": [
234
+ "Which metrics should be monitored?",
235
+ "How often to checkpoint models?"
236
+ ]
237
  }
238
  },
239
 
240
+ "πŸ“ˆ Model Evaluation": {
241
+ "explain_text": "**πŸ“ˆ Model Evaluation:** Decisions about the evaluation criteria.",
242
+ "πŸ“ Evaluation Metric": {
243
+ "explain_text": "**πŸ“ Evaluation Metric:** Which metrics best reflect model performance?",
244
+ "sub_decisions": [
245
+ "Accuracy vs Precision/Recall/F1?",
246
+ "How to handle class imbalance?",
247
+ "Including probabilistic metrics (AUC, log loss)?"
248
+ ]
249
  },
250
+ "πŸ§ͺ Test Data": {
251
+ "explain_text": "**πŸ§ͺ Test Data:** How should testing be performed?",
252
+ "sub_decisions": [
253
+ "Hold-out set vs cross-validation?",
254
+ "An external test dataset?"
255
+ ]
256
  },
257
+ "βš–οΈ Fairness": {
258
+ "explain_text": "**βš–οΈ Fairness:** How to ensure fairness across groups?",
259
+ "sub_decisions": [
260
+ "Which fairness metric to use (demographic parity, equalized odds)?",
261
+ "How to detect bias in predictions?"
262
+ ]
263
  },
264
+ "πŸ› οΈ Robustness": {
265
+ "explain_text": "**πŸ› οΈ Robustness:** How reliable is the model under stress?",
266
+ "sub_decisions": [
267
+ "How does the model handle noisy inputs?",
268
+ "How to test against distribution shifts?"
269
+ ]
270
  },
271
+ "πŸ” Interpretability": {
272
+ "explain_text": "**πŸ” Interpretability:** How understandable are the model predictions?",
273
+ "sub_decisions": [
274
+ "Which methods to use (feature importance, SHAP, LIME)?",
275
+ "How stable are explanations?",
276
+ "Are explanations actionable for stakeholders?"
277
+ ]
278
  }
279
  }
280
+ }