prakharg24 committed on
Commit
ff5e7ef
·
verified ·
1 Parent(s): bb2cf6a

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +83 -153
utils.py CHANGED
@@ -68,192 +68,122 @@ def add_red_text(text_to_display):
68
  # Define pipeline stages
69
  pipeline_data = {
70
  "Data Collection": {
 
71
  "Data Sources": {
72
- "Identify public datasets": ["Where will you find them?", "Update frequency?", "Licensing constraints?"],
73
- "Acquire proprietary data": ["Who owns it?", "Access method?", "Cost/contract terms?"],
74
- "Integrate APIs": ["Which APIs?", "Rate limits?", "Auth method?"],
75
- "Crowdsourced collection": ["Which platform?", "Quality control?", "Incentive model?"],
76
- "Sensor/IoT data gathering": ["What hardware?", "Sampling rate?", "Data format?"],
77
  },
78
- "Data Licensing & Permissions": {
79
- "Check copyright status": ["Is it copyrighted?", "Fair use applicable?", "Geographic limits?"],
80
- "Review usage terms": ["Commercial use allowed?", "Redistribution permitted?", "Attribution required?"],
81
- "Obtain licenses": ["Cost and renewal?", "Scope of use?", "Termination clauses?"],
82
- "NDA agreements": ["Parties and duration?", "Scope of confidentiality?", "Breach penalties?"],
83
- "Open data validation": ["Truly open?", "Source reliability?", "Ethical concerns?"],
84
  },
85
- "Data Quality Checks": {
86
- "Missing value detection": ["% missing?", "MCAR/MAR/MNAR?", "Critical fields affected?"],
87
- "Duplicate detection": ["Exact vs fuzzy?", "Dedup strategy?", "Impact on metrics?"],
88
- "Noise assessment": ["Noise sources?", "Filtering options?", "Tolerance thresholds?"],
89
- "Format consistency": ["Types and units consistent?", "Datetime/encoding issues?", "Schema validation?"],
90
- "Data freshness review": ["Last update?", "Desired recency?", "Auto-refresh feasible?"],
91
  },
92
- "Data Volume Assessment": {
93
- "Sampling strategy": ["Random/stratified/cluster?", "Sample size?", "Bias risks?"],
94
- "Class balance check": ["Imbalance ratio?", "Oversample/undersample?", "Synthetic data?"],
95
- "Size estimation": ["Rows and file size?", "Memory needs?", "Compute bandwidth?"],
96
- "Incremental updates": ["Append vs merge?", "Versioning plan?", "Conflict handling?"],
97
- "Redundancy removal": ["Detect redundancy?", "Compression options?", "Archive policy?"],
98
  },
99
- "Data Storage Setup": {
100
- "Database schema design": ["Relational or NoSQL?", "Indexing strategy?", "Normalization level?"],
101
- "File format selection": ["CSV/Parquet/JSON?", "Compression?", "Interoperability?"],
102
- "Cloud storage choice": ["AWS/Azure/GCP?", "Cost model?", "Latency region?"],
103
- "Security setup": ["At-rest/in-transit encryption?", "Access control?", "Audit logging?"],
104
- "Backup policy": ["Frequency?", "Retention period?", "Restore testing?"],
105
  },
106
  },
107
 
108
- "Preprocessing": {
 
109
  "Data Cleaning": {
110
- "Handle missing values": ["Impute or drop?", "Method chosen?", "Impact analysis?"],
111
- "Remove duplicates": ["Detection method?", "Tie-breaking rule?", "Logging removals?"],
112
- "Fix formatting errors": ["Standardize types?", "Normalize text?", "Unit conversions?"],
113
- "Normalize text fields": ["Lowercasing/stemming?", "Stopwords?", "Unicode handling?"],
114
- "Remove special characters": ["Allowed charset?", "Regex rules?", "Downstream effects?"],
115
  },
116
  "Feature Selection": {
117
- "Manual selection": ["Domain criteria?", "Baseline subset?", "Rationale recorded?"],
118
- "Statistical selection": ["Correlation/ANOVA/chi²?", "Thresholds?", "Leakage checks?"],
119
- "Model-based selection": ["Which estimator?", "Importance cutoff?", "Stability across folds?"],
120
- "Dimensionality reduction": ["PCA/UMAP?", "Target leakage risk?", "Explained variance?"],
121
- "Domain expert input": ["Who signs off?", "Review cadence?", "Conflict resolution?"],
122
  },
123
  "Feature Engineering": {
124
- "Create new features": ["What transformations?", "Business meaning?", "Overfitting risk?"],
125
- "Combine existing features": ["Ratios/interactions?", "Collinearity?", "Scaling needs?"],
126
- "Polynomial features": ["Max degree?", "Sparsity management?", "Regularization plan?"],
127
- "Temporal features": ["Lags/rolling stats?", "Seasonality?", "Time zones?"],
128
- "Categorical encoding": ["One-hot/target/WOE?", "High-cardinality strategy?", "Leakage prevention?"],
129
  },
130
  "Outlier Handling": {
131
- "Z-score method": ["Threshold used?", "Per-group scaling?", "Robust alternatives?"],
132
- "IQR method": ["Multiplier (1.5/3)?", "Per-feature vs joint?", "Winsorize vs remove?"],
133
- "Winsorization": ["Clip bounds?", "Effect on metrics?", "Documented rationale?"],
134
- "Clustering-based removal": ["Which clustering?", "Distance cutoff?", "Class impact?"],
135
- "Manual inspection": ["Visualization used?", "Reviewer criteria?", "Reproducibility?"],
136
- },
137
- "Scaling & Transformation": {
138
- "Min-Max scaling": ["Range chosen?", "Fit on train only?", "Outlier sensitivity?"],
139
- "Standard scaling": ["Fit scope?", "Pipeline placement?", "Assumed distribution?"],
140
- "Log transformation": ["Which features?", "Shift for zeros?", "Interpretability?"],
141
- "Box-Cox transformation": ["Lambda search?", "Normality gain?", "Constraints?"],
142
- "Quantile transformation": ["Quantiles used?", "Monotonicity preserved?", "Generalization?"],
143
  },
 
 
 
 
144
  },
145
 
146
  "Model Selection": {
147
- "Algorithm Research": {
148
- "Linear models": ["Why suitable?", "Regularization choice?", "Feature assumptions?"],
149
- "Tree-based models": ["Depth/leaf constraints?", "Handling missing?", "Interpretability?"],
150
- "Neural networks": ["Architecture size?", "Training budget?", "Latency target?"],
151
- "Ensemble methods": ["Bagging/boosting/stacking?", "Diversity sources?", "Overfit control?"],
152
- "Probabilistic models": ["Distributional assumptions?", "Calibration needs?", "Uncertainty outputs?"],
153
- },
154
- "Baseline Model Creation": {
155
- "Simple logistic regression": ["Baseline metric?", "Class weighting?", "Regularization?"],
156
- "Decision stump": ["Split criterion?", "Benchmark purpose?", "Handling ties?"],
157
- "Dummy classifier": ["Most frequent/stratified?", "Expected score?", "Sanity check?"],
158
- "KNN baseline": ["K selection?", "Distance metric?", "Scaling requirement?"],
159
- "Majority class predictor": ["Imbalance insight?", "Floor performance?", "Usefulness?"],
160
- },
161
- "Pre-trained Model Exploration": {
162
- "Image models": ["Which backbone?", "Input size?", "Fine-tune vs freeze?"],
163
- "NLP models": ["Tokenizer/vocab?", "Sequence length?", "Adaptation method?"],
164
- "Speech models": ["Sampling rate?", "Feature front-end?", "WER target?"],
165
- "Tabular models": ["CatBoost/FT-Transformer?", "Categorical handling?", "GPU needs?"],
166
- "Multi-modal models": ["Fusion strategy?", "Alignment loss?", "Data requirements?"],
167
- },
168
- "Hyperparameter Strategy": {
169
- "Grid search": ["Search space size?", "CV folds?", "Budget/time limit?"],
170
- "Random search": ["Distributions?", "Trials planned?", "Early stopping?"],
171
- "Bayesian optimization": ["Surrogate model?", "Acquisition function?", "Parallelism?"],
172
- "Hyperband": ["Max resources?", "Reduction factor?", "Stochasticity handling?"],
173
- "Manual tuning": ["Heuristics?", "Logging decisions?", "Reproducibility?"],
174
- },
175
- "Model Complexity Assessment": {
176
- "Parameter count": ["Max allowed?", "Memory footprint?", "Compression options?"],
177
- "FLOPs estimation": ["Target platform?", "Latency budget?", "Batch size effects?"],
178
- "Memory usage": ["Peak RAM/VRAM?", "Streaming feasible?", "Quantization?"],
179
- "Inference latency": ["P50/P95 targets?", "Hardware assumptions?", "Batching strategy?"],
180
- "Deployment constraints": ["Edge vs cloud?", "Throughput goals?", "Cost ceiling?"],
181
- },
182
  },
183
 
184
- "Training": {
 
185
  "Data Splitting": {
186
- "Train-test split": ["Split ratio?", "Stratification?", "Random seed?"],
187
- "Cross-validation": ["K folds?", "Shuffle strategy?", "Leakage prevention?"],
188
- "Stratified split": ["Which strata?", "Min group size?", "Imbalance kept?"],
189
- "Time-series split": ["Gap/embargo?", "Horizon size?", "Leakage checks?"],
190
- "Nested CV": ["Outer/inner folds?", "Compute budget?", "Model selection rule?"],
191
  },
192
- "Loss Function Choice": {
193
- "MSE": ["Why MSE?", "Outlier sensitivity?", "Alternatives considered?"],
194
- "Cross-entropy": ["Label smoothing?", "Class weights?", "Numerical stability?"],
195
- "MAE": ["Robustness need?", "Optimization impact?", "Evaluation alignment?"],
196
- "Huber loss": ["Delta parameter?", "Outlier profile?", "Convergence behavior?"],
197
- "Custom loss": ["Definition and gradients?", "Calibration to metrics?", "Debugging plan?"],
198
  },
199
  "Optimization Method": {
200
- "SGD": ["Momentum/nesterov?", "Learning rate schedule?", "Batch size?"],
201
- "Adam": ["Beta values?", "Weight decay?", "Warmup?"],
202
- "RMSProp": ["Decay rate?", "Centered variant?", "Stability?"],
203
- "Adagrad": ["Learning rate decay?", "Sparsity benefits?", "Reset strategy?"],
204
- "L-BFGS": ["Batching approach?", "Memory limits?", "Convergence criteria?"],
205
  },
206
  "Regularization": {
207
- "L1": ["Sparsity goal?", "Lambda value?", "Feature pruning?"],
208
- "L2": ["Weight decay?", "Overfit control?", "Interaction with optimizer?"],
209
- "Dropout": ["Rates per layer?", "Inference behavior?", "Co-adaptation risk?"],
210
- "Data augmentation": ["Which transforms?", "Label preservation?", "Distribution shift?"],
211
- "Early stopping": ["Patience metric?", "Min delta?", "Checkpoint policy?"],
212
  },
213
  "Training Monitoring": {
214
- "Loss curves": ["Smoothing?", "Train/val gap?", "Anomaly alerts?"],
215
- "Accuracy curves": ["Metric tracked?", "Class-wise trends?", "Plateau detection?"],
216
- "Validation metrics": ["Primary KPI?", "Reporting cadence?", "Confidence intervals?"],
217
- "Learning rate schedule": ["Schedule type?", "Boundaries?", "Warm restarts?"],
218
- "Checkpointing": ["Frequency?", "Best-vs-last?", "Storage budget?"],
219
- },
220
  },
221
 
222
- "Evaluation": {
223
- "Metric Selection": {
224
- "Accuracy": ["Is class balance fair?", "Threshold chosen?", "Business relevance?"],
225
- "Precision/Recall/F1": ["Which is primary?", "Threshold tuning?", "Cost of errors?"],
226
- "ROC AUC": ["Calibration issues?", "Class imbalance?", "Interpretation limits?"],
227
- "Log loss": ["Probability quality?", "Overconfidence penalty?", "Label noise?"],
228
- "MSE/RMSE": ["Scale sensitivity?", "Baseline comparison?", "Outlier impact?"],
229
  },
230
- "Test Data Strategy": {
231
- "Hold-out set": ["Size and representativeness?", "Temporal leakage?", "Reuse policy?"],
232
- "External dataset": ["Domain match?", "License/ethics?", "Reproducibility?"],
233
- "Cross-validation results": ["Variance across folds?", "Confidence bands?", "Selection bias?"],
234
- "Leave-one-out": ["Compute cost?", "Variance concerns?", "Use case fit?"],
235
- "Bootstrapping": ["Resample size?", "CI method?", "Stability?"],
236
  },
237
- "Fairness Checks": {
238
- "Demographic parity": ["Protected attributes?", "Gap tolerated?", "Mitigation plan?"],
239
- "Equalized odds": ["TPR/FPR parity?", "Group definitions?", "Trade-offs?"],
240
- "Calibration across groups": ["Expected vs observed?", "Bins and sizes?", "Recalibration?"],
241
- "Bias detection": ["Pre/post metrics?", "Data imbalance role?", "Human review?"],
242
- "Ethical review": ["Stakeholder impact?", "Transparency level?", "Documentation?"],
243
  },
244
- "Robustness Testing": {
245
- "Noisy input tests": ["Noise model?", "Degradation curve?", "Defenses?"],
246
- "Adversarial attacks": ["Threat model?", "Attack types?", "Detection/robustness?"],
247
- "Stress tests": ["Extreme values?", "Load/latency?", "Resource limits?"],
248
- "Distribution shift": ["Which shifts?", "Detection method?", "Adaptation strategy?"],
249
- "Random perturbations": ["Perturbation scale?", "Repeatability?", "Metric sensitivity?"],
250
  },
251
- "Model Interpretability": {
252
- "Feature importance": ["Method used?", "Stability across runs?", "Correlated features?"],
253
- "SHAP values": ["Background data?", "Runtime cost?", "Global vs local?"],
254
- "LIME explanations": ["Kernel width?", "Neighborhood size?", "Faithfulness?"],
255
- "Partial dependence plots": ["Feature interactions?", "ICE vs PDP?", "Monotonicity?"],
256
- "Counterfactual explanations": ["Feasible actions?", "Cost function?", "Recourse policy?"],
257
- },
258
- },
259
  }
 
68
# Define pipeline stages.
#
# Structure: {stage_name: {"explain_text": str, sub_category_name: {"explain_text": str,
# "sub_decisions": [str, ...]}, ...}, ...}. Each top-level stage carries its own
# "explain_text" alongside its sub-category dicts, so consumers iterating a stage's
# keys should skip (or special-case) the "explain_text" entry.
#
# FIX: the committed version omitted the comma after every stage-level "explain_text"
# value; Python then implicitly concatenated that string with the following key string,
# leaving a stray ':' and making the module fail to parse (SyntaxError). Commas added.
# Also fixed a grammar typo in a user-facing string ("critical field" -> "critical fields").
pipeline_data = {
    "Data Collection": {
        "explain_text": "**Data Collection:** Decisions about what data to collect and how.",
        "Data Sources": {
            "explain_text": "**Data Sources:** What data sources will be used to collect data?",
            "sub_decisions": ["Collect existing dataset or new sensor data?", "Public datasets or Private datasets?", "Design Web Scraping or use APIs?"],
        },
        "Data Usage": {
            "explain_text": "**Data Usage:** How should the data be used, given any license or permission constraints?",
            "sub_decisions": ["Ethical concerns to be addressed?", "Commercial use policies?", "Geographic limits?"],
        },
        "Data Quality": {
            "explain_text": "**Data Quality:** What kind of quality checks are done to decide data collection?",
            "sub_decisions": ["Missing value checks to see if critical fields are affected?", "Potential duplicates?", "Format consistency and encoding issues?"],
        },
        "Data Sampling": {
            "explain_text": "**Data Sampling:** How to sample from a potentially bigger data source?",
            "sub_decisions": ["Random sampling/stratified sampling/cluster sampling?", "Sample size?", "Potential imbalance?", "Additional synthetic data?"],
        },
        "Data Storage": {
            "explain_text": "**Data Storage:** How and where to store the data?",
            "sub_decisions": ["Backup frequency?", "File format choice?"],
        },
    },

    "Data Processing": {
        "explain_text": "**Data Processing:** Decisions about how to process and prepare the data.",
        "Data Cleaning": {
            "explain_text": "**Data Cleaning:** How should raw data be cleaned and standardized?",
            "sub_decisions": ["How to handle missing values?", "How to detect/remove duplicates?", "How to fix formatting errors?"],
        },
        "Feature Selection": {
            "explain_text": "**Feature Selection:** Which features should be included in the model?",
            "sub_decisions": ["Manual vs automated selection?", "How to check for data leakage?", "Should dimensionality reduction be applied?"],
        },
        "Feature Engineering": {
            "explain_text": "**Feature Engineering:** How to create or transform features for better performance?",
            "sub_decisions": ["What new features should be created?", "How to combine existing features?", "How to encode categorical variables?"],
        },
        "Outlier Handling": {
            "explain_text": "**Outlier Handling:** How to deal with unusual or extreme data points?",
            "sub_decisions": ["Which detection method to use (Z-score, IQR, clustering)?", "Remove, cap, or keep outliers?"],
        },
        "Data Scaling": {
            "explain_text": "**Data Scaling:** How to scale or transform features before modeling?",
            "sub_decisions": ["Should Min-Max or Standard scaling be applied?", "Is log or Box-Cox transformation needed?"],
        },
    },

    "Model Selection": {
        "explain_text": "**Model Selection:** Decisions about which model to train and the hyperparameter choices.",
        "Model Architecture": {
            "explain_text": "**Model Architecture:** Which type of model is best suited to the problem?",
            "sub_decisions": ["Linear vs tree-based vs neural networks?", "How interpretable should the model be?", "What are computational constraints?"],
        },
        "Baseline Model": {
            "explain_text": "**Baseline Model:** What simple models can set a performance baseline?",
            "sub_decisions": ["Should a logistic regression or decision tree be used?", "What baseline metric is most relevant?"],
        },
        "Pre-trained Models": {
            "explain_text": "**Pre-trained Models:** Can existing models be leveraged?",
            "sub_decisions": ["Which pre-trained models are relevant (image, NLP, tabular)?", "Fine-tune or use as feature extractors?"],
        },
        "Hyperparameters": {
            "explain_text": "**Hyperparameters:** How to optimize model hyperparameters?",
            "sub_decisions": ["Grid search vs random search vs Bayesian?", "How many trials and folds to run?", "What budget or time limit applies?"],
        },
        "Model Complexity": {
            "explain_text": "**Model Complexity:** Is the model efficient enough for deployment?",
            "sub_decisions": ["How many parameters and FLOPs?", "What is memory usage and latency?", "Are there deployment constraints (edge vs cloud)?"],
        },
    },

    "Model Training": {
        "explain_text": "**Model Training:** Decisions about the training algorithm used.",
        "Data Splitting": {
            "explain_text": "**Data Splitting:** How should data be divided for training and testing?",
            "sub_decisions": ["Train-test split ratio?", "Cross-validation vs stratified split?"],
        },
        "Loss Function": {
            "explain_text": "**Loss Function:** Which loss function aligns with the task?",
            "sub_decisions": ["MSE vs MAE vs cross-entropy?", "Is robustness to outliers needed?", "Does it align with evaluation metrics?"],
        },
        "Optimization Method": {
            "explain_text": "**Optimization Method:** Which optimization algorithm should be used?",
            "sub_decisions": ["SGD vs Adam vs RMSProp?", "What learning rate schedule?", "What batch size?"],
        },
        "Regularization": {
            "explain_text": "**Regularization:** How to prevent overfitting?",
            "sub_decisions": ["L1 vs L2 regularization?", "Dropout rate?", "Should early stopping be applied?"],
        },
        "Training Monitoring": {
            "explain_text": "**Training Monitoring:** How to track and manage training progress?",
            "sub_decisions": ["Which metrics should be monitored?", "How often to checkpoint models?"],
        },
    },

    "Model Evaluation": {
        "explain_text": "**Model Evaluation:** Decisions about the evaluation criteria.",
        "Evaluation Metric": {
            "explain_text": "**Evaluation Metric:** Which metrics best reflect model performance?",
            "sub_decisions": ["Accuracy vs Precision/Recall/F1?", "How to handle class imbalance?", "Including probabilistic metrics (AUC, log loss)?"],
        },
        "Test Data": {
            "explain_text": "**Test Data:** How should testing be performed?",
            "sub_decisions": ["Hold-out set vs cross-validation?", "An external test dataset?"],
        },
        "Fairness": {
            "explain_text": "**Fairness:** How to ensure fairness across groups?",
            "sub_decisions": ["Which fairness metric to use (demographic parity, equalized odds)?", "How to detect bias in predictions?"],
        },
        "Robustness": {
            "explain_text": "**Robustness:** How reliable is the model under stress?",
            "sub_decisions": ["How does the model handle noisy inputs?", "How to test against distribution shifts?"],
        },
        "Interpretability": {
            "explain_text": "**Interpretability:** How understandable are the model predictions?",
            "sub_decisions": ["Which methods to use (feature importance, SHAP, LIME)?", "How stable are explanations?", "Are explanations actionable for stakeholders?"],
        },
    },
}