prakharg24 committed on
Commit
ff5e7ef
·
verified ·
1 Parent(s): bb2cf6a

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +83 -153
utils.py CHANGED
@@ -68,192 +68,122 @@ def add_red_text(text_to_display):
68
  # Define pipeline stages
69
  pipeline_data = {
70
  "Data Collection": {
 
71
  "Data Sources": {
72
- "Identify public datasets": ["Where will you find them?", "Update frequency?", "Licensing constraints?"],
73
- "Acquire proprietary data": ["Who owns it?", "Access method?", "Cost/contract terms?"],
74
- "Integrate APIs": ["Which APIs?", "Rate limits?", "Auth method?"],
75
- "Crowdsourced collection": ["Which platform?", "Quality control?", "Incentive model?"],
76
- "Sensor/IoT data gathering": ["What hardware?", "Sampling rate?", "Data format?"],
77
  },
78
- "Data Licensing & Permissions": {
79
- "Check copyright status": ["Is it copyrighted?", "Fair use applicable?", "Geographic limits?"],
80
- "Review usage terms": ["Commercial use allowed?", "Redistribution permitted?", "Attribution required?"],
81
- "Obtain licenses": ["Cost and renewal?", "Scope of use?", "Termination clauses?"],
82
- "NDA agreements": ["Parties and duration?", "Scope of confidentiality?", "Breach penalties?"],
83
- "Open data validation": ["Truly open?", "Source reliability?", "Ethical concerns?"],
84
  },
85
- "Data Quality Checks": {
86
- "Missing value detection": ["% missing?", "MCAR/MAR/MNAR?", "Critical fields affected?"],
87
- "Duplicate detection": ["Exact vs fuzzy?", "Dedup strategy?", "Impact on metrics?"],
88
- "Noise assessment": ["Noise sources?", "Filtering options?", "Tolerance thresholds?"],
89
- "Format consistency": ["Types and units consistent?", "Datetime/encoding issues?", "Schema validation?"],
90
- "Data freshness review": ["Last update?", "Desired recency?", "Auto-refresh feasible?"],
91
  },
92
- "Data Volume Assessment": {
93
- "Sampling strategy": ["Random/stratified/cluster?", "Sample size?", "Bias risks?"],
94
- "Class balance check": ["Imbalance ratio?", "Oversample/undersample?", "Synthetic data?"],
95
- "Size estimation": ["Rows and file size?", "Memory needs?", "Compute bandwidth?"],
96
- "Incremental updates": ["Append vs merge?", "Versioning plan?", "Conflict handling?"],
97
- "Redundancy removal": ["Detect redundancy?", "Compression options?", "Archive policy?"],
98
  },
99
- "Data Storage Setup": {
100
- "Database schema design": ["Relational or NoSQL?", "Indexing strategy?", "Normalization level?"],
101
- "File format selection": ["CSV/Parquet/JSON?", "Compression?", "Interoperability?"],
102
- "Cloud storage choice": ["AWS/Azure/GCP?", "Cost model?", "Latency region?"],
103
- "Security setup": ["At-rest/in-transit encryption?", "Access control?", "Audit logging?"],
104
- "Backup policy": ["Frequency?", "Retention period?", "Restore testing?"],
105
  },
106
  },
107
 
108
- "Preprocessing": {
 
109
  "Data Cleaning": {
110
- "Handle missing values": ["Impute or drop?", "Method chosen?", "Impact analysis?"],
111
- "Remove duplicates": ["Detection method?", "Tie-breaking rule?", "Logging removals?"],
112
- "Fix formatting errors": ["Standardize types?", "Normalize text?", "Unit conversions?"],
113
- "Normalize text fields": ["Lowercasing/stemming?", "Stopwords?", "Unicode handling?"],
114
- "Remove special characters": ["Allowed charset?", "Regex rules?", "Downstream effects?"],
115
  },
116
  "Feature Selection": {
117
- "Manual selection": ["Domain criteria?", "Baseline subset?", "Rationale recorded?"],
118
- "Statistical selection": ["Correlation/ANOVA/chi²?", "Thresholds?", "Leakage checks?"],
119
- "Model-based selection": ["Which estimator?", "Importance cutoff?", "Stability across folds?"],
120
- "Dimensionality reduction": ["PCA/UMAP?", "Target leakage risk?", "Explained variance?"],
121
- "Domain expert input": ["Who signs off?", "Review cadence?", "Conflict resolution?"],
122
  },
123
  "Feature Engineering": {
124
- "Create new features": ["What transformations?", "Business meaning?", "Overfitting risk?"],
125
- "Combine existing features": ["Ratios/interactions?", "Collinearity?", "Scaling needs?"],
126
- "Polynomial features": ["Max degree?", "Sparsity management?", "Regularization plan?"],
127
- "Temporal features": ["Lags/rolling stats?", "Seasonality?", "Time zones?"],
128
- "Categorical encoding": ["One-hot/target/WOE?", "High-cardinality strategy?", "Leakage prevention?"],
129
  },
130
  "Outlier Handling": {
131
- "Z-score method": ["Threshold used?", "Per-group scaling?", "Robust alternatives?"],
132
- "IQR method": ["Multiplier (1.5/3)?", "Per-feature vs joint?", "Winsorize vs remove?"],
133
- "Winsorization": ["Clip bounds?", "Effect on metrics?", "Documented rationale?"],
134
- "Clustering-based removal": ["Which clustering?", "Distance cutoff?", "Class impact?"],
135
- "Manual inspection": ["Visualization used?", "Reviewer criteria?", "Reproducibility?"],
136
- },
137
- "Scaling & Transformation": {
138
- "Min-Max scaling": ["Range chosen?", "Fit on train only?", "Outlier sensitivity?"],
139
- "Standard scaling": ["Fit scope?", "Pipeline placement?", "Assumed distribution?"],
140
- "Log transformation": ["Which features?", "Shift for zeros?", "Interpretability?"],
141
- "Box-Cox transformation": ["Lambda search?", "Normality gain?", "Constraints?"],
142
- "Quantile transformation": ["Quantiles used?", "Monotonicity preserved?", "Generalization?"],
143
  },
 
 
 
 
144
  },
145
 
146
  "Model Selection": {
147
- "Algorithm Research": {
148
- "Linear models": ["Why suitable?", "Regularization choice?", "Feature assumptions?"],
149
- "Tree-based models": ["Depth/leaf constraints?", "Handling missing?", "Interpretability?"],
150
- "Neural networks": ["Architecture size?", "Training budget?", "Latency target?"],
151
- "Ensemble methods": ["Bagging/boosting/stacking?", "Diversity sources?", "Overfit control?"],
152
- "Probabilistic models": ["Distributional assumptions?", "Calibration needs?", "Uncertainty outputs?"],
153
- },
154
- "Baseline Model Creation": {
155
- "Simple logistic regression": ["Baseline metric?", "Class weighting?", "Regularization?"],
156
- "Decision stump": ["Split criterion?", "Benchmark purpose?", "Handling ties?"],
157
- "Dummy classifier": ["Most frequent/stratified?", "Expected score?", "Sanity check?"],
158
- "KNN baseline": ["K selection?", "Distance metric?", "Scaling requirement?"],
159
- "Majority class predictor": ["Imbalance insight?", "Floor performance?", "Usefulness?"],
160
- },
161
- "Pre-trained Model Exploration": {
162
- "Image models": ["Which backbone?", "Input size?", "Fine-tune vs freeze?"],
163
- "NLP models": ["Tokenizer/vocab?", "Sequence length?", "Adaptation method?"],
164
- "Speech models": ["Sampling rate?", "Feature front-end?", "WER target?"],
165
- "Tabular models": ["CatBoost/FT-Transformer?", "Categorical handling?", "GPU needs?"],
166
- "Multi-modal models": ["Fusion strategy?", "Alignment loss?", "Data requirements?"],
167
- },
168
- "Hyperparameter Strategy": {
169
- "Grid search": ["Search space size?", "CV folds?", "Budget/time limit?"],
170
- "Random search": ["Distributions?", "Trials planned?", "Early stopping?"],
171
- "Bayesian optimization": ["Surrogate model?", "Acquisition function?", "Parallelism?"],
172
- "Hyperband": ["Max resources?", "Reduction factor?", "Stochasticity handling?"],
173
- "Manual tuning": ["Heuristics?", "Logging decisions?", "Reproducibility?"],
174
- },
175
- "Model Complexity Assessment": {
176
- "Parameter count": ["Max allowed?", "Memory footprint?", "Compression options?"],
177
- "FLOPs estimation": ["Target platform?", "Latency budget?", "Batch size effects?"],
178
- "Memory usage": ["Peak RAM/VRAM?", "Streaming feasible?", "Quantization?"],
179
- "Inference latency": ["P50/P95 targets?", "Hardware assumptions?", "Batching strategy?"],
180
- "Deployment constraints": ["Edge vs cloud?", "Throughput goals?", "Cost ceiling?"],
181
- },
182
  },
183
 
184
- "Training": {
 
185
  "Data Splitting": {
186
- "Train-test split": ["Split ratio?", "Stratification?", "Random seed?"],
187
- "Cross-validation": ["K folds?", "Shuffle strategy?", "Leakage prevention?"],
188
- "Stratified split": ["Which strata?", "Min group size?", "Imbalance kept?"],
189
- "Time-series split": ["Gap/embargo?", "Horizon size?", "Leakage checks?"],
190
- "Nested CV": ["Outer/inner folds?", "Compute budget?", "Model selection rule?"],
191
  },
192
- "Loss Function Choice": {
193
- "MSE": ["Why MSE?", "Outlier sensitivity?", "Alternatives considered?"],
194
- "Cross-entropy": ["Label smoothing?", "Class weights?", "Numerical stability?"],
195
- "MAE": ["Robustness need?", "Optimization impact?", "Evaluation alignment?"],
196
- "Huber loss": ["Delta parameter?", "Outlier profile?", "Convergence behavior?"],
197
- "Custom loss": ["Definition and gradients?", "Calibration to metrics?", "Debugging plan?"],
198
  },
199
  "Optimization Method": {
200
- "SGD": ["Momentum/nesterov?", "Learning rate schedule?", "Batch size?"],
201
- "Adam": ["Beta values?", "Weight decay?", "Warmup?"],
202
- "RMSProp": ["Decay rate?", "Centered variant?", "Stability?"],
203
- "Adagrad": ["Learning rate decay?", "Sparsity benefits?", "Reset strategy?"],
204
- "L-BFGS": ["Batching approach?", "Memory limits?", "Convergence criteria?"],
205
  },
206
  "Regularization": {
207
- "L1": ["Sparsity goal?", "Lambda value?", "Feature pruning?"],
208
- "L2": ["Weight decay?", "Overfit control?", "Interaction with optimizer?"],
209
- "Dropout": ["Rates per layer?", "Inference behavior?", "Co-adaptation risk?"],
210
- "Data augmentation": ["Which transforms?", "Label preservation?", "Distribution shift?"],
211
- "Early stopping": ["Patience metric?", "Min delta?", "Checkpoint policy?"],
212
  },
213
  "Training Monitoring": {
214
- "Loss curves": ["Smoothing?", "Train/val gap?", "Anomaly alerts?"],
215
- "Accuracy curves": ["Metric tracked?", "Class-wise trends?", "Plateau detection?"],
216
- "Validation metrics": ["Primary KPI?", "Reporting cadence?", "Confidence intervals?"],
217
- "Learning rate schedule": ["Schedule type?", "Boundaries?", "Warm restarts?"],
218
- "Checkpointing": ["Frequency?", "Best-vs-last?", "Storage budget?"],
219
- },
220
  },
221
 
222
- "Evaluation": {
223
- "Metric Selection": {
224
- "Accuracy": ["Is class balance fair?", "Threshold chosen?", "Business relevance?"],
225
- "Precision/Recall/F1": ["Which is primary?", "Threshold tuning?", "Cost of errors?"],
226
- "ROC AUC": ["Calibration issues?", "Class imbalance?", "Interpretation limits?"],
227
- "Log loss": ["Probability quality?", "Overconfidence penalty?", "Label noise?"],
228
- "MSE/RMSE": ["Scale sensitivity?", "Baseline comparison?", "Outlier impact?"],
229
  },
230
- "Test Data Strategy": {
231
- "Hold-out set": ["Size and representativeness?", "Temporal leakage?", "Reuse policy?"],
232
- "External dataset": ["Domain match?", "License/ethics?", "Reproducibility?"],
233
- "Cross-validation results": ["Variance across folds?", "Confidence bands?", "Selection bias?"],
234
- "Leave-one-out": ["Compute cost?", "Variance concerns?", "Use case fit?"],
235
- "Bootstrapping": ["Resample size?", "CI method?", "Stability?"],
236
  },
237
- "Fairness Checks": {
238
- "Demographic parity": ["Protected attributes?", "Gap tolerated?", "Mitigation plan?"],
239
- "Equalized odds": ["TPR/FPR parity?", "Group definitions?", "Trade-offs?"],
240
- "Calibration across groups": ["Expected vs observed?", "Bins and sizes?", "Recalibration?"],
241
- "Bias detection": ["Pre/post metrics?", "Data imbalance role?", "Human review?"],
242
- "Ethical review": ["Stakeholder impact?", "Transparency level?", "Documentation?"],
243
  },
244
- "Robustness Testing": {
245
- "Noisy input tests": ["Noise model?", "Degradation curve?", "Defenses?"],
246
- "Adversarial attacks": ["Threat model?", "Attack types?", "Detection/robustness?"],
247
- "Stress tests": ["Extreme values?", "Load/latency?", "Resource limits?"],
248
- "Distribution shift": ["Which shifts?", "Detection method?", "Adaptation strategy?"],
249
- "Random perturbations": ["Perturbation scale?", "Repeatability?", "Metric sensitivity?"],
250
  },
251
- "Model Interpretability": {
252
- "Feature importance": ["Method used?", "Stability across runs?", "Correlated features?"],
253
- "SHAP values": ["Background data?", "Runtime cost?", "Global vs local?"],
254
- "LIME explanations": ["Kernel width?", "Neighborhood size?", "Faithfulness?"],
255
- "Partial dependence plots": ["Feature interactions?", "ICE vs PDP?", "Monotonicity?"],
256
- "Counterfactual explanations": ["Feasible actions?", "Cost function?", "Recourse policy?"],
257
- },
258
- },
259
  }
 
68
# Define pipeline stages.
#
# Structure: {stage_name: {"explain_text": str, sub_category_name: {"explain_text": str,
# "sub_decisions": [str, ...]}, ...}, ...}. Each top-level stage carries its own
# "explain_text" alongside its sub-category dicts, so consumers iterating a stage's
# keys should skip (or special-case) the "explain_text" entry.
#
# FIX: the committed version omitted the comma after every stage-level "explain_text"
# value; Python then implicitly concatenated that string with the following key string,
# leaving a stray ':' and making the module fail to parse (SyntaxError). Commas added.
# Also fixed a grammar typo in a user-facing string ("critical field" -> "critical fields").
pipeline_data = {
    "Data Collection": {
        "explain_text": "**Data Collection:** Decisions about what data to collect and how.",
        "Data Sources": {
            "explain_text": "**Data Sources:** What data sources will be used to collect data?",
            "sub_decisions": ["Collect existing dataset or new sensor data?", "Public datasets or Private datasets?", "Design Web Scraping or use APIs?"],
        },
        "Data Usage": {
            "explain_text": "**Data Usage:** How should the data be used, given any license or permission constraints?",
            "sub_decisions": ["Ethical concerns to be addressed?", "Commercial use policies?", "Geographic limits?"],
        },
        "Data Quality": {
            "explain_text": "**Data Quality:** What kind of quality checks are done to decide data collection?",
            "sub_decisions": ["Missing value checks to see if critical fields are affected?", "Potential duplicates?", "Format consistency and encoding issues?"],
        },
        "Data Sampling": {
            "explain_text": "**Data Sampling:** How to sample from a potentially bigger data source?",
            "sub_decisions": ["Random sampling/stratified sampling/cluster sampling?", "Sample size?", "Potential imbalance?", "Additional synthetic data?"],
        },
        "Data Storage": {
            "explain_text": "**Data Storage:** How and where to store the data?",
            "sub_decisions": ["Backup frequency?", "File format choice?"],
        },
    },

    "Data Processing": {
        "explain_text": "**Data Processing:** Decisions about how to process and prepare the data.",
        "Data Cleaning": {
            "explain_text": "**Data Cleaning:** How should raw data be cleaned and standardized?",
            "sub_decisions": ["How to handle missing values?", "How to detect/remove duplicates?", "How to fix formatting errors?"],
        },
        "Feature Selection": {
            "explain_text": "**Feature Selection:** Which features should be included in the model?",
            "sub_decisions": ["Manual vs automated selection?", "How to check for data leakage?", "Should dimensionality reduction be applied?"],
        },
        "Feature Engineering": {
            "explain_text": "**Feature Engineering:** How to create or transform features for better performance?",
            "sub_decisions": ["What new features should be created?", "How to combine existing features?", "How to encode categorical variables?"],
        },
        "Outlier Handling": {
            "explain_text": "**Outlier Handling:** How to deal with unusual or extreme data points?",
            "sub_decisions": ["Which detection method to use (Z-score, IQR, clustering)?", "Remove, cap, or keep outliers?"],
        },
        "Data Scaling": {
            "explain_text": "**Data Scaling:** How to scale or transform features before modeling?",
            "sub_decisions": ["Should Min-Max or Standard scaling be applied?", "Is log or Box-Cox transformation needed?"],
        },
    },

    "Model Selection": {
        "explain_text": "**Model Selection:** Decisions about which model to train and the hyperparameter choices.",
        "Model Architecture": {
            "explain_text": "**Model Architecture:** Which type of model is best suited to the problem?",
            "sub_decisions": ["Linear vs tree-based vs neural networks?", "How interpretable should the model be?", "What are computational constraints?"],
        },
        "Baseline Model": {
            "explain_text": "**Baseline Model:** What simple models can set a performance baseline?",
            "sub_decisions": ["Should a logistic regression or decision tree be used?", "What baseline metric is most relevant?"],
        },
        "Pre-trained Models": {
            "explain_text": "**Pre-trained Models:** Can existing models be leveraged?",
            "sub_decisions": ["Which pre-trained models are relevant (image, NLP, tabular)?", "Fine-tune or use as feature extractors?"],
        },
        "Hyperparameters": {
            "explain_text": "**Hyperparameters:** How to optimize model hyperparameters?",
            "sub_decisions": ["Grid search vs random search vs Bayesian?", "How many trials and folds to run?", "What budget or time limit applies?"],
        },
        "Model Complexity": {
            "explain_text": "**Model Complexity:** Is the model efficient enough for deployment?",
            "sub_decisions": ["How many parameters and FLOPs?", "What is memory usage and latency?", "Are there deployment constraints (edge vs cloud)?"],
        },
    },

    "Model Training": {
        "explain_text": "**Model Training:** Decisions about the training algorithm used.",
        "Data Splitting": {
            "explain_text": "**Data Splitting:** How should data be divided for training and testing?",
            "sub_decisions": ["Train-test split ratio?", "Cross-validation vs stratified split?"],
        },
        "Loss Function": {
            "explain_text": "**Loss Function:** Which loss function aligns with the task?",
            "sub_decisions": ["MSE vs MAE vs cross-entropy?", "Is robustness to outliers needed?", "Does it align with evaluation metrics?"],
        },
        "Optimization Method": {
            "explain_text": "**Optimization Method:** Which optimization algorithm should be used?",
            "sub_decisions": ["SGD vs Adam vs RMSProp?", "What learning rate schedule?", "What batch size?"],
        },
        "Regularization": {
            "explain_text": "**Regularization:** How to prevent overfitting?",
            "sub_decisions": ["L1 vs L2 regularization?", "Dropout rate?", "Should early stopping be applied?"],
        },
        "Training Monitoring": {
            "explain_text": "**Training Monitoring:** How to track and manage training progress?",
            "sub_decisions": ["Which metrics should be monitored?", "How often to checkpoint models?"],
        },
    },

    "Model Evaluation": {
        "explain_text": "**Model Evaluation:** Decisions about the evaluation criteria.",
        "Evaluation Metric": {
            "explain_text": "**Evaluation Metric:** Which metrics best reflect model performance?",
            "sub_decisions": ["Accuracy vs Precision/Recall/F1?", "How to handle class imbalance?", "Including probabilistic metrics (AUC, log loss)?"],
        },
        "Test Data": {
            "explain_text": "**Test Data:** How should testing be performed?",
            "sub_decisions": ["Hold-out set vs cross-validation?", "An external test dataset?"],
        },
        "Fairness": {
            "explain_text": "**Fairness:** How to ensure fairness across groups?",
            "sub_decisions": ["Which fairness metric to use (demographic parity, equalized odds)?", "How to detect bias in predictions?"],
        },
        "Robustness": {
            "explain_text": "**Robustness:** How reliable is the model under stress?",
            "sub_decisions": ["How does the model handle noisy inputs?", "How to test against distribution shifts?"],
        },
        "Interpretability": {
            "explain_text": "**Interpretability:** How understandable are the model predictions?",
            "sub_decisions": ["Which methods to use (feature importance, SHAP, LIME)?", "How stable are explanations?", "Are explanations actionable for stakeholders?"],
        },
    },
}