prakharg24 commited on
Commit
95bf9d2
·
verified ·
1 Parent(s): bfc5666

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +177 -86
utils.py CHANGED
@@ -67,123 +67,214 @@ def add_red_text(text_to_display):
67
 
68
  # Define pipeline stages
69
  pipeline_data = {
70
- "Data Collection": {
71
- "explain_text": "**Data Collection:** Decisions about what data to collect and how.",
72
- "Data Sources": {
73
- "explain_text": "**Data Sources:** What data sources will be used to collect data?",
74
- "sub_decisions": ["Collect existing dataset or new sensor data?", "Public datasets or Private datasets?", "Design Web Scraping or use APIs?"]
 
 
 
 
75
  },
76
- "Data Usage": {
77
- "explain_text": "**Data Usage:** How should the data be used, given any license or permission constraints?",
78
- "sub_decisions": ["Ethical concerns to be addressed?", "Commercial use policies?", "Geographic limits?"]
 
 
 
 
79
  },
80
- "Data Quality": {
81
- "explain_text": "**Data Quality:** What kind of quality checks are done to decide data collection?",
82
- "sub_decisions": ["Missing value checks to see if critical field are affected?", "Potential duplicates?", "Format consistency and encoding issues?"]
 
 
 
 
83
  },
84
- "Data Sampling": {
85
- "explain_text": "**Data Sampling:** How to sample from a potentially bigger data source?",
86
- "sub_decisions": ["Random sampling/stratified sampling/cluster sampling?", "Sample size?", "Potential imbalance?", "Additional synthetic data?"]
 
 
 
 
 
87
  },
88
- "Data Storage": {
89
- "explain_text": "**Data Storage:** How and where to store the data?",
90
- "sub_decisions": ["Backup frequency?", "File format choice?"]
 
 
 
91
  },
92
  },
93
 
94
- "Data Processing": {
95
- "explain_text": "**Data Processing:** Decisions about how to process and prepare the data.",
96
- "Data Cleaning": {
97
- "explain_text": "**Data Cleaning:** How should raw data be cleaned and standardized?",
98
- "sub_decisions": ["How to handle missing values?", "How to detect/remove duplicates?", "How to fix formatting errors?"]
 
 
 
 
99
  },
100
- "Feature Selection": {
101
- "explain_text": "**Feature Selection:** Which features should be included in the model?",
102
- "sub_decisions": ["Manual vs automated selection?", "How to check for data leakage?", "Should dimensionality reduction be applied?"]
 
 
 
 
103
  },
104
- "Feature Engineering": {
105
- "explain_text": "**Feature Engineering:** How to create or transform features for better performance?",
106
- "sub_decisions": ["What new features should be created?", "How to combine existing features?", "How to encode categorical variables?"]
 
 
 
 
107
  },
108
- "Outlier Handling": {
109
- "explain_text": "**Outlier Handling:** How to deal with unusual or extreme data points?",
110
- "sub_decisions": ["Which detection method to use (Z-score, IQR, clustering)?", "Remove, cap, or keep outliers?"]
 
 
 
111
  },
112
- "Data Scaling": {
113
- "explain_text": "**Data Scaling:** How to scale or transform features before modeling?",
114
- "sub_decisions": ["Should Min-Max or Standard scaling be applied?", "Is log or Box-Cox transformation needed?"]
 
 
 
115
  }
116
  },
117
 
118
- "Model Selection": {
119
- "explain_text": "**Model Selection:** Decisions about which model to train and the hyperparameter choices.",
120
- "Model Architecture": {
121
- "explain_text": "**Model Architecture:** Which type of model is best suited to the problem?",
122
- "sub_decisions": ["Linear vs tree-based vs neural networks?", "How interpretable should the model be?", "What are computational constraints?"]
 
 
 
 
123
  },
124
- "Baseline Model": {
125
- "explain_text": "**Baseline Model:** What simple models can set a performance baseline?",
126
- "sub_decisions": ["Should a logistic regression or decision tree be used?", "What baseline metric is most relevant?"]
 
 
 
127
  },
128
- "Pre-trained Models": {
129
- "explain_text": "**Pre-trained Models:** Can existing models be leveraged?",
130
- "sub_decisions": ["Which pre-trained models are relevant (image, NLP, tabular)?", "Fine-tune or use as feature extractors?"]
 
 
 
131
  },
132
- "Hyperparameters": {
133
- "explain_text": "**Hyperparameters:** How to optimize model hyperparameters?",
134
- "sub_decisions": ["Grid search vs random search vs Bayesian?", "How many trials and folds to run?", "What budget or time limit applies?"]
 
 
 
 
135
  },
136
- "Model Complexity": {
137
- "explain_text": "**Model Complexity:** Is the model efficient enough for deployment?",
138
- "sub_decisions": ["How many parameters and FLOPs?", "What is memory usage and latency?", "Are there deployment constraints (edge vs cloud)?"]
 
 
 
 
139
  }
140
  },
141
 
142
- "Model Training": {
143
- "explain_text": "**Model Training:** Decisions about the training algorithm used.",
144
- "Data Splitting": {
145
- "explain_text": "**Data Splitting:** How should data be divided for training and testing?",
146
- "sub_decisions": ["Train-test split ratio?", "Cross-validation vs stratified split?"]
 
 
 
147
  },
148
- "Loss Function": {
149
- "explain_text": "**Loss Function:** Which loss function aligns with the task?",
150
- "sub_decisions": ["MSE vs MAE vs cross-entropy?", "Is robustness to outliers needed?", "Does it align with evaluation metrics?"]
 
 
 
 
151
  },
152
- "Optimization Method": {
153
- "explain_text": "**Optimization Method:** Which optimization algorithm should be used?",
154
- "sub_decisions": ["SGD vs Adam vs RMSProp?", "What learning rate schedule?", "What batch size?"]
 
 
 
 
155
  },
156
- "Regularization": {
157
- "explain_text": "**Regularization:** How to prevent overfitting?",
158
- "sub_decisions": ["L1 vs L2 regularization?", "Dropout rate?", "Should early stopping be applied?"]
 
 
 
 
159
  },
160
- "Training Monitoring": {
161
- "explain_text": "**Training Monitoring:** How to track and manage training progress?",
162
- "sub_decisions": ["Which metrics should be monitored?", "How often to checkpoint models?"]
 
 
 
163
  }
164
  },
165
 
166
- "Model Evaluation": {
167
- "explain_text": "**Model Evaluation:** Decisions about the evaluation criteria.",
168
- "Evaluation Metric": {
169
- "explain_text": "**Evaluation Metric:** Which metrics best reflect model performance?",
170
- "sub_decisions": ["Accuracy vs Precision/Recall/F1?", "How to handle class imbalance?", "Including probabilistic metrics (AUC, log loss)?"]
 
 
 
 
171
  },
172
- "Test Data": {
173
- "explain_text": "**Test Data:** How should testing be performed?",
174
- "sub_decisions": ["Hold-out set vs cross-validation?", "An external test dataset?"]
 
 
 
175
  },
176
- "Fairness": {
177
- "explain_text": "**Fairness:** How to ensure fairness across groups?",
178
- "sub_decisions": ["Which fairness metric to use (demographic parity, equalized odds)?", "How to detect bias in predictions?"]
 
 
 
179
  },
180
- "Robustness": {
181
- "explain_text": "**Robustness:** How reliable is the model under stress?",
182
- "sub_decisions": ["How does the model handle noisy inputs?", "How to test against distribution shifts?"]
 
 
 
183
  },
184
- "Interpretability": {
185
- "explain_text": "**Interpretability:** How understandable are the model predictions?",
186
- "sub_decisions": ["Which methods to use (feature importance, SHAP, LIME)?", "How stable are explanations?", "Are explanations actionable for stakeholders?"]
 
 
 
 
187
  }
188
  }
189
- }
 
67
 
68
  # Define pipeline stages
69
  pipeline_data = {
70
+ "πŸ“₯ Data Collection": {
71
+ "explain_text": "**πŸ“₯ Data Collection:** Decisions about what data to collect and how.",
72
+ "πŸ“Š Data Sources": {
73
+ "explain_text": "**πŸ“Š Data Sources:** What data sources will be used to collect data?",
74
+ "sub_decisions": [
75
+ "Collect existing dataset or new sensor data?",
76
+ "Public datasets or Private datasets?",
77
+ "Design Web Scraping or use APIs?"
78
+ ]
79
  },
80
+ "πŸ“œ Data Usage": {
81
+ "explain_text": "**πŸ“œ Data Usage:** How should the data be used, given any license or permission constraints?",
82
+ "sub_decisions": [
83
+ "Ethical concerns to be addressed?",
84
+ "Commercial use policies?",
85
+ "Geographic limits?"
86
+ ]
87
  },
88
+ "🧹 Data Quality": {
89
+ "explain_text": "**🧹 Data Quality:** What kind of quality checks are done to decide data collection?",
90
+ "sub_decisions": [
91
+ "Missing value checks to see if critical field are affected?",
92
+ "Potential duplicates?",
93
+ "Format consistency and encoding issues?"
94
+ ]
95
  },
96
+ "🎲 Data Sampling": {
97
+ "explain_text": "**🎲 Data Sampling:** How to sample from a potentially bigger data source?",
98
+ "sub_decisions": [
99
+ "Random sampling/stratified sampling/cluster sampling?",
100
+ "Sample size?",
101
+ "Potential imbalance?",
102
+ "Additional synthetic data?"
103
+ ]
104
  },
105
+ "πŸ’Ύ Data Storage": {
106
+ "explain_text": "**πŸ’Ύ Data Storage:** How and where to store the data?",
107
+ "sub_decisions": [
108
+ "Backup frequency?",
109
+ "File format choice?"
110
+ ]
111
  },
112
  },
113
 
114
+ "βš™οΈ Data Processing": {
115
+ "explain_text": "**βš™οΈ Data Processing:** Decisions about how to process and prepare the data.",
116
+ "🧽 Data Cleaning": {
117
+ "explain_text": "**🧽 Data Cleaning:** How should raw data be cleaned and standardized?",
118
+ "sub_decisions": [
119
+ "How to handle missing values?",
120
+ "How to detect/remove duplicates?",
121
+ "How to fix formatting errors?"
122
+ ]
123
  },
124
+ "🎯 Feature Selection": {
125
+ "explain_text": "**🎯 Feature Selection:** Which features should be included in the model?",
126
+ "sub_decisions": [
127
+ "Manual vs automated selection?",
128
+ "How to check for data leakage?",
129
+ "Should dimensionality reduction be applied?"
130
+ ]
131
  },
132
+ "πŸ”§ Feature Engineering": {
133
+ "explain_text": "**πŸ”§ Feature Engineering:** How to create or transform features for better performance?",
134
+ "sub_decisions": [
135
+ "What new features should be created?",
136
+ "How to combine existing features?",
137
+ "How to encode categorical variables?"
138
+ ]
139
  },
140
+ "🚨 Outlier Handling": {
141
+ "explain_text": "**🚨 Outlier Handling:** How to deal with unusual or extreme data points?",
142
+ "sub_decisions": [
143
+ "Which detection method to use (Z-score, IQR, clustering)?",
144
+ "Remove, cap, or keep outliers?"
145
+ ]
146
  },
147
+ "πŸ“ Data Scaling": {
148
+ "explain_text": "**πŸ“ Data Scaling:** How to scale or transform features before modeling?",
149
+ "sub_decisions": [
150
+ "Should Min-Max or Standard scaling be applied?",
151
+ "Is log or Box-Cox transformation needed?"
152
+ ]
153
  }
154
  },
155
 
156
+ "πŸ€– Model Selection": {
157
+ "explain_text": "**πŸ€– Model Selection:** Decisions about which model to train and the hyperparameter choices.",
158
+ "πŸ—οΈ Model Architecture": {
159
+ "explain_text": "**πŸ—οΈ Model Architecture:** Which type of model is best suited to the problem?",
160
+ "sub_decisions": [
161
+ "Linear vs tree-based vs neural networks?",
162
+ "How interpretable should the model be?",
163
+ "What are computational constraints?"
164
+ ]
165
  },
166
+ "πŸ“‰ Baseline Model": {
167
+ "explain_text": "**πŸ“‰ Baseline Model:** What simple models can set a performance baseline?",
168
+ "sub_decisions": [
169
+ "Should a logistic regression or decision tree be used?",
170
+ "What baseline metric is most relevant?"
171
+ ]
172
  },
173
+ "🧠 Pre-trained Models": {
174
+ "explain_text": "**🧠 Pre-trained Models:** Can existing models be leveraged?",
175
+ "sub_decisions": [
176
+ "Which pre-trained models are relevant (image, NLP, tabular)?",
177
+ "Fine-tune or use as feature extractors?"
178
+ ]
179
  },
180
+ "⚑ Hyperparameters": {
181
+ "explain_text": "**⚑ Hyperparameters:** How to optimize model hyperparameters?",
182
+ "sub_decisions": [
183
+ "Grid search vs random search vs Bayesian?",
184
+ "How many trials and folds to run?",
185
+ "What budget or time limit applies?"
186
+ ]
187
  },
188
+ "πŸ“¦ Model Complexity": {
189
+ "explain_text": "**πŸ“¦ Model Complexity:** Is the model efficient enough for deployment?",
190
+ "sub_decisions": [
191
+ "How many parameters and FLOPs?",
192
+ "What is memory usage and latency?",
193
+ "Are there deployment constraints (edge vs cloud)?"
194
+ ]
195
  }
196
  },
197
 
198
+ "πŸ‹οΈ Model Training": {
199
+ "explain_text": "**πŸ‹οΈ Model Training:** Decisions about the training algorithm used.",
200
+ "βœ‚οΈ Data Splitting": {
201
+ "explain_text": "**βœ‚οΈ Data Splitting:** How should data be divided for training and testing?",
202
+ "sub_decisions": [
203
+ "Train-test split ratio?",
204
+ "Cross-validation vs stratified split?"
205
+ ]
206
  },
207
+ "βš–οΈ Loss Function": {
208
+ "explain_text": "**βš–οΈ Loss Function:** Which loss function aligns with the task?",
209
+ "sub_decisions": [
210
+ "MSE vs MAE vs cross-entropy?",
211
+ "Is robustness to outliers needed?",
212
+ "Does it align with evaluation metrics?"
213
+ ]
214
  },
215
+ "πŸš€ Optimization Method": {
216
+ "explain_text": "**πŸš€ Optimization Method:** Which optimization algorithm should be used?",
217
+ "sub_decisions": [
218
+ "SGD vs Adam vs RMSProp?",
219
+ "What learning rate schedule?",
220
+ "What batch size?"
221
+ ]
222
  },
223
+ "πŸ›‘οΈ Regularization": {
224
+ "explain_text": "**πŸ›‘οΈ Regularization:** How to prevent overfitting?",
225
+ "sub_decisions": [
226
+ "L1 vs L2 regularization?",
227
+ "Dropout rate?",
228
+ "Should early stopping be applied?"
229
+ ]
230
  },
231
+ "πŸ“Š Training Monitoring": {
232
+ "explain_text": "**πŸ“Š Training Monitoring:** How to track and manage training progress?",
233
+ "sub_decisions": [
234
+ "Which metrics should be monitored?",
235
+ "How often to checkpoint models?"
236
+ ]
237
  }
238
  },
239
 
240
+ "πŸ“ˆ Model Evaluation": {
241
+ "explain_text": "**πŸ“ˆ Model Evaluation:** Decisions about the evaluation criteria.",
242
+ "πŸ“ Evaluation Metric": {
243
+ "explain_text": "**πŸ“ Evaluation Metric:** Which metrics best reflect model performance?",
244
+ "sub_decisions": [
245
+ "Accuracy vs Precision/Recall/F1?",
246
+ "How to handle class imbalance?",
247
+ "Including probabilistic metrics (AUC, log loss)?"
248
+ ]
249
  },
250
+ "πŸ§ͺ Test Data": {
251
+ "explain_text": "**πŸ§ͺ Test Data:** How should testing be performed?",
252
+ "sub_decisions": [
253
+ "Hold-out set vs cross-validation?",
254
+ "An external test dataset?"
255
+ ]
256
  },
257
+ "βš–οΈ Fairness": {
258
+ "explain_text": "**βš–οΈ Fairness:** How to ensure fairness across groups?",
259
+ "sub_decisions": [
260
+ "Which fairness metric to use (demographic parity, equalized odds)?",
261
+ "How to detect bias in predictions?"
262
+ ]
263
  },
264
+ "πŸ› οΈ Robustness": {
265
+ "explain_text": "**πŸ› οΈ Robustness:** How reliable is the model under stress?",
266
+ "sub_decisions": [
267
+ "How does the model handle noisy inputs?",
268
+ "How to test against distribution shifts?"
269
+ ]
270
  },
271
+ "πŸ” Interpretability": {
272
+ "explain_text": "**πŸ” Interpretability:** How understandable are the model predictions?",
273
+ "sub_decisions": [
274
+ "Which methods to use (feature importance, SHAP, LIME)?",
275
+ "How stable are explanations?",
276
+ "Are explanations actionable for stakeholders?"
277
+ ]
278
  }
279
  }
280
+ }