import streamlit as st
import time
def go_to(page_name, from_callback=False):
    """Navigate to *page_name* by updating session state.

    Parameters
    ----------
    page_name : str
        Page identifier stored in ``st.session_state.page``.
    from_callback : bool
        Pass True when invoked from an ``on_click`` callback: Streamlit
        already reruns the script after callbacks, so an explicit
        ``st.rerun()`` is neither needed nor allowed there.
    """
    st.session_state.page = page_name
    if not from_callback:
        # st.rerun() raises RerunException and never returns, so the
        # st.stop() that previously followed it was unreachable dead code
        # and has been removed.
        st.rerun()
def add_navigation(previous_page, next_page):
    """Render Previous/Next buttons in side columns, then a divider.

    Parameters
    ----------
    previous_page, next_page : str | None
        Target page names; a button is omitted when its target is None.
    """
    left, _middle, right = st.columns([1, 4, 1])
    # Drive both buttons from one table: (target page, column, label).
    for target, column, label in (
        (previous_page, left, "Previous"),
        (next_page, right, "Next"),
    ):
        if target is None:
            continue
        with column:
            if st.button(label):
                go_to(target)
    st.markdown("---")
def add_fadein_text(paragraphs):
    """Reveal paragraphs one by one with a CSS fade-in animation.

    Parameters
    ----------
    paragraphs : sequence of (text, delay) pairs
        ``text`` is rendered as HTML; ``delay`` is the pause in seconds
        before the next paragraph appears.
    """
    # Inject the fade-in keyframes and paragraph styling for this render.
    st.markdown("""
    <style>
    @keyframes fadeIn {
        from {opacity: 0;}
        to {opacity: 1;}
    }
    .fade-in {
        animation: fadeIn 1.5s ease-in forwards;
        opacity: 0;
        margin-bottom: 1em;
        font-size: 24px;
        color: #A6C8FF;
        text-align: left;
    }
    </style>
    """, unsafe_allow_html=True)
    # Reserve one placeholder per paragraph so earlier ones stay visible
    # while later ones are still pending.
    slots = [st.empty() for _ in paragraphs]
    for slot, item in zip(slots, paragraphs):
        slot.markdown(f"<div class='fade-in'>{item[0]}</div>", unsafe_allow_html=True)
        time.sleep(item[1])  # pause before revealing the next paragraph
# Define pipeline stages.
# Structure: stage -> sub-area -> task -> list of guiding questions, e.g.
#   pipeline_data["Training"]["Data Splitting"]["Nested CV"]
#     -> ["Outer/inner folds?", "Compute budget?", "Model selection rule?"]
# Pure static data — consumed elsewhere in the app (presumably to render
# the pipeline pages; the consumer is not visible in this chunk).
pipeline_data: dict[str, dict[str, dict[str, list[str]]]] = {
    "Data Collection": {
        "Data Sources": {
            "Identify public datasets": ["Where will you find them?", "Update frequency?", "Licensing constraints?"],
            "Acquire proprietary data": ["Who owns it?", "Access method?", "Cost/contract terms?"],
            "Integrate APIs": ["Which APIs?", "Rate limits?", "Auth method?"],
            "Crowdsourced collection": ["Which platform?", "Quality control?", "Incentive model?"],
            "Sensor/IoT data gathering": ["What hardware?", "Sampling rate?", "Data format?"],
        },
        "Data Licensing & Permissions": {
            "Check copyright status": ["Is it copyrighted?", "Fair use applicable?", "Geographic limits?"],
            "Review usage terms": ["Commercial use allowed?", "Redistribution permitted?", "Attribution required?"],
            "Obtain licenses": ["Cost and renewal?", "Scope of use?", "Termination clauses?"],
            "NDA agreements": ["Parties and duration?", "Scope of confidentiality?", "Breach penalties?"],
            "Open data validation": ["Truly open?", "Source reliability?", "Ethical concerns?"],
        },
        "Data Quality Checks": {
            "Missing value detection": ["% missing?", "MCAR/MAR/MNAR?", "Critical fields affected?"],
            "Duplicate detection": ["Exact vs fuzzy?", "Dedup strategy?", "Impact on metrics?"],
            "Noise assessment": ["Noise sources?", "Filtering options?", "Tolerance thresholds?"],
            "Format consistency": ["Types and units consistent?", "Datetime/encoding issues?", "Schema validation?"],
            "Data freshness review": ["Last update?", "Desired recency?", "Auto-refresh feasible?"],
        },
        "Data Volume Assessment": {
            "Sampling strategy": ["Random/stratified/cluster?", "Sample size?", "Bias risks?"],
            "Class balance check": ["Imbalance ratio?", "Oversample/undersample?", "Synthetic data?"],
            "Size estimation": ["Rows and file size?", "Memory needs?", "Compute bandwidth?"],
            "Incremental updates": ["Append vs merge?", "Versioning plan?", "Conflict handling?"],
            "Redundancy removal": ["Detect redundancy?", "Compression options?", "Archive policy?"],
        },
        "Data Storage Setup": {
            "Database schema design": ["Relational or NoSQL?", "Indexing strategy?", "Normalization level?"],
            "File format selection": ["CSV/Parquet/JSON?", "Compression?", "Interoperability?"],
            "Cloud storage choice": ["AWS/Azure/GCP?", "Cost model?", "Latency region?"],
            "Security setup": ["At-rest/in-transit encryption?", "Access control?", "Audit logging?"],
            "Backup policy": ["Frequency?", "Retention period?", "Restore testing?"],
        },
    },
    "Preprocessing": {
        "Data Cleaning": {
            "Handle missing values": ["Impute or drop?", "Method chosen?", "Impact analysis?"],
            "Remove duplicates": ["Detection method?", "Tie-breaking rule?", "Logging removals?"],
            "Fix formatting errors": ["Standardize types?", "Normalize text?", "Unit conversions?"],
            "Normalize text fields": ["Lowercasing/stemming?", "Stopwords?", "Unicode handling?"],
            "Remove special characters": ["Allowed charset?", "Regex rules?", "Downstream effects?"],
        },
        "Feature Selection": {
            "Manual selection": ["Domain criteria?", "Baseline subset?", "Rationale recorded?"],
            "Statistical selection": ["Correlation/ANOVA/chi²?", "Thresholds?", "Leakage checks?"],
            "Model-based selection": ["Which estimator?", "Importance cutoff?", "Stability across folds?"],
            "Dimensionality reduction": ["PCA/UMAP?", "Target leakage risk?", "Explained variance?"],
            "Domain expert input": ["Who signs off?", "Review cadence?", "Conflict resolution?"],
        },
        "Feature Engineering": {
            "Create new features": ["What transformations?", "Business meaning?", "Overfitting risk?"],
            "Combine existing features": ["Ratios/interactions?", "Collinearity?", "Scaling needs?"],
            "Polynomial features": ["Max degree?", "Sparsity management?", "Regularization plan?"],
            "Temporal features": ["Lags/rolling stats?", "Seasonality?", "Time zones?"],
            "Categorical encoding": ["One-hot/target/WOE?", "High-cardinality strategy?", "Leakage prevention?"],
        },
        "Outlier Handling": {
            "Z-score method": ["Threshold used?", "Per-group scaling?", "Robust alternatives?"],
            "IQR method": ["Multiplier (1.5/3)?", "Per-feature vs joint?", "Winsorize vs remove?"],
            "Winsorization": ["Clip bounds?", "Effect on metrics?", "Documented rationale?"],
            "Clustering-based removal": ["Which clustering?", "Distance cutoff?", "Class impact?"],
            "Manual inspection": ["Visualization used?", "Reviewer criteria?", "Reproducibility?"],
        },
        "Scaling & Transformation": {
            "Min-Max scaling": ["Range chosen?", "Fit on train only?", "Outlier sensitivity?"],
            "Standard scaling": ["Fit scope?", "Pipeline placement?", "Assumed distribution?"],
            "Log transformation": ["Which features?", "Shift for zeros?", "Interpretability?"],
            "Box-Cox transformation": ["Lambda search?", "Normality gain?", "Constraints?"],
            "Quantile transformation": ["Quantiles used?", "Monotonicity preserved?", "Generalization?"],
        },
    },
    "Model Selection": {
        "Algorithm Research": {
            "Linear models": ["Why suitable?", "Regularization choice?", "Feature assumptions?"],
            "Tree-based models": ["Depth/leaf constraints?", "Handling missing?", "Interpretability?"],
            "Neural networks": ["Architecture size?", "Training budget?", "Latency target?"],
            "Ensemble methods": ["Bagging/boosting/stacking?", "Diversity sources?", "Overfit control?"],
            "Probabilistic models": ["Distributional assumptions?", "Calibration needs?", "Uncertainty outputs?"],
        },
        "Baseline Model Creation": {
            "Simple logistic regression": ["Baseline metric?", "Class weighting?", "Regularization?"],
            "Decision stump": ["Split criterion?", "Benchmark purpose?", "Handling ties?"],
            "Dummy classifier": ["Most frequent/stratified?", "Expected score?", "Sanity check?"],
            "KNN baseline": ["K selection?", "Distance metric?", "Scaling requirement?"],
            "Majority class predictor": ["Imbalance insight?", "Floor performance?", "Usefulness?"],
        },
        "Pre-trained Model Exploration": {
            "Image models": ["Which backbone?", "Input size?", "Fine-tune vs freeze?"],
            "NLP models": ["Tokenizer/vocab?", "Sequence length?", "Adaptation method?"],
            "Speech models": ["Sampling rate?", "Feature front-end?", "WER target?"],
            "Tabular models": ["CatBoost/FT-Transformer?", "Categorical handling?", "GPU needs?"],
            "Multi-modal models": ["Fusion strategy?", "Alignment loss?", "Data requirements?"],
        },
        "Hyperparameter Strategy": {
            "Grid search": ["Search space size?", "CV folds?", "Budget/time limit?"],
            "Random search": ["Distributions?", "Trials planned?", "Early stopping?"],
            "Bayesian optimization": ["Surrogate model?", "Acquisition function?", "Parallelism?"],
            "Hyperband": ["Max resources?", "Reduction factor?", "Stochasticity handling?"],
            "Manual tuning": ["Heuristics?", "Logging decisions?", "Reproducibility?"],
        },
        "Model Complexity Assessment": {
            "Parameter count": ["Max allowed?", "Memory footprint?", "Compression options?"],
            "FLOPs estimation": ["Target platform?", "Latency budget?", "Batch size effects?"],
            "Memory usage": ["Peak RAM/VRAM?", "Streaming feasible?", "Quantization?"],
            "Inference latency": ["P50/P95 targets?", "Hardware assumptions?", "Batching strategy?"],
            "Deployment constraints": ["Edge vs cloud?", "Throughput goals?", "Cost ceiling?"],
        },
    },
    "Training": {
        "Data Splitting": {
            "Train-test split": ["Split ratio?", "Stratification?", "Random seed?"],
            "Cross-validation": ["K folds?", "Shuffle strategy?", "Leakage prevention?"],
            "Stratified split": ["Which strata?", "Min group size?", "Imbalance kept?"],
            "Time-series split": ["Gap/embargo?", "Horizon size?", "Leakage checks?"],
            "Nested CV": ["Outer/inner folds?", "Compute budget?", "Model selection rule?"],
        },
        "Loss Function Choice": {
            "MSE": ["Why MSE?", "Outlier sensitivity?", "Alternatives considered?"],
            "Cross-entropy": ["Label smoothing?", "Class weights?", "Numerical stability?"],
            "MAE": ["Robustness need?", "Optimization impact?", "Evaluation alignment?"],
            "Huber loss": ["Delta parameter?", "Outlier profile?", "Convergence behavior?"],
            "Custom loss": ["Definition and gradients?", "Calibration to metrics?", "Debugging plan?"],
        },
        "Optimization Method": {
            "SGD": ["Momentum/nesterov?", "Learning rate schedule?", "Batch size?"],
            "Adam": ["Beta values?", "Weight decay?", "Warmup?"],
            "RMSProp": ["Decay rate?", "Centered variant?", "Stability?"],
            "Adagrad": ["Learning rate decay?", "Sparsity benefits?", "Reset strategy?"],
            "L-BFGS": ["Batching approach?", "Memory limits?", "Convergence criteria?"],
        },
        "Regularization": {
            "L1": ["Sparsity goal?", "Lambda value?", "Feature pruning?"],
            "L2": ["Weight decay?", "Overfit control?", "Interaction with optimizer?"],
            "Dropout": ["Rates per layer?", "Inference behavior?", "Co-adaptation risk?"],
            "Data augmentation": ["Which transforms?", "Label preservation?", "Distribution shift?"],
            "Early stopping": ["Patience metric?", "Min delta?", "Checkpoint policy?"],
        },
        "Training Monitoring": {
            "Loss curves": ["Smoothing?", "Train/val gap?", "Anomaly alerts?"],
            "Accuracy curves": ["Metric tracked?", "Class-wise trends?", "Plateau detection?"],
            "Validation metrics": ["Primary KPI?", "Reporting cadence?", "Confidence intervals?"],
            "Learning rate schedule": ["Schedule type?", "Boundaries?", "Warm restarts?"],
            "Checkpointing": ["Frequency?", "Best-vs-last?", "Storage budget?"],
        },
    },
    "Evaluation": {
        "Metric Selection": {
            "Accuracy": ["Is class balance fair?", "Threshold chosen?", "Business relevance?"],
            "Precision/Recall/F1": ["Which is primary?", "Threshold tuning?", "Cost of errors?"],
            "ROC AUC": ["Calibration issues?", "Class imbalance?", "Interpretation limits?"],
            "Log loss": ["Probability quality?", "Overconfidence penalty?", "Label noise?"],
            "MSE/RMSE": ["Scale sensitivity?", "Baseline comparison?", "Outlier impact?"],
        },
        "Test Data Strategy": {
            "Hold-out set": ["Size and representativeness?", "Temporal leakage?", "Reuse policy?"],
            "External dataset": ["Domain match?", "License/ethics?", "Reproducibility?"],
            "Cross-validation results": ["Variance across folds?", "Confidence bands?", "Selection bias?"],
            "Leave-one-out": ["Compute cost?", "Variance concerns?", "Use case fit?"],
            "Bootstrapping": ["Resample size?", "CI method?", "Stability?"],
        },
        "Fairness Checks": {
            "Demographic parity": ["Protected attributes?", "Gap tolerated?", "Mitigation plan?"],
            "Equalized odds": ["TPR/FPR parity?", "Group definitions?", "Trade-offs?"],
            "Calibration across groups": ["Expected vs observed?", "Bins and sizes?", "Recalibration?"],
            "Bias detection": ["Pre/post metrics?", "Data imbalance role?", "Human review?"],
            "Ethical review": ["Stakeholder impact?", "Transparency level?", "Documentation?"],
        },
        "Robustness Testing": {
            "Noisy input tests": ["Noise model?", "Degradation curve?", "Defenses?"],
            "Adversarial attacks": ["Threat model?", "Attack types?", "Detection/robustness?"],
            "Stress tests": ["Extreme values?", "Load/latency?", "Resource limits?"],
            "Distribution shift": ["Which shifts?", "Detection method?", "Adaptation strategy?"],
            "Random perturbations": ["Perturbation scale?", "Repeatability?", "Metric sensitivity?"],
        },
        "Model Interpretability": {
            "Feature importance": ["Method used?", "Stability across runs?", "Correlated features?"],
            "SHAP values": ["Background data?", "Runtime cost?", "Global vs local?"],
            "LIME explanations": ["Kernel width?", "Neighborhood size?", "Faithfulness?"],
            "Partial dependence plots": ["Feature interactions?", "ICE vs PDP?", "Monotonicity?"],
            "Counterfactual explanations": ["Feasible actions?", "Cost function?", "Recourse policy?"],
        },
    },
}