# Standard library
import time

# Third-party
import streamlit as st
|
|
def go_to(page_name, from_callback=False):
    """Navigate the app to *page_name* by updating session state.

    Parameters
    ----------
    page_name : str
        Value stored in ``st.session_state.page``; the main script is
        expected to dispatch rendering on this value.
    from_callback : bool, optional
        Pass ``True`` when invoked from an ``on_click`` callback.
        Streamlit automatically reruns after a callback, and calling
        ``st.rerun()`` inside one is not allowed, so the explicit
        rerun is skipped in that case.
    """
    st.session_state.page = page_name
    if not from_callback:
        # Outside a callback the page change only takes effect after an
        # explicit rerun.
        st.rerun()
| |
|
|
def add_navigation(previous_page, next_page):
    """Render Previous/Next navigation buttons followed by a divider.

    Parameters
    ----------
    previous_page : str or None
        Target page for the "Previous" button; pass ``None`` to omit it.
    next_page : str or None
        Target page for the "Next" button; pass ``None`` to omit it.

    The wide middle column (``col2``) is an intentional spacer that
    pushes the buttons to the left and right edges.
    """
    col1, col2, col3 = st.columns([1, 4, 1])

    if previous_page is not None:
        with col1:
            # st.button returns True on the rerun triggered by a click,
            # so go_to() is called outside a callback (default rerun path).
            if st.button("Previous"):
                go_to(previous_page)

    if next_page is not None:
        with col3:
            if st.button("Next"):
                go_to(next_page)

    st.markdown("---")
| |
|
|
def add_fadein_text(paragraphs):
    """Reveal paragraphs one at a time with a CSS fade-in animation.

    Parameters
    ----------
    paragraphs : sequence of (text, delay) pairs
        ``p[0]`` is inserted as raw HTML inside a ``.fade-in`` <div>
        (NOTE(review): the text is not escaped — only pass trusted
        content); ``p[1]`` is the number of seconds to wait before
        revealing the next paragraph.
    """
    # Inject the keyframe animation and paragraph styling once.
    st.markdown("""
    <style>
    @keyframes fadeIn {
        from {opacity: 0;}
        to {opacity: 1;}
    }
    .fade-in {
        animation: fadeIn 1.5s ease-in forwards;
        opacity: 0;
        margin-bottom: 1em;
        font-size: 24px;
        color: #A6C8FF;
        text-align: left;
    }
    </style>
    """, unsafe_allow_html=True)

    # One placeholder per paragraph so earlier paragraphs stay visible
    # while later ones appear below them.
    placeholders = [st.empty() for _ in paragraphs]

    for i, p in enumerate(paragraphs):
        placeholders[i].markdown(f"<div class='fade-in'>{p[0]}</div>", unsafe_allow_html=True)
        # Pause before the next paragraph; the CSS animation itself runs
        # client-side, so this only paces the reveals.
        time.sleep(p[1])
|
|
def add_instruction_text(text_to_display):
    """Render centered gray instruction text followed by a divider.

    Parameters
    ----------
    text_to_display : str
        Inserted as raw HTML inside the <div> (unsanitized — only pass
        trusted content).
    """
    st.markdown(f"<div style='text-align: center; font-size:18px; color:gray;'>{text_to_display}</div>", unsafe_allow_html=True)

    st.markdown("---")
|
|
| |
# Static content describing an ML pipeline as a three-level tree:
#   stage name -> category name -> task name -> list of three prompt questions.
# Consumed by the UI to drive checklists / guided questions per pipeline step.
pipeline_data = {
    "Data Collection": {
        "Data Sources": {
            "Identify public datasets": ["Where will you find them?", "Update frequency?", "Licensing constraints?"],
            "Acquire proprietary data": ["Who owns it?", "Access method?", "Cost/contract terms?"],
            "Integrate APIs": ["Which APIs?", "Rate limits?", "Auth method?"],
            "Crowdsourced collection": ["Which platform?", "Quality control?", "Incentive model?"],
            "Sensor/IoT data gathering": ["What hardware?", "Sampling rate?", "Data format?"],
        },
        "Data Licensing & Permissions": {
            "Check copyright status": ["Is it copyrighted?", "Fair use applicable?", "Geographic limits?"],
            "Review usage terms": ["Commercial use allowed?", "Redistribution permitted?", "Attribution required?"],
            "Obtain licenses": ["Cost and renewal?", "Scope of use?", "Termination clauses?"],
            "NDA agreements": ["Parties and duration?", "Scope of confidentiality?", "Breach penalties?"],
            "Open data validation": ["Truly open?", "Source reliability?", "Ethical concerns?"],
        },
        "Data Quality Checks": {
            "Missing value detection": ["% missing?", "MCAR/MAR/MNAR?", "Critical fields affected?"],
            "Duplicate detection": ["Exact vs fuzzy?", "Dedup strategy?", "Impact on metrics?"],
            "Noise assessment": ["Noise sources?", "Filtering options?", "Tolerance thresholds?"],
            "Format consistency": ["Types and units consistent?", "Datetime/encoding issues?", "Schema validation?"],
            "Data freshness review": ["Last update?", "Desired recency?", "Auto-refresh feasible?"],
        },
        "Data Volume Assessment": {
            "Sampling strategy": ["Random/stratified/cluster?", "Sample size?", "Bias risks?"],
            "Class balance check": ["Imbalance ratio?", "Oversample/undersample?", "Synthetic data?"],
            "Size estimation": ["Rows and file size?", "Memory needs?", "Compute bandwidth?"],
            "Incremental updates": ["Append vs merge?", "Versioning plan?", "Conflict handling?"],
            "Redundancy removal": ["Detect redundancy?", "Compression options?", "Archive policy?"],
        },
        "Data Storage Setup": {
            "Database schema design": ["Relational or NoSQL?", "Indexing strategy?", "Normalization level?"],
            "File format selection": ["CSV/Parquet/JSON?", "Compression?", "Interoperability?"],
            "Cloud storage choice": ["AWS/Azure/GCP?", "Cost model?", "Latency region?"],
            "Security setup": ["At-rest/in-transit encryption?", "Access control?", "Audit logging?"],
            "Backup policy": ["Frequency?", "Retention period?", "Restore testing?"],
        },
    },

    "Preprocessing": {
        "Data Cleaning": {
            "Handle missing values": ["Impute or drop?", "Method chosen?", "Impact analysis?"],
            "Remove duplicates": ["Detection method?", "Tie-breaking rule?", "Logging removals?"],
            "Fix formatting errors": ["Standardize types?", "Normalize text?", "Unit conversions?"],
            "Normalize text fields": ["Lowercasing/stemming?", "Stopwords?", "Unicode handling?"],
            "Remove special characters": ["Allowed charset?", "Regex rules?", "Downstream effects?"],
        },
        "Feature Selection": {
            "Manual selection": ["Domain criteria?", "Baseline subset?", "Rationale recorded?"],
            "Statistical selection": ["Correlation/ANOVA/chi²?", "Thresholds?", "Leakage checks?"],
            "Model-based selection": ["Which estimator?", "Importance cutoff?", "Stability across folds?"],
            "Dimensionality reduction": ["PCA/UMAP?", "Target leakage risk?", "Explained variance?"],
            "Domain expert input": ["Who signs off?", "Review cadence?", "Conflict resolution?"],
        },
        "Feature Engineering": {
            "Create new features": ["What transformations?", "Business meaning?", "Overfitting risk?"],
            "Combine existing features": ["Ratios/interactions?", "Collinearity?", "Scaling needs?"],
            "Polynomial features": ["Max degree?", "Sparsity management?", "Regularization plan?"],
            "Temporal features": ["Lags/rolling stats?", "Seasonality?", "Time zones?"],
            "Categorical encoding": ["One-hot/target/WOE?", "High-cardinality strategy?", "Leakage prevention?"],
        },
        "Outlier Handling": {
            "Z-score method": ["Threshold used?", "Per-group scaling?", "Robust alternatives?"],
            "IQR method": ["Multiplier (1.5/3)?", "Per-feature vs joint?", "Winsorize vs remove?"],
            "Winsorization": ["Clip bounds?", "Effect on metrics?", "Documented rationale?"],
            "Clustering-based removal": ["Which clustering?", "Distance cutoff?", "Class impact?"],
            "Manual inspection": ["Visualization used?", "Reviewer criteria?", "Reproducibility?"],
        },
        "Scaling & Transformation": {
            "Min-Max scaling": ["Range chosen?", "Fit on train only?", "Outlier sensitivity?"],
            "Standard scaling": ["Fit scope?", "Pipeline placement?", "Assumed distribution?"],
            "Log transformation": ["Which features?", "Shift for zeros?", "Interpretability?"],
            "Box-Cox transformation": ["Lambda search?", "Normality gain?", "Constraints?"],
            "Quantile transformation": ["Quantiles used?", "Monotonicity preserved?", "Generalization?"],
        },
    },

    "Model Selection": {
        "Algorithm Research": {
            "Linear models": ["Why suitable?", "Regularization choice?", "Feature assumptions?"],
            "Tree-based models": ["Depth/leaf constraints?", "Handling missing?", "Interpretability?"],
            "Neural networks": ["Architecture size?", "Training budget?", "Latency target?"],
            "Ensemble methods": ["Bagging/boosting/stacking?", "Diversity sources?", "Overfit control?"],
            "Probabilistic models": ["Distributional assumptions?", "Calibration needs?", "Uncertainty outputs?"],
        },
        "Baseline Model Creation": {
            "Simple logistic regression": ["Baseline metric?", "Class weighting?", "Regularization?"],
            "Decision stump": ["Split criterion?", "Benchmark purpose?", "Handling ties?"],
            "Dummy classifier": ["Most frequent/stratified?", "Expected score?", "Sanity check?"],
            "KNN baseline": ["K selection?", "Distance metric?", "Scaling requirement?"],
            "Majority class predictor": ["Imbalance insight?", "Floor performance?", "Usefulness?"],
        },
        "Pre-trained Model Exploration": {
            "Image models": ["Which backbone?", "Input size?", "Fine-tune vs freeze?"],
            "NLP models": ["Tokenizer/vocab?", "Sequence length?", "Adaptation method?"],
            "Speech models": ["Sampling rate?", "Feature front-end?", "WER target?"],
            "Tabular models": ["CatBoost/FT-Transformer?", "Categorical handling?", "GPU needs?"],
            "Multi-modal models": ["Fusion strategy?", "Alignment loss?", "Data requirements?"],
        },
        "Hyperparameter Strategy": {
            "Grid search": ["Search space size?", "CV folds?", "Budget/time limit?"],
            "Random search": ["Distributions?", "Trials planned?", "Early stopping?"],
            "Bayesian optimization": ["Surrogate model?", "Acquisition function?", "Parallelism?"],
            "Hyperband": ["Max resources?", "Reduction factor?", "Stochasticity handling?"],
            "Manual tuning": ["Heuristics?", "Logging decisions?", "Reproducibility?"],
        },
        "Model Complexity Assessment": {
            "Parameter count": ["Max allowed?", "Memory footprint?", "Compression options?"],
            "FLOPs estimation": ["Target platform?", "Latency budget?", "Batch size effects?"],
            "Memory usage": ["Peak RAM/VRAM?", "Streaming feasible?", "Quantization?"],
            "Inference latency": ["P50/P95 targets?", "Hardware assumptions?", "Batching strategy?"],
            "Deployment constraints": ["Edge vs cloud?", "Throughput goals?", "Cost ceiling?"],
        },
    },

    "Training": {
        "Data Splitting": {
            "Train-test split": ["Split ratio?", "Stratification?", "Random seed?"],
            "Cross-validation": ["K folds?", "Shuffle strategy?", "Leakage prevention?"],
            "Stratified split": ["Which strata?", "Min group size?", "Imbalance kept?"],
            "Time-series split": ["Gap/embargo?", "Horizon size?", "Leakage checks?"],
            "Nested CV": ["Outer/inner folds?", "Compute budget?", "Model selection rule?"],
        },
        "Loss Function Choice": {
            "MSE": ["Why MSE?", "Outlier sensitivity?", "Alternatives considered?"],
            "Cross-entropy": ["Label smoothing?", "Class weights?", "Numerical stability?"],
            "MAE": ["Robustness need?", "Optimization impact?", "Evaluation alignment?"],
            "Huber loss": ["Delta parameter?", "Outlier profile?", "Convergence behavior?"],
            "Custom loss": ["Definition and gradients?", "Calibration to metrics?", "Debugging plan?"],
        },
        "Optimization Method": {
            "SGD": ["Momentum/nesterov?", "Learning rate schedule?", "Batch size?"],
            "Adam": ["Beta values?", "Weight decay?", "Warmup?"],
            "RMSProp": ["Decay rate?", "Centered variant?", "Stability?"],
            "Adagrad": ["Learning rate decay?", "Sparsity benefits?", "Reset strategy?"],
            "L-BFGS": ["Batching approach?", "Memory limits?", "Convergence criteria?"],
        },
        "Regularization": {
            "L1": ["Sparsity goal?", "Lambda value?", "Feature pruning?"],
            "L2": ["Weight decay?", "Overfit control?", "Interaction with optimizer?"],
            "Dropout": ["Rates per layer?", "Inference behavior?", "Co-adaptation risk?"],
            "Data augmentation": ["Which transforms?", "Label preservation?", "Distribution shift?"],
            "Early stopping": ["Patience metric?", "Min delta?", "Checkpoint policy?"],
        },
        "Training Monitoring": {
            "Loss curves": ["Smoothing?", "Train/val gap?", "Anomaly alerts?"],
            "Accuracy curves": ["Metric tracked?", "Class-wise trends?", "Plateau detection?"],
            "Validation metrics": ["Primary KPI?", "Reporting cadence?", "Confidence intervals?"],
            "Learning rate schedule": ["Schedule type?", "Boundaries?", "Warm restarts?"],
            "Checkpointing": ["Frequency?", "Best-vs-last?", "Storage budget?"],
        },
    },

    "Evaluation": {
        "Metric Selection": {
            "Accuracy": ["Is class balance fair?", "Threshold chosen?", "Business relevance?"],
            "Precision/Recall/F1": ["Which is primary?", "Threshold tuning?", "Cost of errors?"],
            "ROC AUC": ["Calibration issues?", "Class imbalance?", "Interpretation limits?"],
            "Log loss": ["Probability quality?", "Overconfidence penalty?", "Label noise?"],
            "MSE/RMSE": ["Scale sensitivity?", "Baseline comparison?", "Outlier impact?"],
        },
        "Test Data Strategy": {
            "Hold-out set": ["Size and representativeness?", "Temporal leakage?", "Reuse policy?"],
            "External dataset": ["Domain match?", "License/ethics?", "Reproducibility?"],
            "Cross-validation results": ["Variance across folds?", "Confidence bands?", "Selection bias?"],
            "Leave-one-out": ["Compute cost?", "Variance concerns?", "Use case fit?"],
            "Bootstrapping": ["Resample size?", "CI method?", "Stability?"],
        },
        "Fairness Checks": {
            "Demographic parity": ["Protected attributes?", "Gap tolerated?", "Mitigation plan?"],
            "Equalized odds": ["TPR/FPR parity?", "Group definitions?", "Trade-offs?"],
            "Calibration across groups": ["Expected vs observed?", "Bins and sizes?", "Recalibration?"],
            "Bias detection": ["Pre/post metrics?", "Data imbalance role?", "Human review?"],
            "Ethical review": ["Stakeholder impact?", "Transparency level?", "Documentation?"],
        },
        "Robustness Testing": {
            "Noisy input tests": ["Noise model?", "Degradation curve?", "Defenses?"],
            "Adversarial attacks": ["Threat model?", "Attack types?", "Detection/robustness?"],
            "Stress tests": ["Extreme values?", "Load/latency?", "Resource limits?"],
            "Distribution shift": ["Which shifts?", "Detection method?", "Adaptation strategy?"],
            "Random perturbations": ["Perturbation scale?", "Repeatability?", "Metric sensitivity?"],
        },
        "Model Interpretability": {
            "Feature importance": ["Method used?", "Stability across runs?", "Correlated features?"],
            "SHAP values": ["Background data?", "Runtime cost?", "Global vs local?"],
            "LIME explanations": ["Kernel width?", "Neighborhood size?", "Faithfulness?"],
            "Partial dependence plots": ["Feature interactions?", "ICE vs PDP?", "Monotonicity?"],
            "Counterfactual explanations": ["Feasible actions?", "Cost function?", "Recourse policy?"],
        },
    },
}