import streamlit as st
import time

def go_to(page_name, from_callback=False):
    """Set the current page in session_state and rerun if needed.

    Pass from_callback=True when calling from an on_click callback:
    Streamlit reruns the script automatically after a callback, so an
    explicit st.rerun() is unnecessary there.
    """
    st.session_state.page = page_name
    if not from_callback:
        st.rerun()  # raises RerunException, so nothing after this line runs
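
# Illustrative usage of go_to (an assumed example; "home" is a placeholder
# page name, not taken from the original file). Inside a callback, pass
# from_callback=True and let Streamlit handle the rerun; in normal script
# flow, go_to triggers the rerun itself.
def demo_go_to():
    st.button("Home (callback)", on_click=go_to, args=("home",),
              kwargs={"from_callback": True})
    if st.button("Home (inline)"):
        go_to("home")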

def add_navigation(previous_page, next_page):
    """Render Previous/Next buttons in the side columns; pass None to omit one."""
    col1, _, col3 = st.columns([1, 4, 1])  # middle column is just a spacer

    if previous_page is not None:
        with col1:
            if st.button("Previous"):
                go_to(previous_page)

    if next_page is not None:
        with col3:
            if st.button("Next"):
                go_to(next_page)

    st.markdown("---")

def add_fadein_text(paragraphs):
    """Reveal paragraphs one at a time with a CSS fade-in.

    paragraphs: list of (text, delay_seconds) tuples, where delay_seconds
    is the pause before the next paragraph appears.
    """
    # Inject the CSS fade-in animation once
    st.markdown("""
        <style>
        @keyframes fadeIn {
            from {opacity: 0;}
            to {opacity: 1;}
        }
        .fade-in {
            animation: fadeIn 1.5s ease-in forwards;
            opacity: 0;
            margin-bottom: 1em;
            font-size: 24px;
            color: #A6C8FF;
            text-align: left;
        }
        </style>
    """, unsafe_allow_html=True)

    # One placeholder per paragraph so earlier text stays on screen
    placeholders = [st.empty() for _ in paragraphs]

    # Reveal paragraphs sequentially
    for placeholder, (text, delay) in zip(placeholders, paragraphs):
        placeholder.markdown(f"<div class='fade-in'>{text}</div>", unsafe_allow_html=True)
        time.sleep(delay)  # pause before revealing the next paragraph
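
# Illustrative usage of add_fadein_text (an assumed example; the copy and
# delays below are placeholders, not taken from the original app).
def demo_fadein():
    add_fadein_text([
        ("Welcome to the ML pipeline explorer.", 1.0),
        ("Each stage expands into tasks and guiding questions.", 1.5),
    ])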

# Pipeline stages: {stage: {category: {task: [guiding questions]}}}
pipeline_data = {
    "Data Collection": {
        "Data Sources": {
            "Identify public datasets": ["Where will you find them?", "Update frequency?", "Licensing constraints?"],
            "Acquire proprietary data": ["Who owns it?", "Access method?", "Cost/contract terms?"],
            "Integrate APIs": ["Which APIs?", "Rate limits?", "Auth method?"],
            "Crowdsourced collection": ["Which platform?", "Quality control?", "Incentive model?"],
            "Sensor/IoT data gathering": ["What hardware?", "Sampling rate?", "Data format?"],
        },
        "Data Licensing & Permissions": {
            "Check copyright status": ["Is it copyrighted?", "Fair use applicable?", "Geographic limits?"],
            "Review usage terms": ["Commercial use allowed?", "Redistribution permitted?", "Attribution required?"],
            "Obtain licenses": ["Cost and renewal?", "Scope of use?", "Termination clauses?"],
            "NDA agreements": ["Parties and duration?", "Scope of confidentiality?", "Breach penalties?"],
            "Open data validation": ["Truly open?", "Source reliability?", "Ethical concerns?"],
        },
        "Data Quality Checks": {
            "Missing value detection": ["% missing?", "MCAR/MAR/MNAR?", "Critical fields affected?"],
            "Duplicate detection": ["Exact vs fuzzy?", "Dedup strategy?", "Impact on metrics?"],
            "Noise assessment": ["Noise sources?", "Filtering options?", "Tolerance thresholds?"],
            "Format consistency": ["Types and units consistent?", "Datetime/encoding issues?", "Schema validation?"],
            "Data freshness review": ["Last update?", "Desired recency?", "Auto-refresh feasible?"],
        },
        "Data Volume Assessment": {
            "Sampling strategy": ["Random/stratified/cluster?", "Sample size?", "Bias risks?"],
            "Class balance check": ["Imbalance ratio?", "Oversample/undersample?", "Synthetic data?"],
            "Size estimation": ["Rows and file size?", "Memory needs?", "Compute bandwidth?"],
            "Incremental updates": ["Append vs merge?", "Versioning plan?", "Conflict handling?"],
            "Redundancy removal": ["Detect redundancy?", "Compression options?", "Archive policy?"],
        },
        "Data Storage Setup": {
            "Database schema design": ["Relational or NoSQL?", "Indexing strategy?", "Normalization level?"],
            "File format selection": ["CSV/Parquet/JSON?", "Compression?", "Interoperability?"],
            "Cloud storage choice": ["AWS/Azure/GCP?", "Cost model?", "Latency region?"],
            "Security setup": ["At-rest/in-transit encryption?", "Access control?", "Audit logging?"],
            "Backup policy": ["Frequency?", "Retention period?", "Restore testing?"],
        },
    },

    "Preprocessing": {
        "Data Cleaning": {
            "Handle missing values": ["Impute or drop?", "Method chosen?", "Impact analysis?"],
            "Remove duplicates": ["Detection method?", "Tie-breaking rule?", "Logging removals?"],
            "Fix formatting errors": ["Standardize types?", "Normalize text?", "Unit conversions?"],
            "Normalize text fields": ["Lowercasing/stemming?", "Stopwords?", "Unicode handling?"],
            "Remove special characters": ["Allowed charset?", "Regex rules?", "Downstream effects?"],
        },
        "Feature Selection": {
            "Manual selection": ["Domain criteria?", "Baseline subset?", "Rationale recorded?"],
            "Statistical selection": ["Correlation/ANOVA/chi²?", "Thresholds?", "Leakage checks?"],
            "Model-based selection": ["Which estimator?", "Importance cutoff?", "Stability across folds?"],
            "Dimensionality reduction": ["PCA/UMAP?", "Target leakage risk?", "Explained variance?"],
            "Domain expert input": ["Who signs off?", "Review cadence?", "Conflict resolution?"],
        },
        "Feature Engineering": {
            "Create new features": ["What transformations?", "Business meaning?", "Overfitting risk?"],
            "Combine existing features": ["Ratios/interactions?", "Collinearity?", "Scaling needs?"],
            "Polynomial features": ["Max degree?", "Sparsity management?", "Regularization plan?"],
            "Temporal features": ["Lags/rolling stats?", "Seasonality?", "Time zones?"],
            "Categorical encoding": ["One-hot/target/WOE?", "High-cardinality strategy?", "Leakage prevention?"],
        },
        "Outlier Handling": {
            "Z-score method": ["Threshold used?", "Per-group scaling?", "Robust alternatives?"],
            "IQR method": ["Multiplier (1.5/3)?", "Per-feature vs joint?", "Winsorize vs remove?"],
            "Winsorization": ["Clip bounds?", "Effect on metrics?", "Documented rationale?"],
            "Clustering-based removal": ["Which clustering?", "Distance cutoff?", "Class impact?"],
            "Manual inspection": ["Visualization used?", "Reviewer criteria?", "Reproducibility?"],
        },
        "Scaling & Transformation": {
            "Min-Max scaling": ["Range chosen?", "Fit on train only?", "Outlier sensitivity?"],
            "Standard scaling": ["Fit scope?", "Pipeline placement?", "Assumed distribution?"],
            "Log transformation": ["Which features?", "Shift for zeros?", "Interpretability?"],
            "Box-Cox transformation": ["Lambda search?", "Normality gain?", "Constraints?"],
            "Quantile transformation": ["Quantiles used?", "Monotonicity preserved?", "Generalization?"],
        },
    },

    "Model Selection": {
        "Algorithm Research": {
            "Linear models": ["Why suitable?", "Regularization choice?", "Feature assumptions?"],
            "Tree-based models": ["Depth/leaf constraints?", "Handling missing?", "Interpretability?"],
            "Neural networks": ["Architecture size?", "Training budget?", "Latency target?"],
            "Ensemble methods": ["Bagging/boosting/stacking?", "Diversity sources?", "Overfit control?"],
            "Probabilistic models": ["Distributional assumptions?", "Calibration needs?", "Uncertainty outputs?"],
        },
        "Baseline Model Creation": {
            "Simple logistic regression": ["Baseline metric?", "Class weighting?", "Regularization?"],
            "Decision stump": ["Split criterion?", "Benchmark purpose?", "Handling ties?"],
            "Dummy classifier": ["Most frequent/stratified?", "Expected score?", "Sanity check?"],
            "KNN baseline": ["K selection?", "Distance metric?", "Scaling requirement?"],
            "Majority class predictor": ["Imbalance insight?", "Floor performance?", "Usefulness?"],
        },
        "Pre-trained Model Exploration": {
            "Image models": ["Which backbone?", "Input size?", "Fine-tune vs freeze?"],
            "NLP models": ["Tokenizer/vocab?", "Sequence length?", "Adaptation method?"],
            "Speech models": ["Sampling rate?", "Feature front-end?", "WER target?"],
            "Tabular models": ["CatBoost/FT-Transformer?", "Categorical handling?", "GPU needs?"],
            "Multi-modal models": ["Fusion strategy?", "Alignment loss?", "Data requirements?"],
        },
        "Hyperparameter Strategy": {
            "Grid search": ["Search space size?", "CV folds?", "Budget/time limit?"],
            "Random search": ["Distributions?", "Trials planned?", "Early stopping?"],
            "Bayesian optimization": ["Surrogate model?", "Acquisition function?", "Parallelism?"],
            "Hyperband": ["Max resources?", "Reduction factor?", "Stochasticity handling?"],
            "Manual tuning": ["Heuristics?", "Logging decisions?", "Reproducibility?"],
        },
        "Model Complexity Assessment": {
            "Parameter count": ["Max allowed?", "Memory footprint?", "Compression options?"],
            "FLOPs estimation": ["Target platform?", "Latency budget?", "Batch size effects?"],
            "Memory usage": ["Peak RAM/VRAM?", "Streaming feasible?", "Quantization?"],
            "Inference latency": ["P50/P95 targets?", "Hardware assumptions?", "Batching strategy?"],
            "Deployment constraints": ["Edge vs cloud?", "Throughput goals?", "Cost ceiling?"],
        },
    },

    "Training": {
        "Data Splitting": {
            "Train-test split": ["Split ratio?", "Stratification?", "Random seed?"],
            "Cross-validation": ["K folds?", "Shuffle strategy?", "Leakage prevention?"],
            "Stratified split": ["Which strata?", "Min group size?", "Imbalance kept?"],
            "Time-series split": ["Gap/embargo?", "Horizon size?", "Leakage checks?"],
            "Nested CV": ["Outer/inner folds?", "Compute budget?", "Model selection rule?"],
        },
        "Loss Function Choice": {
            "MSE": ["Why MSE?", "Outlier sensitivity?", "Alternatives considered?"],
            "Cross-entropy": ["Label smoothing?", "Class weights?", "Numerical stability?"],
            "MAE": ["Robustness need?", "Optimization impact?", "Evaluation alignment?"],
            "Huber loss": ["Delta parameter?", "Outlier profile?", "Convergence behavior?"],
            "Custom loss": ["Definition and gradients?", "Calibration to metrics?", "Debugging plan?"],
        },
        "Optimization Method": {
            "SGD": ["Momentum/nesterov?", "Learning rate schedule?", "Batch size?"],
            "Adam": ["Beta values?", "Weight decay?", "Warmup?"],
            "RMSProp": ["Decay rate?", "Centered variant?", "Stability?"],
            "Adagrad": ["Learning rate decay?", "Sparsity benefits?", "Reset strategy?"],
            "L-BFGS": ["Batching approach?", "Memory limits?", "Convergence criteria?"],
        },
        "Regularization": {
            "L1": ["Sparsity goal?", "Lambda value?", "Feature pruning?"],
            "L2": ["Weight decay?", "Overfit control?", "Interaction with optimizer?"],
            "Dropout": ["Rates per layer?", "Inference behavior?", "Co-adaptation risk?"],
            "Data augmentation": ["Which transforms?", "Label preservation?", "Distribution shift?"],
            "Early stopping": ["Patience metric?", "Min delta?", "Checkpoint policy?"],
        },
        "Training Monitoring": {
            "Loss curves": ["Smoothing?", "Train/val gap?", "Anomaly alerts?"],
            "Accuracy curves": ["Metric tracked?", "Class-wise trends?", "Plateau detection?"],
            "Validation metrics": ["Primary KPI?", "Reporting cadence?", "Confidence intervals?"],
            "Learning rate schedule": ["Schedule type?", "Boundaries?", "Warm restarts?"],
            "Checkpointing": ["Frequency?", "Best-vs-last?", "Storage budget?"],
        },
    },

    "Evaluation": {
        "Metric Selection": {
            "Accuracy": ["Is class balance fair?", "Threshold chosen?", "Business relevance?"],
            "Precision/Recall/F1": ["Which is primary?", "Threshold tuning?", "Cost of errors?"],
            "ROC AUC": ["Calibration issues?", "Class imbalance?", "Interpretation limits?"],
            "Log loss": ["Probability quality?", "Overconfidence penalty?", "Label noise?"],
            "MSE/RMSE": ["Scale sensitivity?", "Baseline comparison?", "Outlier impact?"],
        },
        "Test Data Strategy": {
            "Hold-out set": ["Size and representativeness?", "Temporal leakage?", "Reuse policy?"],
            "External dataset": ["Domain match?", "License/ethics?", "Reproducibility?"],
            "Cross-validation results": ["Variance across folds?", "Confidence bands?", "Selection bias?"],
            "Leave-one-out": ["Compute cost?", "Variance concerns?", "Use case fit?"],
            "Bootstrapping": ["Resample size?", "CI method?", "Stability?"],
        },
        "Fairness Checks": {
            "Demographic parity": ["Protected attributes?", "Gap tolerated?", "Mitigation plan?"],
            "Equalized odds": ["TPR/FPR parity?", "Group definitions?", "Trade-offs?"],
            "Calibration across groups": ["Expected vs observed?", "Bins and sizes?", "Recalibration?"],
            "Bias detection": ["Pre/post metrics?", "Data imbalance role?", "Human review?"],
            "Ethical review": ["Stakeholder impact?", "Transparency level?", "Documentation?"],
        },
        "Robustness Testing": {
            "Noisy input tests": ["Noise model?", "Degradation curve?", "Defenses?"],
            "Adversarial attacks": ["Threat model?", "Attack types?", "Detection/robustness?"],
            "Stress tests": ["Extreme values?", "Load/latency?", "Resource limits?"],
            "Distribution shift": ["Which shifts?", "Detection method?", "Adaptation strategy?"],
            "Random perturbations": ["Perturbation scale?", "Repeatability?", "Metric sensitivity?"],
        },
        "Model Interpretability": {
            "Feature importance": ["Method used?", "Stability across runs?", "Correlated features?"],
            "SHAP values": ["Background data?", "Runtime cost?", "Global vs local?"],
            "LIME explanations": ["Kernel width?", "Neighborhood size?", "Faithfulness?"],
            "Partial dependence plots": ["Feature interactions?", "ICE vs PDP?", "Monotonicity?"],
            "Counterfactual explanations": ["Feasible actions?", "Cost function?", "Recourse policy?"],
        },
    },
}
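
# --- Minimal rendering sketch (an assumption, not part of the original
# file): one way pipeline_data and the helpers above might be wired
# together, with one page per stage, categories as expanders, and the
# guiding questions as bullets. The starting page is a placeholder.
if __name__ == "__main__":
    if "page" not in st.session_state:
        st.session_state.page = "Data Collection"  # assumed starting page

    stage = st.session_state.page
    st.title(stage)

    for category, tasks in pipeline_data[stage].items():
        with st.expander(category):
            for task, questions in tasks.items():
                st.markdown(f"**{task}**")
                for question in questions:
                    st.markdown(f"- {question}")

    # Drive Previous/Next from the stage ordering in pipeline_data
    stages = list(pipeline_data)
    idx = stages.index(stage)
    add_navigation(
        stages[idx - 1] if idx > 0 else None,
        stages[idx + 1] if idx < len(stages) - 1 else None,
    )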