File size: 11,095 Bytes
3a29afc
d9e600b
3a29afc
 
 
 
 
 
 
 
 
7650f6e
1d745dd
ec10a30
49edf45
 
35aba11
 
 
 
 
 
 
 
 
 
7650f6e
 
5c6dc10
da4cc06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3232101
 
da4cc06
7650f6e
 
 
 
 
e7897d7
 
7e68df4
e7897d7
 
 
1d745dd
 
95bf9d2
 
 
 
 
 
 
 
 
1d745dd
61c745a
95bf9d2
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
1d745dd
 
 
95bf9d2
 
 
 
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
ff5e7ef
1d745dd
 
95bf9d2
 
 
 
 
 
 
 
 
ff5e7ef
95bf9d2
 
 
 
 
 
ff5e7ef
95bf9d2
 
 
 
 
 
ff5e7ef
61c745a
 
95bf9d2
 
 
 
 
ff5e7ef
95bf9d2
 
 
 
 
 
 
ff5e7ef
1d745dd
 
95bf9d2
 
 
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
 
1d745dd
61c745a
 
95bf9d2
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
ff5e7ef
1d745dd
 
95bf9d2
 
 
 
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
1d745dd
95bf9d2
 
 
 
 
 
 
ff5e7ef
 
95bf9d2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
import streamlit as st
import time

def go_to(page_name, from_callback=False):
    """Navigate the app to *page_name*.

    Records the target page in ``st.session_state`` and, unless invoked
    from a widget ``on_click`` callback (Streamlit reruns automatically
    after callbacks), forces an immediate rerun so the new page renders.
    """
    st.session_state.page = page_name
    if from_callback:
        # Callback context: Streamlit will rerun on its own.
        return
    st.rerun()

def add_navigation(previous_page, next_page):
    """Render Previous/Next navigation buttons plus a horizontal rule.

    Either target may be ``None``, in which case the corresponding
    button is omitted. Clicking a button navigates via :func:`go_to`.
    """
    left, _center, right = st.columns([1, 4, 1])

    if previous_page is not None and left.button("Previous"):
        go_to(previous_page)

    if next_page is not None and right.button("Next"):
        go_to(next_page)

    st.markdown("---")

def add_fadein_text(paragraphs):
    """Render a sequence of paragraphs that fade in via CSS.

    *paragraphs* is an iterable of ``(text, delay_seconds)`` pairs; the
    delay is emitted as a CSS ``animation-delay`` so the paragraphs
    appear staggered without blocking the script.
    """
    # Inject the fade-in keyframes and paragraph styling.
    fade_css = """
        <style>
        @keyframes fadeIn {
            from {opacity: 0;}
            to {opacity: 1;}
        }
        .fade-in {
            animation: fadeIn 1.5s ease-in forwards;
            opacity: 0;
            margin-bottom: 1em;
            font-size: 24px;
            color: #A6C8FF;
            text-align: left;
        }
        </style>
    """
    st.markdown(fade_css, unsafe_allow_html=True)

    # One placeholder per paragraph, filled in document order.
    slots = [st.empty() for _ in paragraphs]
    for slot, (text, delay) in zip(slots, paragraphs):
        slot.markdown(
            f"<div class='fade-in' style='animation-delay:{delay}s'>{text}</div>",
            unsafe_allow_html=True,
        )

def add_instruction_text(text_to_display):
    """Show centered gray helper text followed by a horizontal rule."""
    markup = f"<div style='text-align: center; font-size:18px; color:gray;'>{text_to_display}</div>"
    st.markdown(markup, unsafe_allow_html=True)
    st.markdown("---")

def add_red_text(text_to_display):
    """Render centered red (#c0392b) text with a trailing line break."""
    markup = f"<div style='text-align:center; color:#c0392b; font-size:18px;'>{text_to_display}<br></div>"
    st.markdown(markup, unsafe_allow_html=True)

# Hierarchical description of the ML pipeline rendered by the app.
# Shape: {stage_label: {"explain_text": str,
#                       step_label: {"explain_text": str,
#                                    "sub_decisions": [str, ...]},
#                       ...},
#         ...}
# The emoji-prefixed keys double as the display labels, so every key here
# is user-facing text. Fix below: "critical field are" -> "critical fields are".
pipeline_data = {
    "πŸ“₯ Data Collection": {
        "explain_text": "**πŸ“₯ Data Collection:** Decisions about what data to collect and how.",
        "πŸ“Š Data Sources": {
            "explain_text": "**πŸ“Š Data Sources:** What data sources will be used to collect data?",
            "sub_decisions": [
                "Collect existing dataset or new sensor data?",
                "Public datasets or Private datasets?",
                "Design Web Scraping or use APIs?"
            ]
        },
        "πŸ“œ Data Usage": {
            "explain_text": "**πŸ“œ Data Usage:** How should the data be used, given any license or permission constraints?",
            "sub_decisions": [
                "Ethical concerns to be addressed?",
                "Commercial use policies?",
                "Geographic limits?"
            ]
        },
        "🧹 Data Quality": {
            "explain_text": "**🧹 Data Quality:** What kind of quality checks are done to decide data collection?",
            "sub_decisions": [
                "Missing value checks to see if critical fields are affected?",
                "Potential duplicates?",
                "Format consistency and encoding issues?"
            ]
        },
        "🎲 Data Sampling": {
            "explain_text": "**🎲 Data Sampling:** How to sample from a potentially bigger data source?",
            "sub_decisions": [
                "Random sampling/stratified sampling/cluster sampling?",
                "Sample size?",
                "Potential imbalance?",
                "Additional synthetic data?"
            ]
        },
        "πŸ’Ύ Data Storage": {
            "explain_text": "**πŸ’Ύ Data Storage:** How and where to store the data?",
            "sub_decisions": [
                "Backup frequency?",
                "File format choice?"
            ]
        },
    },

    "βš™οΈ Data Processing": {
        "explain_text": "**βš™οΈ Data Processing:** Decisions about how to process and prepare the data.",
        "🧽 Data Cleaning": {
            "explain_text": "**🧽 Data Cleaning:** How should raw data be cleaned and standardized?",
            "sub_decisions": [
                "How to handle missing values?",
                "How to detect/remove duplicates?",
                "How to fix formatting errors?"
            ]
        },
        "🎯 Feature Selection": {
            "explain_text": "**🎯 Feature Selection:** Which features should be included in the model?",
            "sub_decisions": [
                "Manual vs automated selection?",
                "How to check for data leakage?",
                "Should dimensionality reduction be applied?"
            ]
        },
        "πŸ”§ Feature Engineering": {
            "explain_text": "**πŸ”§ Feature Engineering:** How to create or transform features for better performance?",
            "sub_decisions": [
                "What new features should be created?",
                "How to combine existing features?",
                "How to encode categorical variables?"
            ]
        },
        "🚨 Outlier Handling": {
            "explain_text": "**🚨 Outlier Handling:** How to deal with unusual or extreme data points?",
            "sub_decisions": [
                "Which detection method to use (Z-score, IQR, clustering)?",
                "Remove, cap, or keep outliers?"
            ]
        },
        "πŸ“ Data Scaling": {
            "explain_text": "**πŸ“ Data Scaling:** How to scale or transform features before modeling?",
            "sub_decisions": [
                "Should Min-Max or Standard scaling be applied?",
                "Is log or Box-Cox transformation needed?"
            ]
        }
    },

    "πŸ€– Model Selection": {
        "explain_text": "**πŸ€– Model Selection:** Decisions about which model to train and the hyperparameter choices.",
        "πŸ—οΈ Model Architecture": {
            "explain_text": "**πŸ—οΈ Model Architecture:** Which type of model is best suited to the problem?",
            "sub_decisions": [
                "Linear vs tree-based vs neural networks?",
                "How interpretable should the model be?",
                "What are computational constraints?"
            ]
        },
        "πŸ“‰ Baseline Model": {
            "explain_text": "**πŸ“‰ Baseline Model:** What simple models can set a performance baseline?",
            "sub_decisions": [
                "Should a logistic regression or decision tree be used?",
                "What baseline metric is most relevant?"
            ]
        },
        "🧠 Pre-trained Models": {
            "explain_text": "**🧠 Pre-trained Models:** Can existing models be leveraged?",
            "sub_decisions": [
                "Which pre-trained models are relevant (image, NLP, tabular)?",
                "Fine-tune or use as feature extractors?"
            ]
        },
        "⚑ Hyperparams": {
            "explain_text": "**⚑ Hyperparams:** How to optimize model hyperparameters?",
            "sub_decisions": [
                "Grid search vs random search vs Bayesian?",
                "How many trials and folds to run?",
                "What budget or time limit applies?"
            ]
        },
        "πŸ“¦ Model Complexity": {
            "explain_text": "**πŸ“¦ Model Complexity:** Is the model efficient enough for deployment?",
            "sub_decisions": [
                "How many parameters and FLOPs?",
                "What is memory usage and latency?",
                "Are there deployment constraints (edge vs cloud)?"
            ]
        }
    },

    "πŸ‹οΈ Model Training": {
        "explain_text": "**πŸ‹οΈ Model Training:** Decisions about the training algorithm used.",
        "βœ‚οΈ Data Splitting": {
            "explain_text": "**βœ‚οΈ Data Splitting:** How should data be divided for training and testing?",
            "sub_decisions": [
                "Train-test split ratio?",
                "Cross-validation vs stratified split?"
            ]
        },
        "βš–οΈ Loss Function": {
            "explain_text": "**βš–οΈ Loss Function:** Which loss function aligns with the task?",
            "sub_decisions": [
                "MSE vs MAE vs cross-entropy?",
                "Is robustness to outliers needed?",
                "Does it align with evaluation metrics?"
            ]
        },
        "πŸš€ Optimization": {
            "explain_text": "**πŸš€ Optimization:** Which optimization algorithm should be used?",
            "sub_decisions": [
                "SGD vs Adam vs RMSProp?",
                "What learning rate schedule?",
                "What batch size?"
            ]
        },
        "πŸ›‘οΈ Regularization": {
            "explain_text": "**πŸ›‘οΈ Regularization:** How to prevent overfitting?",
            "sub_decisions": [
                "L1 vs L2 regularization?",
                "Dropout rate?",
                "Should early stopping be applied?"
            ]
        },
        "πŸ“Š Training Monitoring": {
            "explain_text": "**πŸ“Š Training Monitoring:** How to track and manage training progress?",
            "sub_decisions": [
                "Which metrics should be monitored?",
                "How often to checkpoint models?"
            ]
        }
    },

    "πŸ“ˆ Model Evaluation": {
        "explain_text": "**πŸ“ˆ Model Evaluation:** Decisions about the evaluation criteria.",
        "πŸ“ Evaluation Metric": {
            "explain_text": "**πŸ“ Evaluation Metric:** Which metrics best reflect model performance?",
            "sub_decisions": [
                "Accuracy vs Precision/Recall/F1?",
                "How to handle class imbalance?",
                "Including probabilistic metrics (AUC, log loss)?"
            ]
        },
        "πŸ§ͺ Test Data": {
            "explain_text": "**πŸ§ͺ Test Data:** How should testing be performed?",
            "sub_decisions": [
                "Hold-out set vs cross-validation?",
                "An external test dataset?"
            ]
        },
        "βš–οΈ Fairness": {
            "explain_text": "**βš–οΈ Fairness:** How to ensure fairness across groups?",
            "sub_decisions": [
                "Which fairness metric to use (demographic parity, equalized odds)?",
                "How to detect bias in predictions?"
            ]
        },
        "πŸ› οΈ Robustness": {
            "explain_text": "**πŸ› οΈ Robustness:** How reliable is the model under stress?",
            "sub_decisions": [
                "How does the model handle noisy inputs?",
                "How to test against distribution shifts?"
            ]
        },
        "πŸ” Interpretability": {
            "explain_text": "**πŸ” Interpretability:** How understandable are the model predictions?",
            "sub_decisions": [
                "Which methods to use (feature importance, SHAP, LIME)?",
                "How stable are explanations?",
                "Are explanations actionable for stakeholders?"
            ]
        }
    }
}