import streamlit as st
import time
def go_to(page_name, from_callback=False):
    """Navigate to another page by updating session state.

    Parameters
    ----------
    page_name : str
        Value stored into ``st.session_state.page``; the main script is
        expected to dispatch rendering on this key.
    from_callback : bool, optional
        Pass ``True`` when this is invoked from a widget ``on_click``
        callback: Streamlit already schedules a rerun after callbacks,
        so calling ``st.rerun()`` there is unnecessary.
    """
    st.session_state.page = page_name
    if not from_callback:
        # Outside a callback we must trigger the rerun ourselves so the
        # newly selected page renders immediately.
        st.rerun()
def add_navigation(previous_page, next_page):
    """Render Previous/Next navigation buttons followed by a divider.

    Parameters
    ----------
    previous_page : str | None
        Page key for the "Previous" button; ``None`` omits the button.
    next_page : str | None
        Page key for the "Next" button; ``None`` omits the button.
    """
    # Wide middle column pushes the two buttons to opposite edges.
    col1, col2, col3 = st.columns([1, 4, 1])
    if previous_page is not None:
        with col1:
            if st.button("Previous"):
                go_to(previous_page)
    if next_page is not None:
        with col3:
            if st.button("Next"):
                go_to(next_page)
    st.markdown("---")
def add_fadein_text(paragraphs):
    """Reveal paragraphs sequentially with a CSS fade-in effect.

    Parameters
    ----------
    paragraphs : list[tuple[str, float]]
        Each item is ``(html_text, delay_seconds)``; ``delay_seconds``
        is how long to wait after showing that paragraph before the
        next one appears.
    """
    # NOTE(review): the original CSS rules were lost when this file was
    # extracted; the keyframes below restore a conventional fade-in —
    # confirm against the intended styling.
    st.markdown(
        """
        <style>
        @keyframes fadeIn { from { opacity: 0; } to { opacity: 1; } }
        .fade-in { animation: fadeIn 1s ease-in; }
        </style>
        """,
        unsafe_allow_html=True,
    )
    # One placeholder per paragraph so earlier paragraphs stay visible
    # while later ones are still hidden.
    placeholders = [st.empty() for _ in paragraphs]
    for placeholder, (text, delay) in zip(placeholders, paragraphs):
        # NOTE(review): the original HTML wrapper was stripped in
        # extraction; a .fade-in div matches the CSS above.
        placeholder.markdown(
            f'<div class="fade-in">{text}</div>',
            unsafe_allow_html=True,
        )
        time.sleep(delay)  # pause before revealing the next paragraph
def add_instruction_text(text_to_display):
    """Render instruction text (HTML allowed) followed by a divider.

    Parameters
    ----------
    text_to_display : str
        Markdown/HTML content to show above the divider.
    """
    # NOTE(review): any HTML wrapper the original had around the text
    # was lost in extraction; rendering the text directly preserves the
    # visible content — confirm against the intended styling.
    st.markdown(f"{text_to_display}", unsafe_allow_html=True)
    st.markdown("---")
def add_red_text(text_to_display):
    """Render text emphasized in red.

    Parameters
    ----------
    text_to_display : str
        Content to display; HTML is allowed.
    """
    # NOTE(review): the original inline markup was stripped during
    # extraction; a red span matches this function's name — confirm
    # against the intended styling.
    st.markdown(
        f'<span style="color:red">{text_to_display}</span>',
        unsafe_allow_html=True,
    )
# Define pipeline stages.
# Structure: stage name -> {"explain_text": str, <decision name>: {...}}.
# Each nested decision dict carries its own "explain_text" plus a
# "sub_decisions" list of question strings shown for that decision.
pipeline_data: dict[str, dict] = {
"Data Collection": {
"explain_text": "**Data Collection:** Decisions about what data to collect and how.",
"Data Sources": {
"explain_text": "**Data Sources:** What data sources will be used to collect data?",
"sub_decisions": ["Collect existing dataset or new sensor data?", "Public datasets or Private datasets?", "Design Web Scraping or use APIs?"]
},
"Data Usage": {
"explain_text": "**Data Usage:** How should the data be used, given any license or permission constraints?",
"sub_decisions": ["Ethical concerns to be addressed?", "Commercial use policies?", "Geographic limits?"]
},
"Data Quality": {
"explain_text": "**Data Quality:** What kind of quality checks are done to decide data collection?",
"sub_decisions": ["Missing value checks to see if critical field are affected?", "Potential duplicates?", "Format consistency and encoding issues?"]
},
"Data Sampling": {
"explain_text": "**Data Sampling:** How to sample from a potentially bigger data source?",
"sub_decisions": ["Random sampling/stratified sampling/cluster sampling?", "Sample size?", "Potential imbalance?", "Additional synthetic data?"]
},
"Data Storage": {
"explain_text": "**Data Storage:** How and where to store the data?",
"sub_decisions": ["Backup frequency?", "File format choice?"]
},
},
"Data Processing": {
"explain_text": "**Data Processing:** Decisions about how to process and prepare the data.",
"Data Cleaning": {
"explain_text": "**Data Cleaning:** How should raw data be cleaned and standardized?",
"sub_decisions": ["How to handle missing values?", "How to detect/remove duplicates?", "How to fix formatting errors?"]
},
"Feature Selection": {
"explain_text": "**Feature Selection:** Which features should be included in the model?",
"sub_decisions": ["Manual vs automated selection?", "How to check for data leakage?", "Should dimensionality reduction be applied?"]
},
"Feature Engineering": {
"explain_text": "**Feature Engineering:** How to create or transform features for better performance?",
"sub_decisions": ["What new features should be created?", "How to combine existing features?", "How to encode categorical variables?"]
},
"Outlier Handling": {
"explain_text": "**Outlier Handling:** How to deal with unusual or extreme data points?",
"sub_decisions": ["Which detection method to use (Z-score, IQR, clustering)?", "Remove, cap, or keep outliers?"]
},
"Data Scaling": {
"explain_text": "**Data Scaling:** How to scale or transform features before modeling?",
"sub_decisions": ["Should Min-Max or Standard scaling be applied?", "Is log or Box-Cox transformation needed?"]
}
},
"Model Selection": {
"explain_text": "**Model Selection:** Decisions about which model to train and the hyperparameter choices.",
"Model Architecture": {
"explain_text": "**Model Architecture:** Which type of model is best suited to the problem?",
"sub_decisions": ["Linear vs tree-based vs neural networks?", "How interpretable should the model be?", "What are computational constraints?"]
},
"Baseline Model": {
"explain_text": "**Baseline Model:** What simple models can set a performance baseline?",
"sub_decisions": ["Should a logistic regression or decision tree be used?", "What baseline metric is most relevant?"]
},
"Pre-trained Models": {
"explain_text": "**Pre-trained Models:** Can existing models be leveraged?",
"sub_decisions": ["Which pre-trained models are relevant (image, NLP, tabular)?", "Fine-tune or use as feature extractors?"]
},
"Hyperparameters": {
"explain_text": "**Hyperparameters:** How to optimize model hyperparameters?",
"sub_decisions": ["Grid search vs random search vs Bayesian?", "How many trials and folds to run?", "What budget or time limit applies?"]
},
"Model Complexity": {
"explain_text": "**Model Complexity:** Is the model efficient enough for deployment?",
"sub_decisions": ["How many parameters and FLOPs?", "What is memory usage and latency?", "Are there deployment constraints (edge vs cloud)?"]
}
},
"Model Training": {
"explain_text": "**Model Training:** Decisions about the training algorithm used.",
"Data Splitting": {
"explain_text": "**Data Splitting:** How should data be divided for training and testing?",
"sub_decisions": ["Train-test split ratio?", "Cross-validation vs stratified split?"]
},
"Loss Function": {
"explain_text": "**Loss Function:** Which loss function aligns with the task?",
"sub_decisions": ["MSE vs MAE vs cross-entropy?", "Is robustness to outliers needed?", "Does it align with evaluation metrics?"]
},
"Optimization Method": {
"explain_text": "**Optimization Method:** Which optimization algorithm should be used?",
"sub_decisions": ["SGD vs Adam vs RMSProp?", "What learning rate schedule?", "What batch size?"]
},
"Regularization": {
"explain_text": "**Regularization:** How to prevent overfitting?",
"sub_decisions": ["L1 vs L2 regularization?", "Dropout rate?", "Should early stopping be applied?"]
},
"Training Monitoring": {
"explain_text": "**Training Monitoring:** How to track and manage training progress?",
"sub_decisions": ["Which metrics should be monitored?", "How often to checkpoint models?"]
}
},
"Model Evaluation": {
"explain_text": "**Model Evaluation:** Decisions about the evaluation criteria.",
"Evaluation Metric": {
"explain_text": "**Evaluation Metric:** Which metrics best reflect model performance?",
"sub_decisions": ["Accuracy vs Precision/Recall/F1?", "How to handle class imbalance?", "Including probabilistic metrics (AUC, log loss)?"]
},
"Test Data": {
"explain_text": "**Test Data:** How should testing be performed?",
"sub_decisions": ["Hold-out set vs cross-validation?", "An external test dataset?"]
},
"Fairness": {
"explain_text": "**Fairness:** How to ensure fairness across groups?",
"sub_decisions": ["Which fairness metric to use (demographic parity, equalized odds)?", "How to detect bias in predictions?"]
},
"Robustness": {
"explain_text": "**Robustness:** How reliable is the model under stress?",
"sub_decisions": ["How does the model handle noisy inputs?", "How to test against distribution shifts?"]
},
"Interpretability": {
"explain_text": "**Interpretability:** How understandable are the model predictions?",
"sub_decisions": ["Which methods to use (feature importance, SHAP, LIME)?", "How stable are explanations?", "Are explanations actionable for stakeholders?"]
}
}
}