# data_loader.py
"""Dataset ingestion helpers and the sidebar UI that drives them.

Three sources are supported: an uploaded CSV file, a Hugging Face dataset,
and a CSV file inside a Kaggle dataset. Each public loader returns a
``pandas.DataFrame`` on success, or reports the problem with ``st.error``
and returns ``None``.
"""
import os
from typing import Optional

import pandas as pd
import streamlit as st

# Optional dependencies are imported independently so that a missing
# ``datasets`` package does not also disable Kaggle support (and vice versa).
# A missing package leaves its name bound to None; the loaders check for that.
try:
    from datasets import load_dataset
except ImportError:
    load_dataset = None

try:
    import kagglehub
except ImportError:
    kagglehub = None


# The cached helpers below deliberately let exceptions propagate: a failed
# attempt is then not stored in the cache, so the user can retry right away
# instead of getting the same cached failure back until the TTL expires.


@st.cache_data(show_spinner="Loading CSV file...", ttl=3600)
def _read_csv(file) -> pd.DataFrame:
    """Parse *file* with ``pd.read_csv`` and return it with a fresh index."""
    df = pd.read_csv(file)
    df.reset_index(drop=True, inplace=True)
    return df


@st.cache_data(show_spinner="Downloading from Hugging Face...", ttl=3600)
def _fetch_hf(path: str, split: str) -> pd.DataFrame:
    """Load one split of a Hugging Face dataset into a DataFrame."""
    ds = load_dataset(path, split=split)
    return pd.DataFrame(ds)


@st.cache_data(show_spinner="Downloading from Kaggle...", ttl=3600)
def _fetch_kaggle(kaggle_path: str, file_name: str) -> pd.DataFrame:
    """Download a Kaggle dataset and read *file_name* from it as CSV."""
    path = kagglehub.dataset_download(kaggle_path)
    return pd.read_csv(os.path.join(path, file_name))


def load_csv(file) -> Optional[pd.DataFrame]:
    """Read a CSV into a DataFrame.

    Args:
        file: Anything ``pd.read_csv`` accepts (the sidebar passes the
            object returned by ``st.file_uploader``).

    Returns:
        The parsed DataFrame, or ``None`` if reading failed (the error is
        shown with ``st.error``).
    """
    try:
        return _read_csv(file)
    except Exception as e:
        st.error(f"Failed to load CSV: {e}")
        return None


def load_hf(path: str, split: str) -> Optional[pd.DataFrame]:
    """Load a Hugging Face dataset split as a DataFrame.

    Args:
        path: Dataset identifier passed to ``datasets.load_dataset``.
        split: Split name passed as ``split=``.

    Returns:
        The DataFrame, or ``None`` if the ``datasets`` package is missing
        or loading failed (the error is shown with ``st.error``).
    """
    if load_dataset is None:
        st.error(
            "Failed to load HF dataset: the 'datasets' package is not installed"
        )
        return None
    try:
        return _fetch_hf(path, split)
    except Exception as e:
        st.error(f"Failed to load HF dataset: {e}")
        return None


def load_kaggle(kaggle_path: str, file_name: str) -> Optional[pd.DataFrame]:
    """Download a Kaggle dataset and read one CSV file from it.

    Args:
        kaggle_path: Dataset handle passed to ``kagglehub.dataset_download``.
        file_name: Path of the CSV file relative to the download directory.

    Returns:
        The DataFrame, or ``None`` if the ``kagglehub`` package is missing
        or loading failed (the error is shown with ``st.error``).
    """
    if kagglehub is None:
        st.error(
            "Failed to load Kaggle dataset: the 'kagglehub' package is not installed"
        )
        return None
    try:
        return _fetch_kaggle(kaggle_path, file_name)
    except Exception as e:
        st.error(f"Failed to load Kaggle dataset: {e}")
        return None


# The public loaders used to be the cached functions themselves; keep their
# ``.clear()`` hook available for any caller that invalidates the cache.
load_csv.clear = _read_csv.clear
load_hf.clear = _fetch_hf.clear
load_kaggle.clear = _fetch_kaggle.clear


def dataset_sidebar():
    """Render the data-ingestion sidebar and store the loaded dataset.

    When a dataset is loaded and differs from the one already held in
    ``st.session_state.original_df``, both ``original_df`` and
    ``processed_df`` are replaced with independent copies of it.
    """
    st.sidebar.header("1️⃣ Data Ingestion")
    source = st.sidebar.selectbox(
        "Source Type",
        ["Upload CSV", "Hugging Face", "Kaggle"],
        help="Select the source of your dataset.",
    )

    df = None
    if source == "Upload CSV":
        file = st.sidebar.file_uploader("Drop CSV Here", type=["csv"])
        if file:
            df = load_csv(file)
    elif source == "Hugging Face":
        path = st.sidebar.text_input("Dataset Path (e.g., 'iris')", "iris")
        split = st.sidebar.text_input("Split (e.g., 'train')", "train")
        if st.sidebar.button("Fetch HF Dataset"):
            path, split = path.strip(), split.strip()
            if path and split:
                df = load_hf(path, split)
            else:
                # Skip the network call when a required field is blank.
                st.sidebar.warning("Enter both a dataset path and a split.")
    elif source == "Kaggle":
        kp = st.sidebar.text_input("Kaggle Path (user/dataset)")
        fn = st.sidebar.text_input("CSV Filename inside dataset")
        if st.sidebar.button("Fetch Kaggle Dataset"):
            kp, fn = kp.strip(), fn.strip()
            if kp and fn:
                df = load_kaggle(kp, fn)
            else:
                # Skip the download when a required field is blank.
                st.sidebar.warning("Enter both a Kaggle path and a CSV filename.")

    # State update logic
    if df is None:
        return

    # ``.get`` tolerates a session in which ``original_df`` was never
    # initialised; attribute access would raise in that case.
    current = st.session_state.get("original_df")
    if current is None or not df.equals(current):
        st.session_state.original_df = df.copy()
        # Initialize processed_df as a copy of original
        st.session_state.processed_df = df.copy()
    st.sidebar.success(f"Loaded: {df.shape[0]} rows, {df.shape[1]} cols")