Spaces:

asenturisk
/

Benchmark-Kit-26

Sleeping

File size: 2,574 Bytes

# data_loader.py
import streamlit as st
import pandas as pd
import os

# Try importing optional dependencies
try:
    from datasets import load_dataset
    import kagglehub
except ImportError:
    pass # Handle usage inside functions

def _rewind_stream(stream) -> None:
    """Move *stream* back to offset 0 when it supports seeking; otherwise do nothing."""
    seek = getattr(stream, "seek", None)
    if not callable(seek):
        return  # e.g. a plain path string
    try:
        seek(0)
    except (OSError, ValueError):
        # Non-seekable or already-closed stream: rewinding is best-effort only,
        # the subsequent read (if any) will surface the real problem.
        pass


@st.cache_data(show_spinner="Loading CSV file...", ttl=3600)
def load_csv(file) -> "pd.DataFrame | None":
    """Parse a CSV source into a DataFrame with a fresh positional index.

    Args:
        file: Anything ``pd.read_csv`` accepts; in this module it is the
            object returned by ``st.sidebar.file_uploader``.

    Returns:
        The parsed DataFrame, or ``None`` when parsing failed (the failure is
        reported to the user through ``st.error``).
    """
    try:
        # A file-like object that was read before is positioned at EOF, which
        # would make a second parse of the same object fail as "empty".
        _rewind_stream(file)
        df = pd.read_csv(file)
        df.reset_index(drop=True, inplace=True)
        return df
    except Exception as e:
        st.error(f"Failed to load CSV: {e}")
        return None
    finally:
        # Leave the stream where we found a fresh upload: at the start, so
        # later readers of the same object are not handed an exhausted stream.
        _rewind_stream(file)

@st.cache_data(show_spinner="Downloading from Hugging Face...", ttl=3600)
def _fetch_hf_frame(path: str, split: str) -> pd.DataFrame:
    """Download one split of a Hugging Face dataset as a DataFrame; raises on failure."""
    # Imported here so a missing optional dependency surfaces as a clear
    # "No module named ..." error instead of a NameError on an unbound name.
    from datasets import load_dataset

    ds = load_dataset(path, split=split)
    return pd.DataFrame(ds)


def load_hf(path: str, split: str) -> "pd.DataFrame | None":
    """Load a Hugging Face dataset split into a DataFrame.

    Only successful downloads are cached (for one hour). The error handling
    lives outside the cached helper on purpose: if the ``None`` failure result
    were produced inside the cached function it would be stored as well, and a
    transient network error could not be retried with the same arguments until
    the cache entry expired.

    Args:
        path: Dataset identifier passed to ``datasets.load_dataset``.
        split: Split name passed as ``split=`` to ``datasets.load_dataset``.

    Returns:
        The dataset as a DataFrame, or ``None`` when loading failed (the
        failure is reported to the user through ``st.error``).
    """
    try:
        return _fetch_hf_frame(path, split)
    except Exception as e:
        st.error(f"Failed to load HF dataset: {e}")
        return None


# Keep the cache-management handle that callers of the previously decorated
# function could use (``load_hf.clear()``).
load_hf.clear = _fetch_hf_frame.clear

@st.cache_data(show_spinner="Downloading from Kaggle...", ttl=3600)
def _fetch_kaggle_frame(kaggle_path: str, file_name: str) -> pd.DataFrame:
    """Download a Kaggle dataset and read one CSV from it; raises on failure."""
    # Imported here so this loader does not depend on the module-level optional
    # import block, where a failing ``datasets`` import skips ``kagglehub`` too.
    import kagglehub

    download_dir = kagglehub.dataset_download(kaggle_path)
    return pd.read_csv(os.path.join(download_dir, file_name))


def load_kaggle(kaggle_path: str, file_name: str) -> "pd.DataFrame | None":
    """Load a CSV file from a Kaggle dataset into a DataFrame.

    Only successful loads are cached (for one hour). The error handling lives
    outside the cached helper on purpose: a ``None`` failure result produced
    inside the cached function would be stored too, so a transient failure
    could not be retried with the same arguments until the entry expired.

    Args:
        kaggle_path: Dataset handle passed to ``kagglehub.dataset_download``.
        file_name: Name of the CSV file, joined onto the download directory.

    Returns:
        The CSV contents as a DataFrame, or ``None`` when loading failed (the
        failure is reported to the user through ``st.error``).
    """
    try:
        return _fetch_kaggle_frame(kaggle_path, file_name)
    except Exception as e:
        st.error(f"Failed to load Kaggle dataset: {e}")
        return None


# Keep the cache-management handle that callers of the previously decorated
# function could use (``load_kaggle.clear()``).
load_kaggle.clear = _fetch_kaggle_frame.clear

def dataset_sidebar():
    """Render the data-ingestion sidebar and store a newly loaded dataset.

    Shows a source selector (CSV upload, Hugging Face, Kaggle) with the inputs
    for the chosen source. When a dataset is loaded during this run and differs
    from ``st.session_state["original_df"]`` (or none is stored yet), both
    ``original_df`` and ``processed_df`` in the session state are replaced with
    independent copies of it and a success message is shown in the sidebar.
    """
    st.sidebar.header("1️⃣ Data Ingestion")

    source = st.sidebar.selectbox(
        "Source Type",
        ["Upload CSV", "Hugging Face", "Kaggle"],
        help="Select the source of your dataset."
    )

    df = None

    if source == "Upload CSV":
        file = st.sidebar.file_uploader("Drop CSV Here", type=["csv"])
        if file:
            df = load_csv(file)

    elif source == "Hugging Face":
        path = st.sidebar.text_input("Dataset Path (e.g., 'iris')", "iris")
        split = st.sidebar.text_input("Split (e.g., 'train')", "train")
        if st.sidebar.button("Fetch HF Dataset"):
            df = load_hf(path, split)

    elif source == "Kaggle":
        kp = st.sidebar.text_input("Kaggle Path (user/dataset)")
        fn = st.sidebar.text_input("CSV Filename inside dataset")
        if st.sidebar.button("Fetch Kaggle Dataset"):
            df = load_kaggle(kp, fn)

    # Nothing was loaded on this run (no upload, no button press, or a failed
    # load): leave whatever is already in the session state untouched.
    if df is None:
        return

    # ``.get`` instead of attribute access: the key may not have been
    # initialised yet, and attribute access would raise AttributeError then.
    previous = st.session_state.get("original_df")
    if previous is None or not df.equals(previous):
        st.session_state.original_df = df.copy()
        # The working copy starts out identical to the original.
        st.session_state.processed_df = df.copy()
        st.sidebar.success(f"Loaded: {df.shape[0]} rows, {df.shape[1]} cols")