# Benchmark-Kit-26 / src/data_loader.py  (rev c771784)
# data_loader.py
import os
from typing import Optional

import pandas as pd
import streamlit as st
# Try importing optional dependencies
try:
from datasets import load_dataset
import kagglehub
except ImportError:
pass # Handle usage inside functions
@st.cache_data(show_spinner="Loading CSV file...", ttl=3600)
def load_csv(file) -> Optional[pd.DataFrame]:
    """Parse an uploaded CSV file into a DataFrame.

    Parameters
    ----------
    file : file-like
        An open file-like object (e.g. a Streamlit ``UploadedFile``).

    Returns
    -------
    Optional[pd.DataFrame]
        The parsed frame with a clean 0..n-1 index, or ``None`` if
        parsing failed (the error is reported via ``st.error``).
    """
    try:
        df = pd.read_csv(file)
    except Exception as e:  # surface any parse/IO failure in the UI instead of crashing
        st.error(f"Failed to load CSV: {e}")
        return None
    # Normalize to a default RangeIndex so downstream steps can rely on it.
    df.reset_index(drop=True, inplace=True)
    return df
@st.cache_data(show_spinner="Downloading from Hugging Face...", ttl=3600)
def load_hf(path: str, split: str) -> Optional[pd.DataFrame]:
    """Download a Hugging Face dataset split and convert it to a DataFrame.

    Parameters
    ----------
    path : str
        Hub dataset identifier (e.g. ``'iris'`` or ``'user/dataset'``).
    split : str
        Split name to fetch (e.g. ``'train'``).

    Returns
    -------
    Optional[pd.DataFrame]
        The split as a DataFrame, or ``None`` on failure (reported via
        ``st.error``).
    """
    try:
        ds = load_dataset(path, split=split)
        return pd.DataFrame(ds)
    except NameError:
        # The optional `datasets` import at the top of the file is swallowed
        # on ImportError; give a clear message instead of a NameError dump.
        st.error("Failed to load HF dataset: the 'datasets' package is not installed")
        return None
    except Exception as e:
        st.error(f"Failed to load HF dataset: {e}")
        return None
@st.cache_data(show_spinner="Downloading from Kaggle...", ttl=3600)
def load_kaggle(kaggle_path: str, file_name: str) -> Optional[pd.DataFrame]:
    """Download a Kaggle dataset and read one CSV file from it.

    Parameters
    ----------
    kaggle_path : str
        Kaggle dataset handle in ``user/dataset`` form.
    file_name : str
        Name of the CSV file inside the downloaded dataset directory.

    Returns
    -------
    Optional[pd.DataFrame]
        The parsed CSV, or ``None`` on failure (reported via ``st.error``).
    """
    try:
        dataset_dir = kagglehub.dataset_download(kaggle_path)
        return pd.read_csv(os.path.join(dataset_dir, file_name))
    except NameError:
        # The optional `kagglehub` import is swallowed on ImportError at the
        # top of the file; report the missing dependency explicitly.
        st.error("Failed to load Kaggle dataset: the 'kagglehub' package is not installed")
        return None
    except Exception as e:
        st.error(f"Failed to load Kaggle dataset: {e}")
        return None
def dataset_sidebar():
    """Render the data-ingestion sidebar and update session state.

    Lets the user load a DataFrame from an uploaded CSV, a Hugging Face
    dataset, or a Kaggle dataset. On a successful load, stores the frame
    in ``st.session_state.original_df`` and a working copy in
    ``st.session_state.processed_df``, then shows a success message.
    """
    st.sidebar.header("1️⃣ Data Ingestion")
    source = st.sidebar.selectbox(
        "Source Type",
        ["Upload CSV", "Hugging Face", "Kaggle"],
        help="Select the source of your dataset.",
    )

    df = None
    if source == "Upload CSV":
        file = st.sidebar.file_uploader("Drop CSV Here", type=["csv"])
        if file:
            df = load_csv(file)
    elif source == "Hugging Face":
        path = st.sidebar.text_input("Dataset Path (e.g., 'iris')", "iris")
        split = st.sidebar.text_input("Split (e.g., 'train')", "train")
        if st.sidebar.button("Fetch HF Dataset"):
            df = load_hf(path, split)
    elif source == "Kaggle":
        kp = st.sidebar.text_input("Kaggle Path (user/dataset)")
        fn = st.sidebar.text_input("CSV Filename inside dataset")
        if st.sidebar.button("Fetch Kaggle Dataset"):
            df = load_kaggle(kp, fn)

    # State Update Logic: only overwrite stored frames when new data arrived.
    if df is not None:
        # .get() avoids an AttributeError if the host app never seeded
        # 'original_df' in session state before this sidebar runs.
        current = st.session_state.get("original_df")
        if current is None or not df.equals(current):
            st.session_state.original_df = df.copy()
            # Initialize processed_df as a copy of original
            st.session_state.processed_df = df.copy()
        st.sidebar.success(f"Loaded: {df.shape[0]} rows, {df.shape[1]} cols")