Spaces:
Sleeping
Sleeping
Update src/data_loader.py
Browse files- src/data_loader.py +53 -29
src/data_loader.py
CHANGED
|
@@ -2,52 +2,76 @@
|
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import os
|
| 5 |
-
from datasets import load_dataset
|
| 6 |
-
import kagglehub
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
@st.cache_data(show_spinner=
|
| 13 |
-
def
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
@st.cache_data(show_spinner=
|
| 17 |
-
def
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
source = st.sidebar.selectbox(
|
| 25 |
-
"
|
| 26 |
-
["Upload CSV", "Hugging Face", "Kaggle"]
|
|
|
|
| 27 |
)
|
| 28 |
|
| 29 |
df = None
|
| 30 |
|
| 31 |
if source == "Upload CSV":
|
| 32 |
-
file = st.sidebar.file_uploader("
|
| 33 |
-
if file
|
| 34 |
df = load_csv(file)
|
| 35 |
|
| 36 |
elif source == "Hugging Face":
|
| 37 |
-
path = st.sidebar.text_input("Dataset
|
| 38 |
-
split = st.sidebar.text_input("Split", "train")
|
| 39 |
-
if st.sidebar.button("
|
| 40 |
df = load_hf(path, split)
|
| 41 |
|
| 42 |
elif source == "Kaggle":
|
| 43 |
-
kp = st.sidebar.text_input("Kaggle
|
| 44 |
-
fn = st.sidebar.text_input("
|
| 45 |
-
if st.sidebar.button("
|
| 46 |
df = load_kaggle(kp, fn)
|
| 47 |
|
|
|
|
| 48 |
if df is not None:
|
| 49 |
-
st.session_state.original_df
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
import os
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
# Try importing optional dependencies
|
| 7 |
+
try:
|
| 8 |
+
from datasets import load_dataset
|
| 9 |
+
import kagglehub
|
| 10 |
+
except ImportError:
|
| 11 |
+
pass # Handle usage inside functions
|
| 12 |
|
| 13 |
+
@st.cache_data(show_spinner="Loading CSV file...", ttl=3600)
def load_csv(file) -> "pd.DataFrame | None":
    """Parse an uploaded CSV file into a DataFrame.

    Parameters
    ----------
    file
        File-like object, e.g. the value returned by ``st.file_uploader``.

    Returns
    -------
    The parsed DataFrame with a clean ``RangeIndex``, or ``None`` when
    parsing fails (the error is surfaced in the Streamlit UI instead of
    raising).
    """
    try:
        df = pd.read_csv(file)
        # Guarantee a fresh 0..n-1 index; avoid inplace mutation of the
        # object that ends up in Streamlit's cache.
        return df.reset_index(drop=True)
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: any parse/IO
        # failure becomes a visible error message, not an app crash.
        st.error(f"Failed to load CSV: {e}")
        return None
|
| 22 |
|
| 23 |
+
@st.cache_data(show_spinner="Downloading from Hugging Face...", ttl=3600)
def load_hf(path: str, split: str) -> "pd.DataFrame | None":
    """Download a Hugging Face dataset split and convert it to a DataFrame.

    Parameters
    ----------
    path
        Hub dataset identifier (e.g. ``"iris"`` or ``"user/dataset"``).
    split
        Split name to fetch (e.g. ``"train"``).

    Returns
    -------
    The split as a DataFrame, or ``None`` on failure (error shown in the UI).
    """
    try:
        ds = load_dataset(path, split=split)
        return pd.DataFrame(ds)
    except NameError:
        # `datasets` is an optional dependency guarded by a try/except at
        # import time; without this branch a missing package surfaces as a
        # cryptic "name 'load_dataset' is not defined" message.
        st.error("Failed to load HF dataset: the 'datasets' package is not installed")
        return None
    except Exception as e:
        st.error(f"Failed to load HF dataset: {e}")
        return None
|
| 32 |
|
| 33 |
+
@st.cache_data(show_spinner="Downloading from Kaggle...", ttl=3600)
def load_kaggle(kaggle_path: str, file_name: str) -> "pd.DataFrame | None":
    """Download a Kaggle dataset and read one CSV file out of it.

    Parameters
    ----------
    kaggle_path
        Kaggle dataset identifier in ``user/dataset`` form.
    file_name
        Name of the CSV file inside the downloaded dataset directory.

    Returns
    -------
    The parsed DataFrame, or ``None`` on failure (error shown in the UI).
    """
    try:
        path = kagglehub.dataset_download(kaggle_path)
        full_path = os.path.join(path, file_name)
        # Fail with a precise message when the requested file is absent,
        # instead of whatever FileNotFoundError read_csv would produce.
        if not os.path.isfile(full_path):
            st.error(f"Failed to load Kaggle dataset: '{file_name}' not found in downloaded dataset")
            return None
        return pd.read_csv(full_path)
    except NameError:
        # `kagglehub` is an optional dependency guarded by a try/except at
        # import time; give an actionable message instead of a NameError.
        st.error("Failed to load Kaggle dataset: the 'kagglehub' package is not installed")
        return None
    except Exception as e:
        st.error(f"Failed to load Kaggle dataset: {e}")
        return None
|
| 42 |
|
| 43 |
+
def dataset_sidebar():
    """Render the sidebar data-ingestion controls and load the chosen dataset.

    Lets the user pick a source (uploaded CSV, Hugging Face, or Kaggle),
    loads it via the matching ``load_*`` helper, and on success stores the
    result in ``st.session_state.original_df`` / ``processed_df``.
    """
    st.sidebar.header("1️⃣ Data Ingestion")

    source = st.sidebar.selectbox(
        "Source Type",
        ["Upload CSV", "Hugging Face", "Kaggle"],
        help="Select the source of your dataset."
    )

    df = None

    if source == "Upload CSV":
        file = st.sidebar.file_uploader("Drop CSV Here", type=["csv"])
        if file:
            df = load_csv(file)

    elif source == "Hugging Face":
        path = st.sidebar.text_input("Dataset Path (e.g., 'iris')", "iris")
        split = st.sidebar.text_input("Split (e.g., 'train')", "train")
        if st.sidebar.button("Fetch HF Dataset"):
            df = load_hf(path, split)

    elif source == "Kaggle":
        kp = st.sidebar.text_input("Kaggle Path (user/dataset)")
        fn = st.sidebar.text_input("CSV Filename inside dataset")
        if st.sidebar.button("Fetch Kaggle Dataset"):
            df = load_kaggle(kp, fn)

    # State Update Logic
    if df is not None:
        # Use .get() so a first run without prior initialization of
        # session_state does not raise AttributeError; only refresh state
        # when the newly loaded data actually differs from what we hold.
        previous = st.session_state.get("original_df")
        if previous is None or not df.equals(previous):
            st.session_state.original_df = df.copy()
            # Initialize processed_df as a copy of original
            st.session_state.processed_df = df.copy()
            st.sidebar.success(f"Loaded: {df.shape[0]} rows, {df.shape[1]} cols")
|