dwmk committed on
Commit
c771784
·
verified ·
1 Parent(s): 963afa9

Update src/data_loader.py

Browse files
Files changed (1) hide show
  1. src/data_loader.py +53 -29
src/data_loader.py CHANGED
@@ -2,52 +2,76 @@
2
  import streamlit as st
3
  import pandas as pd
4
  import os
5
- from datasets import load_dataset
6
- import kagglehub
7
 
8
- @st.cache_data(show_spinner=False)
9
- def load_csv(file):
10
- return pd.read_csv(file)
 
 
 
11
 
12
- @st.cache_data(show_spinner=False)
13
- def load_hf(path, split):
14
- return pd.DataFrame(load_dataset(path, split=split))
 
 
 
 
 
 
15
 
16
- @st.cache_data(show_spinner=False)
17
- def load_kaggle(kaggle_path, file_name):
18
- path = kagglehub.dataset_download(kaggle_path)
19
- return pd.read_csv(os.path.join(path, file_name))
 
 
 
 
 
20
 
21
- def dataset_sidebar():
22
- st.sidebar.header("1️⃣ Dataset Loader")
 
 
 
 
 
 
 
23
 
 
 
 
24
  source = st.sidebar.selectbox(
25
- "Data Source",
26
- ["Upload CSV", "Hugging Face", "Kaggle"]
 
27
  )
28
 
29
  df = None
30
 
31
  if source == "Upload CSV":
32
- file = st.sidebar.file_uploader("Upload CSV", type=["csv"])
33
- if file and st.sidebar.button("Load CSV"):
34
  df = load_csv(file)
35
 
36
  elif source == "Hugging Face":
37
- path = st.sidebar.text_input("Dataset name", "iris")
38
- split = st.sidebar.text_input("Split", "train")
39
- if st.sidebar.button("Load HF Dataset"):
40
  df = load_hf(path, split)
41
 
42
  elif source == "Kaggle":
43
- kp = st.sidebar.text_input("Kaggle path")
44
- fn = st.sidebar.text_input("File name")
45
- if st.sidebar.button("Load Kaggle Dataset"):
46
  df = load_kaggle(kp, fn)
47
 
 
48
  if df is not None:
49
- st.session_state.original_df = df
50
- st.session_state.processed_df = df.copy()
51
- st.success("Dataset loaded")
52
-
53
- return df
 
2
import os
from typing import Optional

import pandas as pd
import streamlit as st

# Optional dependencies: the Hugging Face and Kaggle loaders degrade
# gracefully when these are missing — a failed call is reported through
# the loaders' own except blocks rather than crashing at import time.
try:
    from datasets import load_dataset
    import kagglehub
except ImportError:
    pass  # Handle usage inside functions
12
 
13
@st.cache_data(show_spinner="Loading CSV file...", ttl=3600)
def load_csv(file) -> Optional[pd.DataFrame]:
    """Read an uploaded CSV file into a DataFrame.

    Parameters
    ----------
    file : file-like
        The uploaded CSV (e.g. a Streamlit ``UploadedFile``).

    Returns
    -------
    Optional[pd.DataFrame]
        Parsed data with a contiguous RangeIndex, or ``None`` when
        reading fails (the error is surfaced in the Streamlit UI).
    """
    try:
        # Keep the try body minimal: only the read can raise here.
        df = pd.read_csv(file)
    except Exception as e:  # broad on purpose: any parse error goes to the UI
        st.error(f"Failed to load CSV: {e}")
        return None
    # Drop any index carried in from the file so downstream code can
    # rely on a clean 0..n-1 RangeIndex.
    df.reset_index(drop=True, inplace=True)
    return df
22
 
23
@st.cache_data(show_spinner="Downloading from Hugging Face...", ttl=3600)
def load_hf(path: str, split: str) -> Optional[pd.DataFrame]:
    """Download a Hugging Face dataset split and convert it to a DataFrame.

    Parameters
    ----------
    path : str
        Dataset identifier on the Hugging Face Hub (e.g. ``"iris"``).
    split : str
        Split name to fetch (e.g. ``"train"``).

    Returns
    -------
    Optional[pd.DataFrame]
        The split as a DataFrame, or ``None`` on failure (the error —
        including a missing ``datasets`` install — is shown in the UI).
    """
    try:
        ds = load_dataset(path, split=split)
        return pd.DataFrame(ds)
    except Exception as e:  # network/auth/parse errors all reported to the UI
        st.error(f"Failed to load HF dataset: {e}")
        return None
32
 
33
@st.cache_data(show_spinner="Downloading from Kaggle...", ttl=3600)
def load_kaggle(kaggle_path: str, file_name: str) -> Optional[pd.DataFrame]:
    """Download a Kaggle dataset and read one CSV file from it.

    Parameters
    ----------
    kaggle_path : str
        Kaggle dataset path in ``user/dataset`` form.
    file_name : str
        Name of the CSV file inside the downloaded dataset directory.

    Returns
    -------
    Optional[pd.DataFrame]
        The parsed CSV, or ``None`` on failure (download/auth/read
        errors — including a missing ``kagglehub`` install — are
        surfaced in the Streamlit UI).
    """
    try:
        dataset_dir = kagglehub.dataset_download(kaggle_path)
        return pd.read_csv(os.path.join(dataset_dir, file_name))
    except Exception as e:
        st.error(f"Failed to load Kaggle dataset: {e}")
        return None
42
 
43
def dataset_sidebar():
    """Render the sidebar dataset-loading controls.

    Lets the user pick a source (CSV upload, Hugging Face, Kaggle),
    loads the chosen dataset, stores it in Streamlit session state
    (``original_df`` and ``processed_df``), and returns the loaded
    DataFrame.

    Returns
    -------
    pd.DataFrame | None
        The freshly loaded DataFrame, or ``None`` when nothing was
        loaded on this rerun.
    """
    st.sidebar.header("1️⃣ Data Ingestion")

    source = st.sidebar.selectbox(
        "Source Type",
        ["Upload CSV", "Hugging Face", "Kaggle"],
        help="Select the source of your dataset."
    )

    df = None

    if source == "Upload CSV":
        file = st.sidebar.file_uploader("Drop CSV Here", type=["csv"])
        if file:
            df = load_csv(file)

    elif source == "Hugging Face":
        path = st.sidebar.text_input("Dataset Path (e.g., 'iris')", "iris")
        split = st.sidebar.text_input("Split (e.g., 'train')", "train")
        if st.sidebar.button("Fetch HF Dataset"):
            df = load_hf(path, split)

    elif source == "Kaggle":
        kp = st.sidebar.text_input("Kaggle Path (user/dataset)")
        fn = st.sidebar.text_input("CSV Filename inside dataset")
        if st.sidebar.button("Fetch Kaggle Dataset"):
            df = load_kaggle(kp, fn)

    # State Update Logic: only replace session data when a genuinely
    # new dataset arrived, so edits to processed_df survive reruns.
    if df is not None:
        # Bug fix: attribute access (st.session_state.original_df)
        # raises if the key was never initialized before the first
        # load — .get() returns None instead.
        current = st.session_state.get("original_df")
        if current is None or not df.equals(current):
            st.session_state.original_df = df.copy()
            # Initialize processed_df as a fresh copy of the original
            st.session_state.processed_df = df.copy()
        st.sidebar.success(f"Loaded: {df.shape[0]} rows, {df.shape[1]} cols")

    # Bug fix: this revision dropped the previous revision's trailing
    # return, so callers consuming the DataFrame silently got None.
    return df