salihfurkaan committed
Commit 2554712 · 1 Parent(s): 1e121c1

Add HF Dataset subset and split support

__pycache__/app.cpython-313.pyc CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
 
app.py CHANGED
@@ -10,10 +10,11 @@ from src.visualization import generate_charts
 from src.llm import get_insights, get_followup_questions, ask_llm, get_ml_recommendations, analyze_text_content
 
 # Updated analyze_dataset to accept api_token
-def analyze_dataset(file_obj, hf_dataset_name, api_token):
+# Updated analyze_dataset to accept subset and split
+def analyze_dataset(file_obj, hf_dataset_name, hf_subset, hf_split, api_token):
     # Handle HF Dataset loading if name is provided
     if hf_dataset_name:
-        df, error, load_log = load_hf_dataset(hf_dataset_name, api_token=api_token)
+        df, error, load_log = load_hf_dataset(hf_dataset_name, subset=hf_subset, split=hf_split, api_token=api_token)
     elif file_obj:
         df, error, load_log = load_file(file_obj)
     else:
@@ -104,8 +105,9 @@ def chat_response(message, history, overview_text, api_token):
     return ask_llm(message, history, overview_text, api_token)
 
 # Updated process function wrapper to match inputs/outputs
-def process_data_wrapper(file_obj, hf_dataset, api_token):
-    results = analyze_dataset(file_obj, hf_dataset, api_token)
+# Updated process function wrapper to match inputs/outputs
+def process_data_wrapper(file_obj, hf_dataset, hf_subset, hf_split, api_token):
+    results = analyze_dataset(file_obj, hf_dataset, hf_subset, hf_split, api_token)
     return results + (results[0],)  # Append overview_md for the state
 
 # --- Custom Styling & Theme ---
@@ -250,7 +252,10 @@ with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
             example_btn = gr.Button("🎲 Load Sample Data", variant="secondary")
 
         with gr.TabItem("HF Dataset"):
-            hf_input = gr.Textbox(label="Dataset Name", placeholder="e.g. titanic, dair-ai/emotion", info="Loads the 'train' split.")
+            hf_input = gr.Textbox(label="Dataset Name", placeholder="e.g. glue", info="Name of the dataset on Hub.")
+            with gr.Row():
+                hf_subset_input = gr.Textbox(label="Subset/Config (Optional)", placeholder="e.g. mrpc", info="Specific configuration.", scale=1)
+                hf_split_input = gr.Textbox(label="Split", value="train", placeholder="e.g. train, test", info="Split to load.", scale=1)
             hf_load_btn = gr.Button("⬇️ Load Dataset", variant="primary")
 
             gr.Markdown("---")
@@ -297,7 +302,7 @@ with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
     # Event wiring - File Upload
     file_upload.change(
         fn=process_data_wrapper,
-        inputs=[file_upload, gr.State(None), api_token_input],
+        inputs=[file_upload, gr.State(None), gr.State(None), gr.State(None), api_token_input],
         outputs=[
             overview_md,
             dataframe_view,
@@ -315,7 +320,7 @@ with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
     # Also wire HF Load Button
     hf_load_btn.click(
         fn=process_data_wrapper,
-        inputs=[gr.State(None), hf_input, api_token_input],
+        inputs=[gr.State(None), hf_input, hf_subset_input, hf_split_input, api_token_input],
         outputs=[
             overview_md,
             dataframe_view,
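
Both triggers above route into the same `process_data_wrapper`; the inline `gr.State(None)` components pin whichever inputs a given trigger doesn't use to a constant `None`. Below is a minimal, self-contained sketch of that pattern; the component names and handler body are illustrative, not the app's full UI:

```python
# Sketch of the gr.State(None) placeholder pattern used in the wiring above.
# Assumes only that gradio is installed; the handler body is illustrative.
import gradio as gr

def process(file_obj, hf_dataset, hf_subset, hf_split, api_token):
    # Exactly one source is populated per trigger; the others arrive as None.
    if hf_dataset:
        return f"Would load `{hf_dataset}` (subset={hf_subset}, split={hf_split})"
    return "Would analyze the uploaded file" if file_obj else "Nothing to do"

with gr.Blocks() as demo:
    file_upload = gr.File(label="Upload File")
    hf_input = gr.Textbox(label="Dataset Name")
    hf_subset_input = gr.Textbox(label="Subset/Config (Optional)")
    hf_split_input = gr.Textbox(label="Split", value="train")
    api_token_input = gr.Textbox(label="API Token", type="password")
    out = gr.Markdown()

    # File-upload path: the three HF fields are pinned to None via gr.State.
    file_upload.change(
        fn=process,
        inputs=[file_upload, gr.State(None), gr.State(None), gr.State(None), api_token_input],
        outputs=out,
    )
    # HF path: file_obj is pinned to None instead.
    hf_btn = gr.Button("Load Dataset")
    hf_btn.click(
        fn=process,
        inputs=[gr.State(None), hf_input, hf_subset_input, hf_split_input, api_token_input],
        outputs=out,
    )

if __name__ == "__main__":
    demo.launch()
```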
src/__pycache__/ingestion.cpython-313.pyc CHANGED
Binary files a/src/__pycache__/ingestion.cpython-313.pyc and b/src/__pycache__/ingestion.cpython-313.pyc differ
 
src/ingestion.py CHANGED
@@ -123,7 +123,7 @@ def load_file(file_obj):
     except Exception as e:
         return None, f"Error loading file: {str(e)}", None
 
-def load_hf_dataset(dataset_name, split='train', api_token=None):
+def load_hf_dataset(dataset_name, subset=None, split='train', api_token=None):
     """
     Loads a dataset from Hugging Face Hub.
     """
@@ -131,17 +131,19 @@ def load_hf_dataset(dataset_name, split='train', api_token=None):
         from datasets import load_dataset
 
         # Load dataset
-        # If config is needed, user might need to specify "dataset_name/config".
-        # For now, we try default.
-        ds = load_dataset(dataset_name, split=split, token=api_token, trust_remote_code=True)
+        # If subset is provided, pass it as the second argument
+        if subset:
+            ds = load_dataset(dataset_name, subset, split=split, token=api_token, trust_remote_code=True)
+        else:
+            ds = load_dataset(dataset_name, split=split, token=api_token, trust_remote_code=True)
 
         # Convert to pandas
         df = ds.to_pandas()
 
         if df.empty:
-            return None, f"Dataset '{dataset_name}' (split='{split}') is empty.", None
+            return None, f"Dataset '{dataset_name}' (subset='{subset}', split='{split}') is empty.", None
 
-        return df, None, f"Loaded Hugging Face Dataset: `{dataset_name}` (Split: {split})"
+        return df, None, f"Loaded Hugging Face Dataset: `{dataset_name}`\n- Subset: `{subset}`\n- Split: `{split}`"
 
     except Exception as e:
         return None, f"Error loading HF Dataset '{dataset_name}': {str(e)}", None
verify_pipeline_mock.py CHANGED
@@ -27,8 +27,8 @@ with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insig
     print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
 
     try:
-        # analyze_dataset signature changed: (file_obj, hf_dataset_name, api_token)
-        results = analyze_dataset(mock_file, None, api_token="test")
+        # analyze_dataset signature changed: (file_obj, hf_dataset_name, hf_subset, hf_split, api_token)
+        results = analyze_dataset(mock_file, None, None, None, api_token="test")
 
         # Unpack results to verify types (updated for new return signature)
         # (overview_output, df_head, insights, chart, anomaly_md, anomalies_df, ml_recs, text_analysis_output, download_path)
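
A companion check for the new HF code path could patch the loader itself so the test stays offline. A sketch under the assumption that `app.py` imports `load_hf_dataset` at module level (as the call in `analyze_dataset` suggests); it is not part of this commit:

```python
# Hypothetical extension of the mock test above for the HF path; not part
# of this commit. Patching app.load_hf_dataset keeps the run offline.
from unittest.mock import patch
import pandas as pd

from app import analyze_dataset

fake_df = pd.DataFrame({"text": ["hello", "world"], "label": [0, 1]})
hf_return = (fake_df, None, "Loaded Hugging Face Dataset: `glue` (mocked)")

with patch("app.load_hf_dataset", return_value=hf_return) as mock_hf:
    # The LLM calls would still need the same patches as the block above.
    results = analyze_dataset(None, "glue", "mrpc", "train", api_token="test")
    # Verify the subset/split keywords flow through as the diff wires them.
    mock_hf.assert_called_once_with("glue", subset="mrpc", split="train", api_token="test")
```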