Spaces:
Running
Running
Commit ·
2554712
1
Parent(s): 1e121c1
Add HF Dataset subset and split support
Browse files- __pycache__/app.cpython-313.pyc +0 -0
- app.py +12 -7
- src/__pycache__/ingestion.cpython-313.pyc +0 -0
- src/ingestion.py +8 -6
- verify_pipeline_mock.py +2 -2
__pycache__/app.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
|
|
|
app.py
CHANGED
|
@@ -10,10 +10,11 @@ from src.visualization import generate_charts
|
|
| 10 |
from src.llm import get_insights, get_followup_questions, ask_llm, get_ml_recommendations, analyze_text_content
|
| 11 |
|
| 12 |
# Updated analyze_dataset to accept api_token
|
| 13 |
-
|
|
|
|
| 14 |
# Handle HF Dataset loading if name is provided
|
| 15 |
if hf_dataset_name:
|
| 16 |
-
df, error, load_log = load_hf_dataset(hf_dataset_name, api_token=api_token)
|
| 17 |
elif file_obj:
|
| 18 |
df, error, load_log = load_file(file_obj)
|
| 19 |
else:
|
|
@@ -104,8 +105,9 @@ def chat_response(message, history, overview_text, api_token):
|
|
| 104 |
return ask_llm(message, history, overview_text, api_token)
|
| 105 |
|
| 106 |
# Updated process function wrapper to match inputs/outputs
|
| 107 |
-
|
| 108 |
-
|
|
|
|
| 109 |
return results + (results[0],) # Append overview_md for the state
|
| 110 |
|
| 111 |
# --- Custom Styling & Theme ---
|
|
@@ -250,7 +252,10 @@ with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
|
|
| 250 |
example_btn = gr.Button("🎲 Load Sample Data", variant="secondary")
|
| 251 |
|
| 252 |
with gr.TabItem("HF Dataset"):
|
| 253 |
-
hf_input = gr.Textbox(label="Dataset Name", placeholder="e.g.
|
|
|
|
|
|
|
|
|
|
| 254 |
hf_load_btn = gr.Button("⬇️ Load Dataset", variant="primary")
|
| 255 |
|
| 256 |
gr.Markdown("---")
|
|
@@ -297,7 +302,7 @@ with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
|
|
| 297 |
# Event wiring - File Upload
|
| 298 |
file_upload.change(
|
| 299 |
fn=process_data_wrapper,
|
| 300 |
-
inputs=[file_upload, gr.State(None), api_token_input],
|
| 301 |
outputs=[
|
| 302 |
overview_md,
|
| 303 |
dataframe_view,
|
|
@@ -315,7 +320,7 @@ with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
|
|
| 315 |
# Also wire HF Load Button
|
| 316 |
hf_load_btn.click(
|
| 317 |
fn=process_data_wrapper,
|
| 318 |
-
inputs=[gr.State(None), hf_input, api_token_input],
|
| 319 |
outputs=[
|
| 320 |
overview_md,
|
| 321 |
dataframe_view,
|
|
|
|
| 10 |
from src.llm import get_insights, get_followup_questions, ask_llm, get_ml_recommendations, analyze_text_content
|
| 11 |
|
| 12 |
# Updated analyze_dataset to accept api_token
|
| 13 |
+
# Updated analyze_dataset to accept subset and split
|
| 14 |
+
def analyze_dataset(file_obj, hf_dataset_name, hf_subset, hf_split, api_token):
|
| 15 |
# Handle HF Dataset loading if name is provided
|
| 16 |
if hf_dataset_name:
|
| 17 |
+
df, error, load_log = load_hf_dataset(hf_dataset_name, subset=hf_subset, split=hf_split, api_token=api_token)
|
| 18 |
elif file_obj:
|
| 19 |
df, error, load_log = load_file(file_obj)
|
| 20 |
else:
|
|
|
|
| 105 |
return ask_llm(message, history, overview_text, api_token)
|
| 106 |
|
| 107 |
# Updated process function wrapper to match inputs/outputs
|
| 108 |
+
# Updated process function wrapper to match inputs/outputs
def process_data_wrapper(file_obj, hf_dataset, hf_subset, hf_split, api_token):
    """Run the full analysis pipeline and echo the overview for the state slot."""
    outputs = analyze_dataset(file_obj, hf_dataset, hf_subset, hf_split, api_token)
    # Gradio needs the overview markdown twice: once for the visible
    # component and once for the hidden state, so append a copy of it.
    return (*outputs, outputs[0])
|
| 112 |
|
| 113 |
# --- Custom Styling & Theme ---
|
|
|
|
| 252 |
example_btn = gr.Button("🎲 Load Sample Data", variant="secondary")
|
| 253 |
|
| 254 |
with gr.TabItem("HF Dataset"):
|
| 255 |
+
hf_input = gr.Textbox(label="Dataset Name", placeholder="e.g. glue", info="Name of the dataset on Hub.")
|
| 256 |
+
with gr.Row():
|
| 257 |
+
hf_subset_input = gr.Textbox(label="Subset/Config (Optional)", placeholder="e.g. mrpc", info="Specific configuration.", scale=1)
|
| 258 |
+
hf_split_input = gr.Textbox(label="Split", value="train", placeholder="e.g. train, test", info="Split to load.", scale=1)
|
| 259 |
hf_load_btn = gr.Button("⬇️ Load Dataset", variant="primary")
|
| 260 |
|
| 261 |
gr.Markdown("---")
|
|
|
|
| 302 |
# Event wiring - File Upload
|
| 303 |
file_upload.change(
|
| 304 |
fn=process_data_wrapper,
|
| 305 |
+
inputs=[file_upload, gr.State(None), gr.State(None), gr.State(None), api_token_input],
|
| 306 |
outputs=[
|
| 307 |
overview_md,
|
| 308 |
dataframe_view,
|
|
|
|
| 320 |
# Also wire HF Load Button
|
| 321 |
hf_load_btn.click(
|
| 322 |
fn=process_data_wrapper,
|
| 323 |
+
inputs=[gr.State(None), hf_input, hf_subset_input, hf_split_input, api_token_input],
|
| 324 |
outputs=[
|
| 325 |
overview_md,
|
| 326 |
dataframe_view,
|
src/__pycache__/ingestion.cpython-313.pyc
CHANGED
|
Binary files a/src/__pycache__/ingestion.cpython-313.pyc and b/src/__pycache__/ingestion.cpython-313.pyc differ
|
|
|
src/ingestion.py
CHANGED
|
@@ -123,7 +123,7 @@ def load_file(file_obj):
|
|
| 123 |
except Exception as e:
|
| 124 |
return None, f"Error loading file: {str(e)}", None
|
| 125 |
|
| 126 |
-
def load_hf_dataset(dataset_name, split='train', api_token=None):
|
| 127 |
"""
|
| 128 |
Loads a dataset from Hugging Face Hub.
|
| 129 |
"""
|
|
@@ -131,17 +131,19 @@ def load_hf_dataset(dataset_name, split='train', api_token=None):
|
|
| 131 |
from datasets import load_dataset
|
| 132 |
|
| 133 |
# Load dataset
|
| 134 |
-
# If
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
| 137 |
|
| 138 |
# Convert to pandas
|
| 139 |
df = ds.to_pandas()
|
| 140 |
|
| 141 |
if df.empty:
|
| 142 |
-
return None, f"Dataset '{dataset_name}' (split='{split}') is empty.", None
|
| 143 |
|
| 144 |
-
return df, None, f"Loaded Hugging Face Dataset: `{dataset_name}`
|
| 145 |
|
| 146 |
except Exception as e:
|
| 147 |
return None, f"Error loading HF Dataset '{dataset_name}': {str(e)}", None
|
|
|
|
| 123 |
except Exception as e:
|
| 124 |
return None, f"Error loading file: {str(e)}", None
|
| 125 |
|
| 126 |
+
def load_hf_dataset(dataset_name, subset=None, split='train', api_token=None):
    """
    Loads a dataset from Hugging Face Hub and returns it as a DataFrame.

    Args:
        dataset_name: Name of the dataset on the Hub (e.g. "glue").
        subset: Optional configuration/subset name (e.g. "mrpc"). Empty or
            whitespace-only strings (typical from UI textboxes) are treated
            as "no subset".
        split: Split to load. Falls back to 'train' when empty/None so a
            cleared UI textbox does not break the load.
        api_token: Optional HF access token for gated/private datasets.

    Returns:
        Tuple (df, error, load_log): a pandas DataFrame or None, an error
        message string or None, and a human-readable load log or None.
    """
    try:
        from datasets import load_dataset

        # Normalize UI-sourced values: Gradio textboxes yield "" when cleared,
        # which load_dataset would reject as a split/config name.
        split = (split or 'train').strip() or 'train'
        subset = (subset or '').strip() or None

        # Load dataset.
        # NOTE(security): trust_remote_code=True executes dataset-provided
        # loading scripts; only point this at datasets you trust.
        # If subset is provided, pass it as the second argument
        if subset:
            ds = load_dataset(dataset_name, subset, split=split, token=api_token, trust_remote_code=True)
        else:
            ds = load_dataset(dataset_name, split=split, token=api_token, trust_remote_code=True)

        # Convert to pandas
        df = ds.to_pandas()

        if df.empty:
            return None, f"Dataset '{dataset_name}' (subset='{subset}', split='{split}') is empty.", None

        return df, None, f"Loaded Hugging Face Dataset: `{dataset_name}`\n- Subset: `{subset}`\n- Split: `{split}`"

    except Exception as e:
        # Any failure (missing package, bad name/split, network) is reported
        # back to the UI as an error string rather than raised.
        return None, f"Error loading HF Dataset '{dataset_name}': {str(e)}", None
|
verify_pipeline_mock.py
CHANGED
|
@@ -27,8 +27,8 @@ with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insig
|
|
| 27 |
print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
|
| 28 |
|
| 29 |
try:
|
| 30 |
-
# analyze_dataset signature changed: (file_obj, hf_dataset_name, api_token)
|
| 31 |
-
results = analyze_dataset(mock_file, None, api_token="test")
|
| 32 |
|
| 33 |
# Unpack results to verify types (updated for new return signature)
|
| 34 |
# (overview_output, df_head, insights, chart, anomaly_md, anomalies_df, ml_recs, text_analysis_output, download_path)
|
|
|
|
| 27 |
print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
|
| 28 |
|
| 29 |
try:
|
| 30 |
+
# analyze_dataset signature changed: (file_obj, hf_dataset_name, hf_subset, hf_split, api_token)
|
| 31 |
+
results = analyze_dataset(mock_file, None, None, None, api_token="test")
|
| 32 |
|
| 33 |
# Unpack results to verify types (updated for new return signature)
|
| 34 |
# (overview_output, df_head, insights, chart, anomaly_md, anomalies_df, ml_recs, text_analysis_output, download_path)
|