# NOTE: removed non-Python scrape residue from the hosting page header
# ("leonardklin's picture / Upload 328 files / 978fed5 verified") — it was
# not part of the module and would have raised a SyntaxError.
"""Data analysis workflow form and runner."""
from pathlib import Path
import streamlit as st
from loguru import logger
from utils import cleanup_uploaded_data, save_and_extract_upload
from scider.core import constant
from scider.workflows.data_workflow import DataWorkflow
from scider.workflows.hypo_data_workflow import HypoDataWorkflow
def run_data(path, q, workspace_path):
    """Run the data-analysis workflow. Called from a background thread.

    Args:
        path: Local filesystem path to the dataset, or a HuggingFace dataset
            repo name (e.g. ``"scikit-learn/iris"``) when no such local path
            exists.
        q: Analysis query string. NOTE(review): currently unused here — the
            workflow appears to carry its own query state; kept for
            signature compatibility with callers.
        workspace_path: Directory the workflow should use as its workspace.

    Returns:
        Tuple of (markdown result string, intermediate state list).
    """
    # Preserve a non-existent path as the raw string so HF repo names like
    # "org/dataset" pass through unmodified; only wrap real local paths.
    data_path = Path(path) if Path(path).exists() else path  # may be HF repo name
    logger.info(f"Running data analysis on path: {data_path}")
    w = DataWorkflow(
        # Fix: use the value computed above. Previously this recomputed the
        # path and unconditionally coerced it to Path, which turned HF repo
        # names into (non-existent) filesystem paths.
        data_path=data_path,
        workspace_path=workspace_path,
        recursion_limit=100,
    )
    w.run()
    # Intermediate state is optional on the workflow object; default to [].
    intermediate_state = getattr(w, "data_agent_intermediate_state", [])
    if w.final_status != "success":
        error_msg = w.error_message or "Data workflow failed"
        return f"Data workflow failed: {error_msg}", intermediate_state
    out = ["## Data Analysis Complete"]
    if w.data_summary:
        out.append(w.data_summary)
    return "\n\n".join(out), intermediate_state
def run_hypo_data(feature_desc, num_rows, query, workspace_path):
    """Run hypothetical data workflow. Called from background thread.

    Args:
        feature_desc: Free-text description of the synthetic dataset to generate.
        num_rows: Number of rows to generate.
        query: User's analysis query for the generated data.
        workspace_path: Directory the workflow should use as its workspace.

    Returns:
        Tuple of (markdown result string, empty intermediate-state list).
    """
    logger.info(f"Running hypothetical data generation: {feature_desc[:100]}...")
    workflow = HypoDataWorkflow(
        feature_desc=feature_desc,
        workspace_path=workspace_path,
        num_rows=num_rows,
        user_query=query,
        recursion_limit=100,
    )
    workflow.run()
    # Success path first; failure falls through to the error message below.
    if workflow.final_status == "success":
        sections = ["## Hypothetical Data Analysis Complete"]
        if workflow.data_summary:
            sections.append(workflow.data_summary)
        return "\n\n".join(sections), []
    reason = workflow.error_message or "Hypothetical data workflow failed"
    return f"Workflow failed: {reason}", []
def render_form():
    """Render the data analysis form.

    Returns:
        A workflow-config dict describing the submitted job
        (``type`` is ``"data"`` or ``"data_hypo"``), or ``None`` when the
        form was not submitted or validation failed.
    """
    hf_enabled = constant.HF_DATASET_DOWNLOAD_ENABLED
    # Build the radio options; the HF option is feature-flagged.
    source_options = ["Upload local file"]
    if hf_enabled:
        source_options.append("HuggingFace dataset")
    source_options.append("Generate hypothetical data")
    # The radio sits OUTSIDE the form so switching source reruns the script
    # immediately and swaps the visible form fields.
    data_source = st.radio(
        "Data Source",
        source_options,
        horizontal=True,
        key="data_source_radio",
    )
    with st.form("data_form", clear_on_submit=True):
        st.markdown("### Data Analysis Workflow")
        if data_source == "Generate hypothetical data":
            # --- Synthetic-data branch ---
            feature_desc = st.text_area(
                "Describe the data you want to generate",
                placeholder=(
                    "e.g. A dataset about house prices with features: "
                    "square footage (1000-5000 sq ft), number of bedrooms (1-6), "
                    "age of house (0-100 years), price ($100k-$1M)"
                ),
                height=150,
                help="Describe the features, their ranges, and the domain of the dataset.",
            )
            num_rows = st.number_input(
                "Number of rows",
                min_value=10,
                max_value=100000,
                value=1000,
                step=100,
            )
            query = st.text_input(
                "Analysis query",
                placeholder="What would you like to analyze about this data?",
            )
            submitted = st.form_submit_button("Generate & Analyze")
            if submitted:
                # A feature description is mandatory for generation.
                if not feature_desc or not feature_desc.strip():
                    st.error("Please describe the data you want to generate.")
                    return None
                return {
                    "type": "data_hypo",
                    "feature_desc": feature_desc.strip(),
                    "num_rows": num_rows,
                    # Fall back to a query derived from the description.
                    "query": query
                    or f"Analyze this synthetic dataset: {feature_desc.strip()[:200]}",
                }
        elif data_source == "HuggingFace dataset":
            # --- HF dataset branch (only reachable when hf_enabled) ---
            hf_repo = st.text_input(
                "HuggingFace Dataset Repo",
                placeholder="e.g. scikit-learn/iris",
                help="Enter a HuggingFace dataset repository name. It will be downloaded automatically.",
            )
            query = st.text_input("Query", placeholder="What would you like to analyze?")
            submitted = st.form_submit_button("Run Data Analysis")
            if submitted:
                if not hf_repo or not hf_repo.strip():
                    st.error("Please enter a HuggingFace dataset repository name.")
                    return None
                # Empty query gets a generic default.
                if not query or not query.strip():
                    query = "Analyze this dataset — explore its structure, key patterns, and notable findings."
                # The repo name is passed as "path"; the workflow resolves it.
                return {"type": "data", "path": hf_repo.strip(), "query": query}
        else:
            # --- Local upload branch (default) ---
            st.caption("Upload a zip dataset or enter a path to existing data")
            uploaded_zip = st.file_uploader(
                "Upload ZIP dataset (optional)",
                type=["zip"],
                help="Upload a zip file containing your dataset. Extracted temporarily, deleted on reset.",
            )
            # Show any dataset already uploaded in a previous rerun.
            if st.session_state.get("uploaded_data_path"):
                st.info(f"Using uploaded data: `{st.session_state.uploaded_data_path}`")
            query = st.text_input("Query", placeholder="What would you like to analyze?")
            submitted = st.form_submit_button("Run Data Analysis")
            if submitted:
                if not query or not query.strip():
                    query = "Analyze this dataset — explore its structure, key patterns, and notable findings."
                path_to_use = None
                if uploaded_zip:
                    # A fresh upload replaces any previously extracted data.
                    cleanup_uploaded_data()
                    extracted = save_and_extract_upload(uploaded_zip)
                    if extracted and extracted.exists():
                        extracted = extracted.resolve()
                        # Persist across Streamlit reruns via session state.
                        st.session_state.uploaded_data_path = str(extracted)
                        st.session_state.workspace_path = extracted.parent
                        path_to_use = str(extracted)
                        st.success(f"File uploaded and extracted to: {path_to_use}")
                    else:
                        st.error(
                            f"Failed to process uploaded zip file. Extracted path: {extracted}"
                        )
                elif st.session_state.get("uploaded_data_path"):
                    # No new upload: reuse the previously extracted path if
                    # it still exists on disk.
                    path = Path(st.session_state.uploaded_data_path).resolve()
                    if path.exists():
                        path_to_use = str(path)
                        st.session_state.workspace_path = path.parent
                    else:
                        st.warning(f"Previously uploaded path no longer exists: {path}")
                        cleanup_uploaded_data()
                if path_to_use:
                    # Re-verify right before submitting the job.
                    verify_path = Path(path_to_use).resolve()
                    if not verify_path.exists():
                        st.error(f"Path does not exist: {path_to_use}")
                    else:
                        return {"type": "data", "path": str(verify_path), "query": query}
                else:
                    st.error("Please upload a zip file or enter a data path.")
    # Nothing submitted (or validation surfaced an error above).
    return None