"""Data analysis workflow form and runner."""

from pathlib import Path

import streamlit as st
from loguru import logger

from utils import cleanup_uploaded_data, save_and_extract_upload
from scider.core import constant
from scider.workflows.data_workflow import DataWorkflow
from scider.workflows.hypo_data_workflow import HypoDataWorkflow


def run_data(path, q, workspace_path):
    """Run data workflow. Called from background thread."""
    # `path` may be a local filesystem path or a HuggingFace repo name,
    # so only wrap it in Path when it points at something that exists.
    # NOTE: `q` (the user query) is currently not passed to DataWorkflow.
    data_path = Path(path) if Path(path).exists() else path
    logger.info(f"Running data analysis on path: {data_path}")
    w = DataWorkflow(
        data_path=data_path,
        workspace_path=workspace_path,
        recursion_limit=100,
    )
    w.run()
    intermediate_state = getattr(w, "data_agent_intermediate_state", [])
    if w.final_status != "success":
        error_msg = w.error_message or "Data workflow failed"
        return f"Data workflow failed: {error_msg}", intermediate_state
    out = ["## Data Analysis Complete"]
    if w.data_summary:
        out.append(w.data_summary)
    return "\n\n".join(out), intermediate_state


def run_hypo_data(feature_desc, num_rows, query, workspace_path):
    """Run hypothetical data workflow. Called from background thread."""
    logger.info(f"Running hypothetical data generation: {feature_desc[:100]}...")
    w = HypoDataWorkflow(
        feature_desc=feature_desc,
        workspace_path=workspace_path,
        num_rows=num_rows,
        user_query=query,
        recursion_limit=100,
    )
    w.run()
    if w.final_status != "success":
        error_msg = w.error_message or "Hypothetical data workflow failed"
        return f"Workflow failed: {error_msg}", []
    out = ["## Hypothetical Data Analysis Complete"]
    if w.data_summary:
        out.append(w.data_summary)
    return "\n\n".join(out), []


def render_form():
    """Render the data analysis form. Returns workflow_config dict or None."""
    hf_enabled = constant.HF_DATASET_DOWNLOAD_ENABLED
    source_options = ["Upload local file"]
    if hf_enabled:
        source_options.append("HuggingFace dataset")
    source_options.append("Generate hypothetical data")
    data_source = st.radio(
        "Data Source",
        source_options,
        horizontal=True,
        key="data_source_radio",
    )
    with st.form("data_form", clear_on_submit=True):
        st.markdown("### Data Analysis Workflow")
        if data_source == "Generate hypothetical data":
            feature_desc = st.text_area(
                "Describe the data you want to generate",
                placeholder=(
                    "e.g. A dataset about house prices with features: "
                    "square footage (1000-5000 sq ft), number of bedrooms (1-6), "
                    "age of house (0-100 years), price ($100k-$1M)"
                ),
                height=150,
                help="Describe the features, their ranges, and the domain of the dataset.",
            )
            num_rows = st.number_input(
                "Number of rows",
                min_value=10,
                max_value=100000,
                value=1000,
                step=100,
            )
            query = st.text_input(
                "Analysis query",
                placeholder="What would you like to analyze about this data?",
            )
            submitted = st.form_submit_button("Generate & Analyze")
            if submitted:
                if not feature_desc or not feature_desc.strip():
                    st.error("Please describe the data you want to generate.")
                    return None
                return {
                    "type": "data_hypo",
                    "feature_desc": feature_desc.strip(),
                    "num_rows": num_rows,
                    "query": query
                    or f"Analyze this synthetic dataset: {feature_desc.strip()[:200]}",
                }
        elif data_source == "HuggingFace dataset":
            hf_repo = st.text_input(
                "HuggingFace Dataset Repo",
                placeholder="e.g. scikit-learn/iris",
                help="Enter a HuggingFace dataset repository name. It will be downloaded automatically.",
            )
            query = st.text_input("Query", placeholder="What would you like to analyze?")
            submitted = st.form_submit_button("Run Data Analysis")
            if submitted:
                if not hf_repo or not hf_repo.strip():
                    st.error("Please enter a HuggingFace dataset repository name.")
                    return None
                if not query or not query.strip():
                    query = "Analyze this dataset — explore its structure, key patterns, and notable findings."
                return {"type": "data", "path": hf_repo.strip(), "query": query}
        else:
            st.caption("Upload a ZIP dataset; a previously uploaded dataset is reused if available")
            uploaded_zip = st.file_uploader(
                "Upload ZIP dataset (optional)",
                type=["zip"],
                help="Upload a zip file containing your dataset. Extracted temporarily, deleted on reset.",
            )
            if st.session_state.get("uploaded_data_path"):
                st.info(f"Using uploaded data: `{st.session_state.uploaded_data_path}`")
            query = st.text_input("Query", placeholder="What would you like to analyze?")
            submitted = st.form_submit_button("Run Data Analysis")
            if submitted:
                if not query or not query.strip():
                    query = "Analyze this dataset — explore its structure, key patterns, and notable findings."
                path_to_use = None
                if uploaded_zip:
                    # A fresh upload replaces any previously extracted dataset.
                    cleanup_uploaded_data()
                    extracted = save_and_extract_upload(uploaded_zip)
                    if extracted and extracted.exists():
                        extracted = extracted.resolve()
                        st.session_state.uploaded_data_path = str(extracted)
                        st.session_state.workspace_path = extracted.parent
                        path_to_use = str(extracted)
                        st.success(f"File uploaded and extracted to: {path_to_use}")
                    else:
                        st.error(
                            f"Failed to process uploaded zip file. Extracted path: {extracted}"
                        )
                elif st.session_state.get("uploaded_data_path"):
                    # No new upload: fall back to the dataset from a previous run.
                    path = Path(st.session_state.uploaded_data_path).resolve()
                    if path.exists():
                        path_to_use = str(path)
                        st.session_state.workspace_path = path.parent
                    else:
                        st.warning(f"Previously uploaded path no longer exists: {path}")
                        cleanup_uploaded_data()
                if path_to_use:
                    verify_path = Path(path_to_use).resolve()
                    if not verify_path.exists():
                        st.error(f"Path does not exist: {path_to_use}")
                    else:
                        return {"type": "data", "path": str(verify_path), "query": query}
                else:
                    st.error("Please upload a zip file.")
    return None
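

# For reference, the two config shapes render_form() can return (keys mirror
# the return statements above); a caller is expected to branch on "type":
#
#     {"type": "data", "path": "<local path or HF repo name>", "query": "..."}
#     {"type": "data_hypo", "feature_desc": "...", "num_rows": 1000, "query": "..."}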