Spaces:
Sleeping
Sleeping
File size: 7,970 Bytes
"""Data analysis workflow form and runner."""
from pathlib import Path
import streamlit as st
from loguru import logger
from utils import cleanup_uploaded_data, save_and_extract_upload, status_banner
from scider.core import constant
from scider.workflows.data_workflow import DataWorkflow
from scider.workflows.hypo_data_workflow import HypoDataWorkflow
def run_data(path, q, workspace_path):
    """Run the data analysis workflow on a dataset location.

    Called from a background thread.

    Args:
        path: Dataset location; converted to a ``Path`` and handed to
            ``DataWorkflow`` as ``data_path``.
        q: Not used by this function. Kept so the positional signature stays
            compatible with existing callers.
        workspace_path: Passed through to ``DataWorkflow`` unchanged.

    Returns:
        A ``(message, intermediate_state)`` tuple. ``message`` is a status
        banner followed by the workflow summary on success or the error text
        on failure. ``intermediate_state`` is the workflow's
        ``data_agent_intermediate_state`` attribute, or ``[]`` if the
        workflow does not define it.
    """
    # Both branches of the former ``exists()`` conditional produced the same
    # Path object, so the filesystem probe was redundant and is dropped.
    data_path = Path(path)
    logger.info(f"Running data analysis on path: {data_path}")

    workflow = DataWorkflow(
        data_path=data_path,
        workspace_path=workspace_path,
        recursion_limit=100,
    )
    workflow.run()

    # The attribute may be absent on the workflow, hence the getattr default.
    intermediate_state = getattr(workflow, "data_agent_intermediate_state", [])

    if workflow.final_status != "success":
        error_msg = workflow.error_message or "Data workflow failed"
        banner = status_banner("failed", "Data workflow failed")
        return f"{banner}\n\n{error_msg}", intermediate_state

    banner = status_banner("success", "Data analysis complete")
    body = workflow.data_summary or "The data agent finished but produced no summary."
    return f"{banner}\n\n{body}", intermediate_state
def run_hypo_data(feature_desc, num_rows, query, workspace_path):
    """Generate a hypothetical dataset and analyze it.

    Called from a background thread.

    Args:
        feature_desc: Free-text description of the data to generate; only the
            first 100 characters are written to the log.
        num_rows: Forwarded to ``HypoDataWorkflow`` as ``num_rows``.
        query: Forwarded to ``HypoDataWorkflow`` as ``user_query``.
        workspace_path: Forwarded to ``HypoDataWorkflow`` unchanged.

    Returns:
        A ``(message, [])`` tuple. ``message`` is a status banner followed by
        the workflow summary on success or the error text on failure; the
        second element is always an empty list.
    """
    preview = feature_desc[:100]
    logger.info(f"Running hypothetical data generation: {preview}...")

    workflow = HypoDataWorkflow(
        feature_desc=feature_desc,
        workspace_path=workspace_path,
        num_rows=num_rows,
        user_query=query,
        recursion_limit=100,
    )
    workflow.run()

    if workflow.final_status == "success":
        banner = status_banner("success", "Hypothetical data analysis complete")
        text = workflow.data_summary or "The data agent finished but produced no summary."
    else:
        # Fall back to a generic message when the workflow reports no detail.
        text = workflow.error_message or "Hypothetical data workflow failed"
        banner = status_banner("failed", "Hypothetical data workflow failed")

    return f"{banner}\n\n{text}", []
def render_form():
    """Render the data analysis form.

    Shows a data-source radio (the HuggingFace option appears only when
    ``constant.HF_DATASET_DOWNLOAD_ENABLED`` is truthy) and, inside a single
    ``st.form``, the inputs for the selected source.

    Returns:
        A workflow config dict when the form was submitted with valid input,
        otherwise ``None``. The dict has ``"type": "data_hypo"`` (with
        ``feature_desc``, ``num_rows``, ``query``) for synthetic data, or
        ``"type": "data"`` (with ``path``, ``query``) for the HuggingFace and
        upload sources.
    """
    source_options = ["Upload local file"]
    if constant.HF_DATASET_DOWNLOAD_ENABLED:
        source_options.append("HuggingFace dataset")
    source_options.append("Generate hypothetical data")

    # The radio lives outside the form so changing it reruns the script and
    # swaps the inputs shown inside the form.
    data_source = st.radio(
        "Data Source",
        source_options,
        horizontal=True,
        key="data_source_radio",
    )

    with st.form("data_form", clear_on_submit=True):
        st.markdown("### Analyze Your Data")
        st.caption(
            "Upload a dataset, enter a HuggingFace repo name, or let SciDER generate synthetic data. "
            "The AI agent explores structure, runs statistical analysis, and searches for related metrics in the literature."
        )

        if data_source == "Generate hypothetical data":
            return _render_hypo_inputs()
        if data_source == "HuggingFace dataset":
            return _render_hf_inputs()
        return _render_upload_inputs()


# Query used for real datasets when the user leaves the query box blank.
_DEFAULT_DATASET_QUERY = (
    "Analyze this dataset — explore its structure, key patterns, and notable findings."
)


def _is_blank(text):
    """Return True when *text* is None, empty, or whitespace only."""
    return not text or not text.strip()


def _render_hypo_inputs():
    """Render the synthetic-data inputs; return a ``data_hypo`` config or None."""
    feature_desc = st.text_area(
        "Describe the data you want to generate",
        placeholder=(
            "e.g. A dataset about house prices with features: "
            "square footage (1000-5000 sq ft), number of bedrooms (1-6), "
            "age of house (0-100 years), price ($100k-$1M)"
        ),
        height=150,
        help="Describe the features, their ranges, and the domain of the dataset.",
    )
    num_rows = st.number_input(
        "Number of rows",
        min_value=10,
        max_value=100000,
        value=1000,
        step=100,
    )
    query = st.text_input(
        "Analysis query",
        placeholder="What would you like to analyze about this data?",
    )

    if not st.form_submit_button("Generate & Analyze"):
        return None

    if _is_blank(feature_desc):
        st.error("Please describe the data you want to generate.")
        return None

    description = feature_desc.strip()
    # Treat a whitespace-only query as missing, matching the other sources.
    if _is_blank(query):
        query = f"Analyze this synthetic dataset: {description[:200]}"

    return {
        "type": "data_hypo",
        "feature_desc": description,
        "num_rows": num_rows,
        "query": query,
    }


def _render_hf_inputs():
    """Render the HuggingFace repo inputs; return a ``data`` config or None."""
    hf_repo = st.text_input(
        "HuggingFace Dataset Repo",
        placeholder="e.g. scikit-learn/iris",
        help="Enter a HuggingFace dataset repository name. It will be downloaded automatically.",
    )
    query = st.text_input(
        "Query",
        placeholder="e.g. What features most strongly predict the target variable?",
    )

    if not st.form_submit_button("Analyze Dataset"):
        return None

    if _is_blank(hf_repo):
        st.error("Please enter a HuggingFace dataset repository name.")
        return None

    if _is_blank(query):
        query = _DEFAULT_DATASET_QUERY

    return {"type": "data", "path": hf_repo.strip(), "query": query}


def _render_upload_inputs():
    """Render the zip-upload inputs; return a ``data`` config or None."""
    st.caption("Upload a zip file containing your dataset")
    uploaded_zip = st.file_uploader(
        "Upload ZIP dataset (optional)",
        type=["zip"],
        help="Upload a zip file containing your dataset. Extracted temporarily, deleted on reset.",
    )
    if st.session_state.get("uploaded_data_path"):
        st.info(f"Using uploaded data: `{st.session_state.uploaded_data_path}`")
    query = st.text_input(
        "Query",
        placeholder="e.g. What features most strongly predict the target variable?",
    )

    if not st.form_submit_button("Analyze Dataset"):
        return None

    if _is_blank(query):
        query = _DEFAULT_DATASET_QUERY

    path_to_use = None
    if uploaded_zip:
        # A fresh upload replaces whatever was extracted earlier.
        cleanup_uploaded_data()
        extracted = save_and_extract_upload(uploaded_zip)
        if not (extracted and extracted.exists()):
            st.error(
                f"Failed to process uploaded zip file. Extracted path: {extracted}"
            )
            # Stop here: the user did upload a file, so the generic
            # "please upload" error below would only be misleading.
            return None
        extracted = extracted.resolve()
        st.session_state.uploaded_data_path = str(extracted)
        st.session_state.workspace_path = extracted.parent
        path_to_use = str(extracted)
        st.success(f"File uploaded and extracted to: {path_to_use}")
    elif st.session_state.get("uploaded_data_path"):
        # No new upload: reuse the path remembered in the session.
        path = Path(st.session_state.uploaded_data_path).resolve()
        if path.exists():
            path_to_use = str(path)
            st.session_state.workspace_path = path.parent
        else:
            st.warning(f"Previously uploaded path no longer exists: {path}")
            cleanup_uploaded_data()

    if not path_to_use:
        st.error("Please upload a zip file or enter a data path.")
        return None

    # Re-check right before handing the path off, in case it vanished.
    verify_path = Path(path_to_use).resolve()
    if not verify_path.exists():
        st.error(f"Path does not exist: {path_to_use}")
        return None

    return {"type": "data", "path": str(verify_path), "query": query}
|