File size: 7,390 Bytes
978fed5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""Data analysis workflow form and runner."""

from pathlib import Path

import streamlit as st
from loguru import logger
from utils import cleanup_uploaded_data, save_and_extract_upload

from scider.core import constant
from scider.workflows.data_workflow import DataWorkflow
from scider.workflows.hypo_data_workflow import HypoDataWorkflow


def run_data(path, q, workspace_path):
    """Run the data analysis workflow on a local path or dataset name.

    Called from a background thread.

    Args:
        path: Filesystem path to the dataset, or (when no such path exists
            locally) a HuggingFace dataset repo name such as "org/dataset".
        q: Unused here; kept for signature compatibility with the caller.
        workspace_path: Directory the workflow uses as its workspace.

    Returns:
        Tuple of (markdown result string, intermediate state list).
    """
    # Keep the raw string when it is not a local path — HF repo names must
    # not be normalized through Path (see placeholder "scikit-learn/iris").
    data_path = Path(path) if Path(path).exists() else path  # may be HF repo name
    logger.info(f"Running data analysis on path: {data_path}")

    w = DataWorkflow(
        # Bug fix: previously recomputed inline as Path(str(path)), which
        # wrapped HF repo names in Path and left data_path unused.
        data_path=data_path,
        workspace_path=workspace_path,
        recursion_limit=100,
    )
    w.run()
    # Intermediate state is surfaced even on failure so the UI can show progress.
    intermediate_state = getattr(w, "data_agent_intermediate_state", [])
    if w.final_status != "success":
        error_msg = w.error_message or "Data workflow failed"
        return f"Data workflow failed: {error_msg}", intermediate_state
    out = ["## Data Analysis Complete"]
    if w.data_summary:
        out.append(w.data_summary)
    return "\n\n".join(out), intermediate_state


def run_hypo_data(feature_desc, num_rows, query, workspace_path):
    """Generate a synthetic dataset and analyze it. Called from background thread.

    Args:
        feature_desc: Free-text description of the features to synthesize.
        num_rows: Number of rows to generate.
        query: Analysis question to run against the generated data.
        workspace_path: Directory the workflow uses as its workspace.

    Returns:
        Tuple of (markdown result string, empty intermediate-state list).
    """
    logger.info(f"Running hypothetical data generation: {feature_desc[:100]}...")

    workflow = HypoDataWorkflow(
        feature_desc=feature_desc,
        workspace_path=workspace_path,
        num_rows=num_rows,
        user_query=query,
        recursion_limit=100,
    )
    workflow.run()

    if workflow.final_status == "success":
        sections = ["## Hypothetical Data Analysis Complete"]
        if workflow.data_summary:
            sections.append(workflow.data_summary)
        return "\n\n".join(sections), []

    reason = workflow.error_message or "Hypothetical data workflow failed"
    return f"Workflow failed: {reason}", []


# Fallback analysis query used when the user submits without entering one.
_DEFAULT_QUERY = "Analyze this dataset — explore its structure, key patterns, and notable findings."


def _hypo_data_fields():
    """Render the hypothetical-data branch of the form; return config dict or None."""
    feature_desc = st.text_area(
        "Describe the data you want to generate",
        placeholder=(
            "e.g. A dataset about house prices with features: "
            "square footage (1000-5000 sq ft), number of bedrooms (1-6), "
            "age of house (0-100 years), price ($100k-$1M)"
        ),
        height=150,
        help="Describe the features, their ranges, and the domain of the dataset.",
    )
    num_rows = st.number_input(
        "Number of rows",
        min_value=10,
        max_value=100000,
        value=1000,
        step=100,
    )
    query = st.text_input(
        "Analysis query",
        placeholder="What would you like to analyze about this data?",
    )
    submitted = st.form_submit_button("Generate & Analyze")
    if not submitted:
        return None
    if not feature_desc or not feature_desc.strip():
        st.error("Please describe the data you want to generate.")
        return None
    return {
        "type": "data_hypo",
        "feature_desc": feature_desc.strip(),
        "num_rows": num_rows,
        # Empty query falls back to a description-derived default.
        "query": query
        or f"Analyze this synthetic dataset: {feature_desc.strip()[:200]}",
    }


def _hf_dataset_fields():
    """Render the HuggingFace-dataset branch of the form; return config dict or None."""
    hf_repo = st.text_input(
        "HuggingFace Dataset Repo",
        placeholder="e.g. scikit-learn/iris",
        help="Enter a HuggingFace dataset repository name. It will be downloaded automatically.",
    )
    query = st.text_input("Query", placeholder="What would you like to analyze?")
    submitted = st.form_submit_button("Run Data Analysis")
    if not submitted:
        return None
    if not hf_repo or not hf_repo.strip():
        st.error("Please enter a HuggingFace dataset repository name.")
        return None
    if not query or not query.strip():
        query = _DEFAULT_QUERY
    return {"type": "data", "path": hf_repo.strip(), "query": query}


def _resolve_upload_path(uploaded_zip):
    """Resolve the dataset path from a fresh upload or prior session state.

    Returns the path string, or None (after surfacing an error/warning) when
    no usable path is available. Side effects: updates session state and may
    delete stale uploaded data.
    """
    if uploaded_zip:
        # A fresh upload replaces any previously extracted data.
        cleanup_uploaded_data()
        extracted = save_and_extract_upload(uploaded_zip)
        if extracted and extracted.exists():
            extracted = extracted.resolve()
            st.session_state.uploaded_data_path = str(extracted)
            st.session_state.workspace_path = extracted.parent
            st.success(f"File uploaded and extracted to: {extracted}")
            return str(extracted)
        st.error(
            f"Failed to process uploaded zip file. Extracted path: {extracted}"
        )
        return None
    if st.session_state.get("uploaded_data_path"):
        path = Path(st.session_state.uploaded_data_path).resolve()
        if path.exists():
            st.session_state.workspace_path = path.parent
            return str(path)
        st.warning(f"Previously uploaded path no longer exists: {path}")
        cleanup_uploaded_data()
    return None


def _upload_fields():
    """Render the local-upload branch of the form; return config dict or None."""
    st.caption("Upload a zip dataset or enter a path to existing data")
    uploaded_zip = st.file_uploader(
        "Upload ZIP dataset (optional)",
        type=["zip"],
        help="Upload a zip file containing your dataset. Extracted temporarily, deleted on reset.",
    )
    if st.session_state.get("uploaded_data_path"):
        st.info(f"Using uploaded data: `{st.session_state.uploaded_data_path}`")
    query = st.text_input("Query", placeholder="What would you like to analyze?")
    submitted = st.form_submit_button("Run Data Analysis")
    if not submitted:
        return None
    if not query or not query.strip():
        query = _DEFAULT_QUERY
    path_to_use = _resolve_upload_path(uploaded_zip)
    if path_to_use:
        # Re-verify: the extracted/stored path may have been removed meanwhile.
        verify_path = Path(path_to_use).resolve()
        if not verify_path.exists():
            st.error(f"Path does not exist: {path_to_use}")
            return None
        return {"type": "data", "path": str(verify_path), "query": query}
    st.error("Please upload a zip file or enter a data path.")
    return None


def render_form():
    """Render the data analysis form. Returns workflow_config dict or None."""
    hf_enabled = constant.HF_DATASET_DOWNLOAD_ENABLED

    # Build the source choices; HF appears only when downloads are enabled.
    source_options = ["Upload local file"]
    if hf_enabled:
        source_options.append("HuggingFace dataset")
    source_options.append("Generate hypothetical data")

    # The radio lives outside the form so switching sources rerenders fields.
    data_source = st.radio(
        "Data Source",
        source_options,
        horizontal=True,
        key="data_source_radio",
    )

    with st.form("data_form", clear_on_submit=True):
        st.markdown("### Data Analysis Workflow")
        if data_source == "Generate hypothetical data":
            return _hypo_data_fields()
        if data_source == "HuggingFace dataset":
            return _hf_dataset_fields()
        return _upload_fields()