File size: 7,970 Bytes
1499363
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
"""Data analysis workflow form and runner."""

from pathlib import Path

import streamlit as st
from loguru import logger
from utils import cleanup_uploaded_data, save_and_extract_upload, status_banner

from scider.core import constant
from scider.workflows.data_workflow import DataWorkflow
from scider.workflows.hypo_data_workflow import HypoDataWorkflow


def run_data(path, q, workspace_path):
    """Run the data-analysis workflow and format its outcome for display.

    Called from a background thread.

    Args:
        path: Location of the data to analyse. It is converted to a ``Path``
            and handed to ``DataWorkflow`` as ``data_path``; no existence
            check is performed here.
        q: User query. It is not forwarded to ``DataWorkflow`` by this
            function; the parameter is kept so the call signature is stable.
        workspace_path: Passed through to ``DataWorkflow`` unchanged.

    Returns:
        A ``(message, intermediate_state)`` tuple. ``message`` is a status
        banner, a blank line, and then either the workflow's summary (on
        success) or its error message (on failure), each with a fixed
        fallback text when empty. ``intermediate_state`` is the workflow's
        ``data_agent_intermediate_state`` attribute, or ``[]`` if the
        workflow does not define it.
    """
    # Both branches of the former "exists() ? Path(path) : Path(str(path))"
    # built the same Path, so the filesystem probe was redundant.
    data_path = Path(path)
    logger.info(f"Running data analysis on path: {data_path}")

    workflow = DataWorkflow(
        data_path=data_path,
        workspace_path=workspace_path,
        recursion_limit=100,
    )
    workflow.run()

    # Read defensively: the attribute may be absent on the workflow object.
    intermediate_state = getattr(workflow, "data_agent_intermediate_state", [])

    if workflow.final_status != "success":
        error_msg = workflow.error_message or "Data workflow failed"
        banner = status_banner("failed", "Data workflow failed")
        return f"{banner}\n\n{error_msg}", intermediate_state

    banner = status_banner("success", "Data analysis complete")
    body = workflow.data_summary or "The data agent finished but produced no summary."
    return f"{banner}\n\n{body}", intermediate_state


def run_hypo_data(feature_desc, num_rows, query, workspace_path):
    """Generate and analyse a hypothetical dataset, returning display text.

    Called from a background thread.

    Args:
        feature_desc: Free-text description of the data to synthesise; only
            its first 100 characters are written to the log.
        num_rows: Forwarded to ``HypoDataWorkflow`` as ``num_rows``.
        query: Forwarded to ``HypoDataWorkflow`` as ``user_query``.
        workspace_path: Forwarded to ``HypoDataWorkflow`` unchanged.

    Returns:
        A ``(message, [])`` tuple. ``message`` is a status banner, a blank
        line, and then the workflow's summary on success or its error
        message on failure (with a fixed fallback text when empty). The
        second element is always an empty list.
    """
    logger.info(f"Running hypothetical data generation: {feature_desc[:100]}...")

    workflow = HypoDataWorkflow(
        feature_desc=feature_desc,
        workspace_path=workspace_path,
        num_rows=num_rows,
        user_query=query,
        recursion_limit=100,
    )
    workflow.run()

    if workflow.final_status != "success":
        # Failure: surface the workflow's own message when it has one.
        text = workflow.error_message or "Hypothetical data workflow failed"
        header = status_banner("failed", "Hypothetical data workflow failed")
    else:
        header = status_banner("success", "Hypothetical data analysis complete")
        text = workflow.data_summary or "The data agent finished but produced no summary."

    return f"{header}\n\n{text}", []


def render_form():
    """Render the data analysis form and turn a submission into a config dict.

    A radio button selects the data source: local ZIP upload, a HuggingFace
    dataset (offered only when ``constant.HF_DATASET_DOWNLOAD_ENABLED`` is
    truthy), or generated hypothetical data. The form below it shows the
    inputs for the selected source.

    Returns:
        ``None`` when the form was not submitted or validation failed (a
        Streamlit error/warning is shown in the latter case). Otherwise:

        * hypothetical data: ``{"type": "data_hypo", "feature_desc": ...,
          "num_rows": ..., "query": ...}``
        * upload or HuggingFace: ``{"type": "data", "path": ...,
          "query": ...}``

        A blank (empty or whitespace-only) query is replaced by a default
        prompt in every branch.

    Side effects:
        On the upload path this may call ``cleanup_uploaded_data()`` and
        writes ``uploaded_data_path`` / ``workspace_path`` into
        ``st.session_state``.
    """
    # Shared fallback prompt for the upload and HuggingFace branches.
    default_query = (
        "Analyze this dataset — explore its structure, key patterns, and notable findings."
    )

    hf_enabled = constant.HF_DATASET_DOWNLOAD_ENABLED

    source_options = ["Upload local file"]
    if hf_enabled:
        source_options.append("HuggingFace dataset")
    source_options.append("Generate hypothetical data")

    # The radio lives outside the form so that changing the source reruns the
    # script right away; widgets inside st.form only report on submit.
    data_source = st.radio(
        "Data Source",
        source_options,
        horizontal=True,
        key="data_source_radio",
    )

    with st.form("data_form", clear_on_submit=True):
        st.markdown("### Analyze Your Data")
        st.caption(
            "Upload a dataset, enter a HuggingFace repo name, or let SciDER generate synthetic data. "
            "The AI agent explores structure, runs statistical analysis, and searches for related metrics in the literature."
        )

        if data_source == "Generate hypothetical data":
            feature_desc = st.text_area(
                "Describe the data you want to generate",
                placeholder=(
                    "e.g. A dataset about house prices with features: "
                    "square footage (1000-5000 sq ft), number of bedrooms (1-6), "
                    "age of house (0-100 years), price ($100k-$1M)"
                ),
                height=150,
                help="Describe the features, their ranges, and the domain of the dataset.",
            )
            num_rows = st.number_input(
                "Number of rows",
                min_value=10,
                max_value=100000,
                value=1000,
                step=100,
            )
            query = st.text_input(
                "Analysis query",
                placeholder="What would you like to analyze about this data?",
            )
            submitted = st.form_submit_button("Generate & Analyze")
            if submitted:
                if not feature_desc or not feature_desc.strip():
                    st.error("Please describe the data you want to generate.")
                    return None
                feature_desc = feature_desc.strip()
                # Treat a whitespace-only query as missing, matching the
                # other two branches.
                if not query or not query.strip():
                    query = f"Analyze this synthetic dataset: {feature_desc[:200]}"
                return {
                    "type": "data_hypo",
                    "feature_desc": feature_desc,
                    "num_rows": num_rows,
                    "query": query,
                }

        elif data_source == "HuggingFace dataset":
            hf_repo = st.text_input(
                "HuggingFace Dataset Repo",
                placeholder="e.g. scikit-learn/iris",
                help="Enter a HuggingFace dataset repository name. It will be downloaded automatically.",
            )
            query = st.text_input(
                "Query",
                placeholder="e.g. What features most strongly predict the target variable?",
            )
            submitted = st.form_submit_button("Analyze Dataset")
            if submitted:
                if not hf_repo or not hf_repo.strip():
                    st.error("Please enter a HuggingFace dataset repository name.")
                    return None
                if not query or not query.strip():
                    query = default_query
                # The repo name travels in the "path" slot of the config.
                return {"type": "data", "path": hf_repo.strip(), "query": query}

        else:
            st.caption("Upload a zip file containing your dataset")
            uploaded_zip = st.file_uploader(
                "Upload ZIP dataset (optional)",
                type=["zip"],
                help="Upload a zip file containing your dataset. Extracted temporarily, deleted on reset.",
            )
            if st.session_state.get("uploaded_data_path"):
                st.info(f"Using uploaded data: `{st.session_state.uploaded_data_path}`")
            query = st.text_input(
                "Query",
                placeholder="e.g. What features most strongly predict the target variable?",
            )
            submitted = st.form_submit_button("Analyze Dataset")
            if submitted:
                if not query or not query.strip():
                    query = default_query
                path_to_use = None
                if uploaded_zip:
                    # A fresh upload replaces whatever was extracted before.
                    cleanup_uploaded_data()
                    # presumably returns a Path (or a falsy value on failure)
                    # -- verify against utils.save_and_extract_upload
                    extracted = save_and_extract_upload(uploaded_zip)
                    if extracted and extracted.exists():
                        extracted = extracted.resolve()
                        st.session_state.uploaded_data_path = str(extracted)
                        st.session_state.workspace_path = extracted.parent
                        path_to_use = str(extracted)
                        st.success(f"File uploaded and extracted to: {path_to_use}")
                    else:
                        st.error(
                            f"Failed to process uploaded zip file. Extracted path: {extracted}"
                        )
                elif st.session_state.get("uploaded_data_path"):
                    # No new file: fall back to the path remembered in the
                    # session, provided it still exists.
                    path = Path(st.session_state.uploaded_data_path).resolve()
                    if path.exists():
                        path_to_use = str(path)
                        st.session_state.workspace_path = path.parent
                    else:
                        st.warning(f"Previously uploaded path no longer exists: {path}")
                        cleanup_uploaded_data()
                if path_to_use:
                    # Final re-check right before handing the path off.
                    verify_path = Path(path_to_use).resolve()
                    if not verify_path.exists():
                        st.error(f"Path does not exist: {path_to_use}")
                    else:
                        return {"type": "data", "path": str(verify_path), "query": query}
                else:
                    # Also reached after a failed extraction or a stale
                    # session path, in addition to the message shown above.
                    st.error("Please upload a zip file or enter a data path.")
    return None