File size: 7,390 Bytes
978fed5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""Data analysis workflow form and runner."""

from pathlib import Path

import streamlit as st
from loguru import logger
from utils import cleanup_uploaded_data, save_and_extract_upload

from scider.core import constant
from scider.workflows.data_workflow import DataWorkflow
from scider.workflows.hypo_data_workflow import HypoDataWorkflow


def run_data(path, q, workspace_path):
    """Run the data analysis workflow on a local path or dataset name.

    Called from a background thread.

    Args:
        path: Filesystem path to the dataset, or (when no such path exists
            locally) a HuggingFace dataset repo name such as "org/dataset".
        q: Unused here; kept for signature compatibility with the caller.
        workspace_path: Directory the workflow uses as its workspace.

    Returns:
        Tuple of (markdown result string, intermediate state list).
    """
    # Keep the raw string when it is not a local path — HF repo names must
    # not be normalized through Path (see placeholder "scikit-learn/iris").
    data_path = Path(path) if Path(path).exists() else path  # may be HF repo name
    logger.info(f"Running data analysis on path: {data_path}")

    w = DataWorkflow(
        # Bug fix: previously recomputed inline as Path(str(path)), which
        # wrapped HF repo names in Path and left data_path unused.
        data_path=data_path,
        workspace_path=workspace_path,
        recursion_limit=100,
    )
    w.run()
    # Intermediate state is surfaced even on failure so the UI can show progress.
    intermediate_state = getattr(w, "data_agent_intermediate_state", [])
    if w.final_status != "success":
        error_msg = w.error_message or "Data workflow failed"
        return f"Data workflow failed: {error_msg}", intermediate_state
    out = ["## Data Analysis Complete"]
    if w.data_summary:
        out.append(w.data_summary)
    return "\n\n".join(out), intermediate_state


def run_hypo_data(feature_desc, num_rows, query, workspace_path):
    """Generate a synthetic dataset and analyze it. Called from background thread.

    Args:
        feature_desc: Free-text description of the features to synthesize.
        num_rows: Number of rows to generate.
        query: Analysis question to run against the generated data.
        workspace_path: Directory the workflow uses as its workspace.

    Returns:
        Tuple of (markdown result string, empty intermediate-state list).
    """
    logger.info(f"Running hypothetical data generation: {feature_desc[:100]}...")

    workflow = HypoDataWorkflow(
        feature_desc=feature_desc,
        workspace_path=workspace_path,
        num_rows=num_rows,
        user_query=query,
        recursion_limit=100,
    )
    workflow.run()

    if workflow.final_status == "success":
        sections = ["## Hypothetical Data Analysis Complete"]
        if workflow.data_summary:
            sections.append(workflow.data_summary)
        return "\n\n".join(sections), []

    reason = workflow.error_message or "Hypothetical data workflow failed"
    return f"Workflow failed: {reason}", []


# Fallback analysis query used when the user submits without entering one.
_DEFAULT_QUERY = "Analyze this dataset — explore its structure, key patterns, and notable findings."


def _hypo_data_fields():
    """Render the hypothetical-data branch of the form; return config dict or None."""
    feature_desc = st.text_area(
        "Describe the data you want to generate",
        placeholder=(
            "e.g. A dataset about house prices with features: "
            "square footage (1000-5000 sq ft), number of bedrooms (1-6), "
            "age of house (0-100 years), price ($100k-$1M)"
        ),
        height=150,
        help="Describe the features, their ranges, and the domain of the dataset.",
    )
    num_rows = st.number_input(
        "Number of rows",
        min_value=10,
        max_value=100000,
        value=1000,
        step=100,
    )
    query = st.text_input(
        "Analysis query",
        placeholder="What would you like to analyze about this data?",
    )
    submitted = st.form_submit_button("Generate & Analyze")
    if not submitted:
        return None
    if not feature_desc or not feature_desc.strip():
        st.error("Please describe the data you want to generate.")
        return None
    return {
        "type": "data_hypo",
        "feature_desc": feature_desc.strip(),
        "num_rows": num_rows,
        # Empty query falls back to a description-derived default.
        "query": query
        or f"Analyze this synthetic dataset: {feature_desc.strip()[:200]}",
    }


def _hf_dataset_fields():
    """Render the HuggingFace-dataset branch of the form; return config dict or None."""
    hf_repo = st.text_input(
        "HuggingFace Dataset Repo",
        placeholder="e.g. scikit-learn/iris",
        help="Enter a HuggingFace dataset repository name. It will be downloaded automatically.",
    )
    query = st.text_input("Query", placeholder="What would you like to analyze?")
    submitted = st.form_submit_button("Run Data Analysis")
    if not submitted:
        return None
    if not hf_repo or not hf_repo.strip():
        st.error("Please enter a HuggingFace dataset repository name.")
        return None
    if not query or not query.strip():
        query = _DEFAULT_QUERY
    return {"type": "data", "path": hf_repo.strip(), "query": query}


def _resolve_upload_path(uploaded_zip):
    """Resolve the dataset path from a fresh upload or prior session state.

    Returns the path string, or None (after surfacing an error/warning) when
    no usable path is available. Side effects: updates session state and may
    delete stale uploaded data.
    """
    if uploaded_zip:
        # A fresh upload replaces any previously extracted data.
        cleanup_uploaded_data()
        extracted = save_and_extract_upload(uploaded_zip)
        if extracted and extracted.exists():
            extracted = extracted.resolve()
            st.session_state.uploaded_data_path = str(extracted)
            st.session_state.workspace_path = extracted.parent
            st.success(f"File uploaded and extracted to: {extracted}")
            return str(extracted)
        st.error(
            f"Failed to process uploaded zip file. Extracted path: {extracted}"
        )
        return None
    if st.session_state.get("uploaded_data_path"):
        path = Path(st.session_state.uploaded_data_path).resolve()
        if path.exists():
            st.session_state.workspace_path = path.parent
            return str(path)
        st.warning(f"Previously uploaded path no longer exists: {path}")
        cleanup_uploaded_data()
    return None


def _upload_fields():
    """Render the local-upload branch of the form; return config dict or None."""
    st.caption("Upload a zip dataset or enter a path to existing data")
    uploaded_zip = st.file_uploader(
        "Upload ZIP dataset (optional)",
        type=["zip"],
        help="Upload a zip file containing your dataset. Extracted temporarily, deleted on reset.",
    )
    if st.session_state.get("uploaded_data_path"):
        st.info(f"Using uploaded data: `{st.session_state.uploaded_data_path}`")
    query = st.text_input("Query", placeholder="What would you like to analyze?")
    submitted = st.form_submit_button("Run Data Analysis")
    if not submitted:
        return None
    if not query or not query.strip():
        query = _DEFAULT_QUERY
    path_to_use = _resolve_upload_path(uploaded_zip)
    if path_to_use:
        # Re-verify: the extracted/stored path may have been removed meanwhile.
        verify_path = Path(path_to_use).resolve()
        if not verify_path.exists():
            st.error(f"Path does not exist: {path_to_use}")
            return None
        return {"type": "data", "path": str(verify_path), "query": query}
    st.error("Please upload a zip file or enter a data path.")
    return None


def render_form():
    """Render the data analysis form. Returns workflow_config dict or None."""
    hf_enabled = constant.HF_DATASET_DOWNLOAD_ENABLED

    # Build the source choices; HF appears only when downloads are enabled.
    source_options = ["Upload local file"]
    if hf_enabled:
        source_options.append("HuggingFace dataset")
    source_options.append("Generate hypothetical data")

    # The radio lives outside the form so switching sources rerenders fields.
    data_source = st.radio(
        "Data Source",
        source_options,
        horizontal=True,
        key="data_source_radio",
    )

    with st.form("data_form", clear_on_submit=True):
        st.markdown("### Data Analysis Workflow")
        if data_source == "Generate hypothetical data":
            return _hypo_data_fields()
        if data_source == "HuggingFace dataset":
            return _hf_dataset_fields()
        return _upload_fields()