Spaces:

ElvisWang111
/

AutoSTAT

Running

File size: 7,961 Bytes

import os
from typing import List, Optional

import pandas as pd
import streamlit as st
import streamlit_antd_components as sac

from workflow.dataloading.dataloading_core import process_complex_data, load_from_path, load_concat_file, PathFileWrapper


def _parse_path_lines(raw_paths):
    """Turn the text-area content into a clean, de-duplicated list of paths.

    Each line is stripped of surrounding whitespace and then of surrounding
    quote characters. Entries that end up empty are dropped, and a path that
    appears more than once is kept only at its first position so the same
    file is not processed twice within one import.
    """
    cleaned = [
        line.strip().strip("'\"")
        for line in raw_paths.strip().split('\n')
        if line.strip()
    ]
    # dict.fromkeys keeps insertion order while removing repeats.
    return list(dict.fromkeys(p for p in cleaned if p))


def _ingest_new_files(agent, files, names, error_prefix):
    """Process *files* and register the result on *agent*.

    Args:
        agent: Object providing ``add_df``, ``save_dfs`` and ``save_file_name``.
        files: File objects handed to ``process_complex_data``.
        names: Identifiers (file names or paths) recorded through
            ``agent.save_file_name`` once a DataFrame was produced.
        error_prefix: User-facing text placed before the exception message
            when processing fails.
    """
    try:
        with st.spinner("正在处理数据..."):
            df, dfs = process_complex_data(files, agent)
        if df is not None:
            agent.add_df(df)
            agent.save_dfs(dfs)
            for name in names:
                agent.save_file_name(name)
            # Restart the script run now that the agent state has changed.
            st.rerun()
    except Exception as err:
        # UI boundary: surface the failure to the user instead of crashing the page.
        st.error(f"{error_prefix}：{err}")


def _import_from_paths(agent, raw_paths):
    """Validate the paths typed by the user and ingest those not yet loaded."""
    path_list = _parse_path_lines(raw_paths)

    # Partition in a single pass so every path is checked on disk only once.
    valid_paths, invalid_paths = [], []
    for path in path_list:
        (valid_paths if os.path.exists(path) else invalid_paths).append(path)

    if invalid_paths:
        st.warning("路径不存在，已跳过：\n- " + "\n- ".join(invalid_paths))

    if not valid_paths:
        st.error("未找到任何有效的本地文件路径。")
        return

    already_loaded = agent.load_file_name()
    new_paths = [p for p in valid_paths if p not in already_loaded]
    if not new_paths:
        st.info("所有指定的路径文件均已加载。")
        return

    files_to_process = [PathFileWrapper(p) for p in new_paths]
    _ingest_new_files(agent, files_to_process, new_paths, "本地文件读取失败")


def loading_data_file(agent):
    """Render the data-import panel and load any newly supplied files.

    Two tabs are offered: uploading files through the browser, or typing
    local file paths (one per line). Files whose name/path is not yet listed
    by ``agent.load_file_name()`` are passed to ``process_complex_data``; on
    success the resulting data is stored on the agent and the page is rerun.
    Finally, when the agent holds two or more DataFrames,
    ``load_concat_file`` is invoked with them.

    Args:
        agent: Data-loading agent that stores DataFrames and loaded file names.
    """
    st.info(
        "💡 提示：\n"
        "1. 支持一次上传多个数据文件\n"
        "2. 自动使用大模型分析并处理数据\n"
        "3. 支持多种格式的文件类型上传\n"
    )

    selected_index = sac.tabs([
        sac.TabsItem(label='本地上传'),
        sac.TabsItem(label='路径导入'),
    ], color='#5980AE',)

    if selected_index == "本地上传":
        # Upload through the browser widget.
        uploaded_files = st.file_uploader(
            "选择新文件",
            accept_multiple_files=True,
            help="拖拽或点击上传多个文件",
        )

        if uploaded_files:
            already_loaded = agent.load_file_name()
            new_files = [f for f in uploaded_files if f.name not in already_loaded]
            if new_files:
                _ingest_new_files(
                    agent,
                    new_files,
                    [f.name for f in new_files],
                    "导入失败",
                )

    elif selected_index == "路径导入":
        # Import from paths on the machine running the app.
        raw_paths = st.text_area(
            "从路径导入数据 (每行一个文件路径)",
            placeholder=    "C:\\data\\iris.names\nC:\\data\\iris.data",
            height=100
        )

        if st.button("从路径加载文件", use_container_width=True):
            if raw_paths:
                _import_from_paths(agent, raw_paths)

    dfs = agent.load_dfs()
    if dfs is not None and len(dfs) >= 2:
        load_concat_file(dfs, agent)


def loading_basic_info(agent):
    """Show summary metrics, a per-column overview and a preview of the data.

    Nothing is rendered while ``agent.load_df()`` returns ``None``. Otherwise
    the panel shows row/column/missing-value counts, followed by two tabs:
    a per-column table (name, dtype, non-null count, missing percentage) and
    a preview of up to ten rows (first rows, or a random sample on demand).

    Args:
        agent: Data-loading agent exposing ``load_df()``.
    """
    df = agent.load_df()
    if df is None:
        return

    preview_rows = 10

    r, c = df.shape
    missing = int(df.isnull().sum().sum())
    col1, col2, col3 = st.columns(3)
    col1.metric("行数", r)
    col2.metric("列数", c)
    col3.metric("缺失值总数", missing)

    dtype_info = pd.DataFrame({
        "列名": df.columns,
        "类型": df.dtypes.astype(str),
        "非空": df.count().values,
        "缺失%": (df.isnull().mean() * 100).round(2).values,
    }).reset_index(drop=True)

    selected_index = sac.tabs([
        sac.TabsItem(label='数据类型概览'),
        sac.TabsItem(label='数据预览'),
    ],color='#5980AE',)

    if selected_index == "数据类型概览":
        st.dataframe(dtype_info, use_container_width=True)
    elif selected_index == "数据预览":
        if st.button("🎲 随机抽样"):
            # DataFrame.sample raises ValueError when asked for more rows than
            # exist (sampling without replacement), so cap the sample size.
            display_df = df.sample(min(preview_rows, len(df)))
            st.dataframe(display_df, use_container_width=True)
        else:
            st.dataframe(df.head(preview_rows), use_container_width=True)


def loading_chat(agent, auto=False) -> None:
    """Render the assistant chat panel for describing the loaded data.

    Does nothing while ``agent.load_df()`` returns ``None``. Otherwise it
    shows a greeting with an "analyze" button, replays the stored
    conversation, and then handles one of two triggers:

    * the analyze button (or ``auto`` when no assistant message containing
      "含义" exists yet) asks the agent for a data description;
    * free-text chat input is forwarded to the agent together with the data.

    Each handled request is appended to the agent's memory and ends with a
    rerun of the page.

    Args:
        agent: Data-loading agent providing the data, memory and description calls.
        auto: When true, the description is requested without a button click,
            unless one has apparently been generated already.
    """
    data = agent.load_df()
    if data is None:
        return

    def run_exchange(question, spinner_text, produce_answer, after_answer=None):
        # One full round trip: show + store the question, obtain the answer,
        # run the optional hook, show + store the answer, then rerun the page.
        st.chat_message("user").write(question)
        agent.add_memory({"role": "user", "content": question})
        with st.spinner(spinner_text):
            answer = produce_answer()

        if after_answer is not None:
            after_answer()
        st.chat_message("assistant").write(answer)
        agent.add_memory({"role": "assistant", "content": answer})
        st.rerun()

    with st.chat_message("assistant"):
        st.write(
            "我是 Autostat 数据分析助手，很高兴为您服务！\n\n"
            "请先上传您的数据文件，上传完成后，您可以在下方和我对话，也可以直接点击按钮解析数据含义。"
        )
        analyze_clicked = st.button("🔍 解析含义")
        # Placeholder element kept from the original layout; nothing is written to it.
        st.empty()

    # Replay the stored conversation; only string contents are written out.
    history = agent.load_memory()
    for message in history:
        bubble = st.chat_message(message["role"])
        text = message["content"]
        if isinstance(text, str):
            bubble.write(text)

    # Heuristic: an assistant message mentioning "含义" counts as an existing description.
    already_generated = any(
        message["role"] == "assistant" and "含义" in str(message["content"])
        for message in history
    )

    if analyze_clicked or (auto and not already_generated):
        run_exchange(
            "请帮我解析数据含义",
            "分析中...",
            lambda: agent.do_data_description(data),
            lambda: agent.finish_auto(),
        )

    # Free-form request typed by the user.
    user_input = st.chat_input("请输入需求，例如“帮我分析xx列”")
    if user_input:
        run_exchange(
            user_input,
            "处理中…",
            lambda: agent.do_data_description(data, user_input),
        )


if __name__ == "__main__":

    # Page entry point: fetch the agents kept in the Streamlit session.
    agent = st.session_state.data_loading_agent
    planner = st.session_state.planner_agent
    auto = planner.loading_auto

    # In auto mode, hand over to the preprocessing page once loading is done
    # (or when automatic loading is switched off).
    # NOTE(review): the explicit == True / == False comparisons are kept on
    # purpose; plain truthiness would behave differently for non-bool values.
    if st.session_state.auto_mode == True:
        loading_finished = agent.finish_auto_task == True and planner.switched_prep == False
        if loading_finished or planner.loading_auto == False:
            planner.finish_loading_auto()
            st.switch_page("workflow/preprocessing/preprocessing_render.py")

    # Header row: page title on the left, external links on the right.
    title_col, links_col = st.columns(2)
    with title_col:
        st.title("数据导入")
    with links_col:
        # Two empty writes act as vertical spacing above the buttons.
        for _ in range(2):
            st.write("")
        link_buttons = [
            sac.ButtonsItem(label='Github', icon='github', href='https://github.com/Jiaye-s-Group/AutoSTAT'),
            sac.ButtonsItem(label='Doc', icon=sac.BsIcon(name='bi bi-file-earmark-post-fill', size=16), href='https://autostat.cc/docs/'),
            sac.ButtonsItem(label='Web', icon=sac.BsIcon(name='bi bi-globe2', size=16), href='https://autostat.cc/'),
        ]
        sac.buttons(link_buttons, align='end', color='dark', variant='filled', index=None)
    st.markdown("---")

    # Body: upload and data display in the left column, chat in the right one.
    # The call order (upload, chat, display) is kept as in the original.
    left_col, right_col = st.columns(2)
    with left_col.expander('数据上传', True):
        loading_data_file(agent)
    with right_col.expander('数据建议', True):
        loading_chat(agent, auto)
    with left_col.expander('数据展示', True):
        loading_basic_info(agent)