# Streamlit page for the data-import step: upload or path-import files,
# show a summary of the loaded DataFrame, and chat with the loading agent.
import os
from typing import List, Optional

import pandas as pd
import streamlit as st
import streamlit_antd_components as sac

from workflow.dataloading.dataloading_core import (
    process_complex_data,
    load_from_path,
    load_concat_file,
    PathFileWrapper,
)

# Number of rows shown in the data preview tab (head or random sample).
_PREVIEW_ROWS = 10


def _parse_path_lines(raw_paths):
    """Split the text-area content into a clean list of unique paths.

    Each line is stripped of surrounding whitespace and quotes. Blank
    entries are dropped and duplicates are removed while keeping the
    order in which the user typed them, so the same file is never
    processed twice in one batch.
    """
    cleaned = (
        line.strip().strip("'\"") for line in raw_paths.strip().split('\n')
    )
    return list(dict.fromkeys(path for path in cleaned if path))


def _ingest_files(agent, files, names) -> None:
    """Run the processing pipeline on ``files`` and store the result.

    Args:
        agent: Data-loading agent that receives the results.
        files: File-like objects handed to ``process_complex_data``.
        names: Identifiers recorded through ``agent.save_file_name`` so the
            same inputs are skipped on later runs.

    When processing yields no main DataFrame nothing is stored and the
    script is not rerun.
    """
    with st.spinner("正在处理数据..."):
        df, dfs = process_complex_data(files, agent)
        if df is None:
            return
        agent.add_df(df)
        agent.save_dfs(dfs)
        for name in names:
            agent.save_file_name(name)
    # NOTE(review): callers wrap this in ``except Exception``; this relies on
    # Streamlit's rerun signal not deriving from ``Exception`` -- confirm for
    # the pinned Streamlit version.
    st.rerun()


def loading_data_file(agent) -> None:
    """Render the data-upload panel (local upload tab and path-import tab).

    Files already recorded by ``agent.load_file_name()`` are skipped. After
    the selected tab is handled, ``load_concat_file`` is invoked when the
    agent holds two or more DataFrames.
    """
    st.info(
        "💡 提示:\n"
        "1. 支持一次上传多个数据文件\n"
        "2. 自动使用大模型分析并处理数据\n"
        "3. 支持多种格式的文件类型上传\n"
    )

    selected_index = sac.tabs(
        [
            sac.TabsItem(label='本地上传'),
            sac.TabsItem(label='路径导入'),
        ],
        color='#5980AE',
    )

    if selected_index == "本地上传":
        # Upload through the browser widget.
        uploaded_files = st.file_uploader(
            "选择新文件",
            accept_multiple_files=True,
            help="拖拽或点击上传多个文件",
        )

        if uploaded_files:
            known_names = agent.load_file_name()
            new_files = [f for f in uploaded_files if f.name not in known_names]

            if new_files:
                try:
                    _ingest_files(agent, new_files, [f.name for f in new_files])
                except Exception as err:
                    st.error(f"导入失败:{err}")

    elif selected_index == "路径导入":
        # Import from paths on the machine running the app.
        raw_paths = st.text_area(
            "从路径导入数据 (每行一个文件路径)",
            placeholder="C:\\data\\iris.names\nC:\\data\\iris.data",
            height=100,
        )

        if st.button("从路径加载文件", use_container_width=True) and raw_paths:
            # Check each path once and sort it into the matching bucket.
            valid_paths, invalid_paths = [], []
            for path in _parse_path_lines(raw_paths):
                bucket = valid_paths if os.path.exists(path) else invalid_paths
                bucket.append(path)

            if invalid_paths:
                st.warning("路径不存在,已跳过:\n- " + "\n- ".join(invalid_paths))

            if not valid_paths:
                st.error("未找到任何有效的本地文件路径。")
            else:
                known_names = agent.load_file_name()
                new_paths = [p for p in valid_paths if p not in known_names]

                if not new_paths:
                    st.info("所有指定的路径文件均已加载。")
                else:
                    wrapped = [PathFileWrapper(p) for p in new_paths]
                    try:
                        _ingest_files(agent, wrapped, new_paths)
                    except Exception as err:
                        st.error(f"本地文件读取失败:{err}")

    dfs = agent.load_dfs()
    if dfs is not None and len(dfs) >= 2:
        load_concat_file(dfs, agent)


def loading_basic_info(agent) -> None:
    """Render summary metrics, a dtype overview and a preview of the data.

    Does nothing when the agent has no DataFrame loaded.
    """
    df = agent.load_df()
    if df is None:
        return

    n_rows, n_cols = df.shape
    missing = int(df.isnull().sum().sum())

    col1, col2, col3 = st.columns(3)
    col1.metric("行数", n_rows)
    col2.metric("列数", n_cols)
    col3.metric("缺失值总数", missing)

    dtype_info = pd.DataFrame({
        "列名": df.columns,
        "类型": df.dtypes.astype(str),
        "非空": df.count().values,
        "缺失%": (df.isnull().mean() * 100).round(2).values,
    }).reset_index(drop=True)

    selected_index = sac.tabs(
        [
            sac.TabsItem(label='数据类型概览'),
            sac.TabsItem(label='数据预览'),
        ],
        color='#5980AE',
    )

    if selected_index == "数据类型概览":
        st.dataframe(dtype_info, use_container_width=True)
    elif selected_index == "数据预览":
        if st.button("🎲 随机抽样"):
            # Cap the sample size: sampling without replacement raises
            # ValueError when asked for more rows than the frame holds.
            preview = df.sample(min(_PREVIEW_ROWS, len(df)))
        else:
            preview = df.head(_PREVIEW_ROWS)
        st.dataframe(preview, use_container_width=True)


def loading_chat(agent, auto=False) -> None:
    """Render the chat panel used to describe the loaded data.

    Args:
        agent: Data-loading agent providing the DataFrame, the chat memory
            and ``do_data_description``.
        auto: When true, the description is requested automatically unless
            the history already holds an assistant message containing "含义".

    Does nothing when the agent has no DataFrame loaded.
    """
    df = agent.load_df()
    if df is None:
        return

    with st.chat_message("assistant"):
        st.write(
            "我是 Autostat 数据分析助手,很高兴为您服务!\n\n"
            "请先上传您的数据文件,上传完成后,您可以在下方和我对话,也可以直接点击按钮解析数据含义。"
        )
        analyze_btn = st.button("🔍 解析含义")

    # Unused afterwards, but the call itself emits an element, so it is kept.
    result_placeholder = st.empty()

    # Replay the stored conversation; only plain-text entries are rendered.
    chat_history = agent.load_memory()
    for entry in chat_history:
        bubble = st.chat_message(entry["role"])
        content = entry["content"]
        if isinstance(content, str):
            bubble.write(content)

    already_generated = any(
        entry["role"] == "assistant" and "含义" in str(entry["content"])
        for entry in chat_history
    )

    if analyze_btn or (auto and not already_generated):
        st.chat_message("user").write("请帮我解析数据含义")
        agent.add_memory({"role": "user", "content": "请帮我解析数据含义"})
        with st.spinner("分析中..."):
            desc = agent.do_data_description(df)
            agent.finish_auto()
        st.chat_message("assistant").write(desc)
        agent.add_memory({"role": "assistant", "content": desc})
        st.rerun()

    # Free-form user request.
    user_input = st.chat_input("请输入需求,例如“帮我分析xx列”")
    if user_input:
        st.chat_message("user").write(user_input)
        agent.add_memory({"role": "user", "content": user_input})
        with st.spinner("处理中…"):
            reply = agent.do_data_description(df, user_input)
        st.chat_message("assistant").write(reply)
        agent.add_memory({"role": "assistant", "content": reply})
        st.rerun()


if __name__ == "__main__":

    agent = st.session_state.data_loading_agent
    planner = st.session_state.planner_agent
    auto = planner.loading_auto

    # The explicit ``== True`` / ``== False`` comparisons are kept on purpose:
    # switching to truthiness would change the outcome for None or other
    # non-bool values stored in these flags.
    if st.session_state.auto_mode == True:
        loading_done = (
            agent.finish_auto_task == True and planner.switched_prep == False
        )
        if loading_done or planner.loading_auto == False:
            planner.finish_loading_auto()
            st.switch_page("workflow/preprocessing/preprocessing_render.py")

    c1, c2 = st.columns(2)
    with c1:
        st.title("数据导入")
    with c2:
        # Two empty writes push the link buttons down next to the title.
        st.write("")
        st.write("")
        sac.buttons(
            [
                sac.ButtonsItem(
                    label='Github',
                    icon='github',
                    href='https://github.com/Jiaye-s-Group/AutoSTAT',
                ),
                sac.ButtonsItem(
                    label='Doc',
                    icon=sac.BsIcon(name='bi bi-file-earmark-post-fill', size=16),
                    href='https://autostat.cc/docs/',
                ),
                sac.ButtonsItem(
                    label='Web',
                    icon=sac.BsIcon(name='bi bi-globe2', size=16),
                    href='https://autostat.cc/',
                ),
            ],
            align='end',
            color='dark',
            variant='filled',
            index=None,
        )

    st.markdown("---")

    c = st.columns(2)
    with c[0].expander('数据上传', True):
        loading_data_file(agent)
    with c[1].expander('数据建议', True):
        loading_chat(agent, auto)
    with c[0].expander('数据展示', True):
        loading_basic_info(agent)