Spaces:
Running
Running
File size: 5,557 Bytes
342e4c4 896a9c8 342e4c4 a8d3c7a 342e4c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import io
import traceback
import numpy as np
import pandas as pd
import streamlit as st
from streamlit_ace import st_ace
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler
from utils.sanitize_code import sanitize_code
from workflow.preprocessing.preprocessing_core import prep_meta_execution, prep_code_gen
def prep_basic_info(agent):
    """Render headline statistics and a per-column summary of the agent's data.

    Three metrics (row count, column count, total number of missing cells)
    are shown side by side, followed by a table with one row per column
    giving its name, dtype, non-null count and missing-value percentage.

    Args:
        agent: Object exposing ``load_df()``, which returns the DataFrame
            to summarise.
    """
    df = agent.load_df()

    # Headline statistics
    n_rows, n_cols = df.shape
    null_mask = df.isnull()
    total_missing = int(null_mask.sum().sum())

    headline = (
        ("行数", n_rows),
        ("列数", n_cols),
        ("缺失值总数", total_missing),
    )
    for slot, (label, value) in zip(st.columns(3), headline):
        slot.metric(label, value)

    # Per-column overview. The dtype column is a Series indexed by column
    # name, so the frame is re-indexed positionally before display.
    missing_pct = (null_mask.mean() * 100).round(2)
    overview = pd.DataFrame({
        '列名': df.columns,
        '类型': df.dtypes.astype(str),
        '非空值数量': df.count().values,
        '缺失值比例(%)': missing_pct.values,
    }).reset_index(drop=True)

    st.dataframe(overview, use_container_width=True)
def prep_execution(agent, auto=False):
    """Run the agent's stored preprocessing code against its loaded data.

    The code and the DataFrame are both read from ``agent`` and handed to
    ``prep_meta_execution``. Nothing is returned; presumably the helper
    stores the processed frame on the agent (TODO confirm).

    Args:
        agent: Object exposing ``load_code()`` and ``load_df()``.
        auto: Forwarded unchanged as the ``auto`` keyword of
            ``prep_meta_execution``.
    """
    prep_meta_execution(agent, agent.load_code(), agent.load_df(), auto=auto)
def prep_result(agent):
    """Show before/after previews and offer the processed data as a CSV download.

    Nothing is rendered while the agent has no processed DataFrame.

    Args:
        agent: Object exposing ``load_processed_df()`` and ``load_df()``.
    """
    processed = agent.load_processed_df()
    original = agent.load_df()
    if processed is None:
        return

    # First ten rows of each frame, before and after preprocessing
    st.write("处理前数据预览:", original.head(10))
    st.write("处理后数据预览:", processed.head(10))

    # With no target given, to_csv returns the CSV text directly
    payload = processed.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="⬇️ 下载处理后数据",
        data=payload,
        file_name="processed_data.csv",
        mime="text/csv",
    )
def _reply_with_suggestions(agent, spinner_text, *request_args):
    """Ask the agent for suggestions, persist them, and show the reply.

    Args:
        agent: The preprocessing agent driving the conversation.
        spinner_text: Label shown on the spinner while the agent works.
        *request_args: Positional arguments forwarded to
            ``agent.get_preprocessing_suggestions`` (empty for the default
            recommendation, or the user's message).
    """
    with st.spinner(spinner_text):
        reply = agent.get_preprocessing_suggestions(*request_args)
        agent.save_preprocessing_suggestions(reply)
        # Read the frame from the agent rather than from a module-level
        # ``df``, so this works no matter where the caller lives.
        agent.refine_suggestions(agent.load_df().head(10).to_string())
    st.chat_message("assistant").write(reply)
    agent.add_memory({'role': 'assistant', 'content': reply})


def prep_chat(agent, auto=False):
    """Render the conversational preprocessing-suggestion area.

    Shows a greeting, a "recommend" and a "clear" button, the stored chat
    history, and a chat input. Suggestions are requested when the recommend
    button is pressed, when ``auto`` is set and no assistant message
    mentioning preprocessing exists yet, or when the user types a message.

    Args:
        agent: The preprocessing agent; supplies the chat memory, the
            suggestions and the DataFrame whose first rows are used to
            refine them.
        auto: When true, request suggestions without waiting for a click,
            unless they were already generated.
    """
    # NOTE(review): source indentation was lost; the buttons are assumed to
    # sit below the greeting bubble, not inside it — confirm.
    with st.chat_message("assistant"):
        st.write("我是 Autostat 数据分析助手,很高兴为您服务!\n\n"
                 "您可以在下方输入预处理需求,或直接点击按钮获取预处理建议。")

    c = st.columns(2)
    with c[0]:
        analyze_btn = st.button("🔍 预处理推荐", key='prep_suggest', use_container_width=True)
    with c[1]:
        clear_prep_suggest = st.button("♻️ 清除预处理分析", key='clear_prep_suggest', use_container_width=True)

    if clear_prep_suggest:
        agent.clear_memory()
        agent.preprocessing_suggestions = None

    # Replay the chat history. Only string content is written; an entry with
    # any other content still gets its (empty) bubble.
    chat_history = agent.load_memory()
    for entry in chat_history:
        bubble = st.chat_message(entry["role"])
        content = entry["content"]
        if isinstance(content, str):
            bubble.write(content)

    already_generated = any(
        entry["role"] == "assistant" and "预处理" in str(entry["content"])
        for entry in chat_history
    )

    # Automatic or button-triggered recommendation
    if analyze_btn or (auto and not already_generated):
        st.chat_message("user").write("请给我预处理建议")
        agent.add_memory({'role': 'user', 'content': "请给我预处理建议"})
        _reply_with_suggestions(agent, "生成建议中…")

    # Free-form natural-language request from the user
    user_input = st.chat_input("请输入您的问题")
    if user_input:
        st.chat_message("user").write(user_input)
        agent.add_memory({'role': 'user', 'content': user_input})
        agent.save_user_input(user_input)
        _reply_with_suggestions(agent, "处理中…", user_input)
if __name__ == '__main__':
    # Page entry point: header, data guard, auto-mode hand-off, then the
    # two-column preprocessing layout.
    st.title("数据预处理与标准化")
    st.markdown("---")

    data_loading_agent = st.session_state.data_loading_agent
    df = data_loading_agent.load_df()
    planner = st.session_state.planner_agent
    auto = planner.prep_auto

    # Without loaded data there is nothing to preprocess; st.stop() halts
    # the script run here.
    if df is None:
        st.warning("⚠️ 请先在数据导入页面加载数据")
        st.stop()

    agent = st.session_state.data_preprocess_agent
    agent.add_df(df)

    # In auto mode, move on to the visualization page once the automatic
    # task has finished (and the switch has not happened yet) or when
    # automatic preprocessing is disabled. The explicit == True / == False
    # comparisons are kept as-is because the flag types are not visible here.
    if st.session_state.auto_mode == True and (
        (agent.finish_auto_task == True and planner.switched_prep == False)
        or planner.prep_auto == False
    ):
        planner.finish_prep_auto()
        st.switch_page("workflow/visualization/viz_render.py")

    # The execution and result panels start expanded only once code exists.
    code = agent.load_code()
    code_expand = code is not None

    left, right = st.columns(2)

    with left.expander('预处理展示', True):
        prep_basic_info(agent)

    # NOTE(review): source indentation was lost; prep_code_gen is assumed to
    # render inside the suggestion expander — confirm.
    with right.expander('预处理建议', True):
        prep_chat(agent, auto)
        prep_code_gen(agent, auto=auto)

    with left.expander('预处理执行', code_expand):
        prep_execution(agent, auto)

    with left.expander('预处理结果', code_expand):
        prep_result(agent)