ElvisWang111 committed on
Commit
5e52c91
·
verified ·
1 Parent(s): 896a9c8

Update workflow/dataloading/dataloading_render.py

Browse files
workflow/dataloading/dataloading_render.py CHANGED
@@ -1,210 +1,210 @@
1
- import os
2
- from typing import List, Optional
3
-
4
- import pandas as pd
5
- import streamlit as st
6
- import streamlit_antd_components as sac
7
-
8
- from workflow.dataloading.dataloading_core import process_complex_data, load_from_path, load_concat_file, PathFileWrapper
9
-
10
-
11
def _parse_path_lines(raw_paths: Optional[str]) -> List[str]:
    """Return the cleaned, de-duplicated paths typed into the text area.

    Every line is stripped of surrounding whitespace and of leading/trailing
    quote characters; lines that end up empty are dropped. A path listed more
    than once is kept only at its first position so it is not loaded twice.
    """
    cleaned = (line.strip().strip("'\"") for line in (raw_paths or "").splitlines())
    return list(dict.fromkeys(path for path in cleaned if path))


def _ingest_files(agent, files, source_names, error_prefix: str) -> None:
    """Process *files*, store the result on *agent* and rerun the page.

    Args:
        agent: Data-loading agent; receives ``add_df``, ``save_dfs`` and
            ``save_file_name`` calls.
        files: File-like objects handed to ``process_complex_data``.
        source_names: Identifiers recorded with ``agent.save_file_name`` so
            the same sources are filtered out on later runs.
        error_prefix: Text shown in front of the exception message when the
            processing or saving step fails.

    Nothing is saved and no rerun happens when the first value returned by
    ``process_complex_data`` is ``None``.
    """
    try:
        with st.spinner("正在处理数据..."):
            df, dfs = process_complex_data(files, agent)
            if df is None:
                return
            agent.add_df(df)
            agent.save_dfs(dfs)
            for name in source_names:
                agent.save_file_name(name)
    except Exception as err:
        st.error(f"{error_prefix}{err}")
        return
    # Rerun only after a successful save, and from outside the ``try`` so the
    # rerun request can never be reported as an import failure.
    st.rerun()


def _import_from_paths(agent, raw_paths: Optional[str]) -> None:
    """Validate the typed paths and ingest the ones not loaded before."""
    path_list = _parse_path_lines(raw_paths)

    # ``isfile`` rather than ``exists``: a directory exists but cannot be read
    # as a data file, and one bad entry would fail the whole batch.
    valid_paths = [p for p in path_list if os.path.isfile(p)]
    invalid_paths = [p for p in path_list if not os.path.isfile(p)]

    if invalid_paths:
        st.warning("路径不存在或不是文件,已跳过:\n- " + "\n- ".join(invalid_paths))

    if not valid_paths:
        # Also reached when the text area was left empty, so the button click
        # always produces visible feedback.
        st.error("未找到任何有效的本地文件路径。")
        return

    known_names = agent.load_file_name()
    new_paths = [p for p in valid_paths if p not in known_names]
    if not new_paths:
        st.info("所有指定的路径文件均已加载。")
        return

    wrapped = [PathFileWrapper(p) for p in new_paths]
    _ingest_files(agent, wrapped, new_paths, "本地文件读取失败:")


def loading_data_file(agent):
    """Render the data-import panel and load newly supplied files.

    Two tabs are offered: uploading files through the browser, or typing
    paths (one per line) that are read from the machine running the app.
    Sources already recorded by ``agent.load_file_name()`` are skipped.
    After the tab content, if the agent holds two or more frames,
    ``load_concat_file`` is called with them.

    Args:
        agent: Data-loading agent providing ``load_file_name``, ``add_df``,
            ``save_dfs``, ``save_file_name`` and ``load_dfs``.
    """
    st.info(
        "💡 提示:\n"
        "1. 支持一次上传多个数据文件\n"
        "2. 自动使用大模型分析并处理数据\n"
        "3. 支持多种格式的文件类型上传\n"
    )

    selected_index = sac.tabs([
        sac.TabsItem(label='本地上传'),
        sac.TabsItem(label='路径导入'),
    ], color='#5980AE',)

    if selected_index == "本地上传":
        # Upload through the browser file picker.
        uploaded_files = st.file_uploader(
            "选择新文件",
            accept_multiple_files=True,
            help="拖拽或点击上传多个文件",
        )
        if uploaded_files:
            known_names = agent.load_file_name()
            new_files = [f for f in uploaded_files if f.name not in known_names]
            if new_files:
                _ingest_files(agent, new_files, [f.name for f in new_files], "导入失败:")

    elif selected_index == "路径导入":
        # Import from paths typed by the user.
        raw_paths = st.text_area(
            "从路径导入数据 (每行一个文件路径)",
            placeholder="C:\\data\\iris.names\nC:\\data\\iris.data",
            height=100,
        )
        if st.button("从路径加载文件", use_container_width=True):
            _import_from_paths(agent, raw_paths)

    dfs = agent.load_dfs()
    if dfs is not None and len(dfs) >= 2:
        load_concat_file(dfs, agent)
93
-
94
-
95
def loading_basic_info(agent, preview_rows: int = 10):
    """Show size metrics, a per-column summary and a row preview.

    Nothing is rendered while ``agent.load_df()`` returns ``None``.

    Args:
        agent: Data-loading agent providing ``load_df``.
        preview_rows: Maximum number of rows shown in the preview tab, both
            for the head view and for the random sample.
    """
    df = agent.load_df()
    if df is None:
        return

    n_rows, n_cols = df.shape
    missing = int(df.isnull().sum().sum())
    rows_col, cols_col, missing_col = st.columns(3)
    rows_col.metric("行数", n_rows)
    cols_col.metric("列数", n_cols)
    missing_col.metric("缺失值总数", missing)

    dtype_info = pd.DataFrame({
        "列名": df.columns,
        "类型": df.dtypes.astype(str),
        "非空": df.count().values,
        "缺失%": (df.isnull().mean() * 100).round(2).values,
    }).reset_index(drop=True)

    selected_index = sac.tabs([
        sac.TabsItem(label='数据类型概览'),
        sac.TabsItem(label='数据预览'),
    ], color='#5980AE',)

    if selected_index == "数据类型概览":
        st.dataframe(dtype_info, use_container_width=True)
    elif selected_index == "数据预览":
        if st.button("🎲 随机抽样"):
            # Cap the sample size at the frame length: asking ``sample`` for
            # more rows than exist (without replacement) raises ValueError.
            display_df = df.sample(min(preview_rows, len(df)))
            st.dataframe(display_df, use_container_width=True)
        else:
            st.dataframe(df.head(preview_rows), use_container_width=True)
126
-
127
-
128
def loading_chat(agent, auto=False) -> None:
    """Render the assistant chat panel for the loaded dataset.

    Does nothing while ``agent.load_df()`` returns ``None``. Otherwise it
    shows a greeting with an "explain the data" button, replays the stored
    conversation, and serves two kinds of requests: the data description
    (triggered by the button or by ``auto``) and free-form questions typed
    into the chat box. Each request appends the exchange to the agent memory
    and reruns the page.

    Args:
        agent: Data-loading agent providing ``load_df``, ``load_memory``,
            ``add_memory``, ``do_data_description`` and ``finish_auto``.
        auto: When truthy, the description is requested without a click, but
            only while no stored assistant message contains "含义".
    """
    df = agent.load_df()
    if df is None:
        # NOTE(review): the greeting below asks the user to upload data first,
        # yet it is only reachable once data is loaded - confirm whether the
        # greeting was meant to be shown before this early return.
        return

    with st.chat_message("assistant"):
        st.write(
            "我是 Autostat 数据分析助手,很高兴为您服务!\n\n"
            "请先上传您的数据文件,上传完成后,您可以在下方和我对话,也可以直接点击按钮解析数据含义。"
        )
        analyze_btn = st.button("🔍 解析含义")
        # Placeholder slot kept so the rendered layout is unchanged; nothing
        # in this function writes to it.
        st.empty()

    # Replay the stored conversation. Entries whose content is not a string
    # are skipped entirely so they do not leave an empty bubble behind.
    chat_history = agent.load_memory() or []
    for entry in chat_history:
        content = entry["content"]
        if isinstance(content, str):
            st.chat_message(entry["role"]).write(content)

    already_generated = any(
        entry["role"] == "assistant" and "含义" in str(entry["content"])
        for entry in chat_history
    )

    if analyze_btn or (auto and not already_generated):
        st.chat_message("user").write("请帮我解析数据含义")
        agent.add_memory({"role": "user", "content": "请帮我解析数据含义"})
        with st.spinner("分析中..."):
            desc = agent.do_data_description(df)

        agent.finish_auto()
        st.chat_message("assistant").write(desc)
        agent.add_memory({"role": "assistant", "content": desc})
        st.rerun()

    # Free-form request typed by the user.
    user_input = st.chat_input("请输入需求,例如“帮我分析xx列”")
    if user_input:
        st.chat_message("user").write(user_input)
        agent.add_memory({"role": "user", "content": user_input})
        with st.spinner("处理中…"):
            reply = agent.do_data_description(df, user_input)

        st.chat_message("assistant").write(reply)
        agent.add_memory({"role": "assistant", "content": reply})
        st.rerun()
178
-
179
-
180
if __name__ == "__main__":
    # Page entry point: fetch the agents kept in the Streamlit session.
    agent = st.session_state.data_loading_agent
    planner = st.session_state.planner_agent
    auto = planner.loading_auto

    # In auto mode, hand over to the preprocessing page once the automatic
    # loading task is finished (and the switch has not happened yet) or when
    # automatic loading is turned off. The explicit ``== True`` / ``== False``
    # comparisons are kept as written because the flag types are not visible
    # here.
    if st.session_state.auto_mode == True:
        loading_done = agent.finish_auto_task == True and planner.switched_prep == False
        if loading_done or planner.loading_auto == False:
            planner.finish_loading_auto()
            st.switch_page("workflow/preprocessing/preprocessing_render.py")

    # Header row: page title on the left, project links on the right.
    title_col, links_col = st.columns(2)
    with title_col:
        st.title("数据导入")
    with links_col:
        # Two empty writes placed before the buttons as vertical spacing.
        st.write("")
        st.write("")
        link_items = [
            sac.ButtonsItem(label='Github', icon='github', href='https://github.com/ElvisWang1111/AAAAAnystat'),
            sac.ButtonsItem(label='Doc', icon=sac.BsIcon(name='bi bi-file-earmark-post-fill', size=16), href='https://elviswang1111.github.io/anystatweb.github.io/index.html'),
        ]
        sac.buttons(link_items, align='end', color='dark', variant='filled', index=None)
    st.markdown("---")

    # Body: upload and data display in the left column, chat in the right.
    left_col, right_col = st.columns(2)
    with left_col.expander('数据上传', True):
        loading_data_file(agent)
    with right_col.expander('数据建议', True):
        loading_chat(agent, auto)
    with left_col.expander('数据展示', True):
        loading_basic_info(agent)
210
-
 
1
+ import os
2
+ from typing import List, Optional
3
+
4
+ import pandas as pd
5
+ import streamlit as st
6
+ import streamlit_antd_components as sac
7
+
8
+ from workflow.dataloading.dataloading_core import process_complex_data, load_from_path, load_concat_file, PathFileWrapper
9
+
10
+
11
def _parse_path_lines(raw_paths: Optional[str]) -> List[str]:
    """Return the cleaned, de-duplicated paths typed into the text area.

    Every line is stripped of surrounding whitespace and of leading/trailing
    quote characters; lines that end up empty are dropped. A path listed more
    than once is kept only at its first position so it is not loaded twice.
    """
    cleaned = (line.strip().strip("'\"") for line in (raw_paths or "").splitlines())
    return list(dict.fromkeys(path for path in cleaned if path))


def _ingest_files(agent, files, source_names, error_prefix: str) -> None:
    """Process *files*, store the result on *agent* and rerun the page.

    Args:
        agent: Data-loading agent; receives ``add_df``, ``save_dfs`` and
            ``save_file_name`` calls.
        files: File-like objects handed to ``process_complex_data``.
        source_names: Identifiers recorded with ``agent.save_file_name`` so
            the same sources are filtered out on later runs.
        error_prefix: Text shown in front of the exception message when the
            processing or saving step fails.

    Nothing is saved and no rerun happens when the first value returned by
    ``process_complex_data`` is ``None``.
    """
    try:
        with st.spinner("正在处理数据..."):
            df, dfs = process_complex_data(files, agent)
            if df is None:
                return
            agent.add_df(df)
            agent.save_dfs(dfs)
            for name in source_names:
                agent.save_file_name(name)
    except Exception as err:
        st.error(f"{error_prefix}{err}")
        return
    # Rerun only after a successful save, and from outside the ``try`` so the
    # rerun request can never be reported as an import failure.
    st.rerun()


def _import_from_paths(agent, raw_paths: Optional[str]) -> None:
    """Validate the typed paths and ingest the ones not loaded before."""
    path_list = _parse_path_lines(raw_paths)

    # ``isfile`` rather than ``exists``: a directory exists but cannot be read
    # as a data file, and one bad entry would fail the whole batch.
    valid_paths = [p for p in path_list if os.path.isfile(p)]
    invalid_paths = [p for p in path_list if not os.path.isfile(p)]

    if invalid_paths:
        st.warning("路径不存在或不是文件,已跳过:\n- " + "\n- ".join(invalid_paths))

    if not valid_paths:
        # Also reached when the text area was left empty, so the button click
        # always produces visible feedback.
        st.error("未找到任何有效的本地文件路径。")
        return

    known_names = agent.load_file_name()
    new_paths = [p for p in valid_paths if p not in known_names]
    if not new_paths:
        st.info("所有指定的路径文件均已加载。")
        return

    wrapped = [PathFileWrapper(p) for p in new_paths]
    _ingest_files(agent, wrapped, new_paths, "本地文件读取失败:")


def loading_data_file(agent):
    """Render the data-import panel and load newly supplied files.

    Two tabs are offered: uploading files through the browser, or typing
    paths (one per line) that are read from the machine running the app.
    Sources already recorded by ``agent.load_file_name()`` are skipped.
    After the tab content, if the agent holds two or more frames,
    ``load_concat_file`` is called with them.

    Args:
        agent: Data-loading agent providing ``load_file_name``, ``add_df``,
            ``save_dfs``, ``save_file_name`` and ``load_dfs``.
    """
    st.info(
        "💡 提示:\n"
        "1. 支持一次上传多个数据文件\n"
        "2. 自动使用大模型分析并处理数据\n"
        "3. 支持多种格式的文件类型上传\n"
    )

    selected_index = sac.tabs([
        sac.TabsItem(label='本地上传'),
        sac.TabsItem(label='路径导入'),
    ], color='#5980AE',)

    if selected_index == "本地上传":
        # Upload through the browser file picker.
        uploaded_files = st.file_uploader(
            "选择新文件",
            accept_multiple_files=True,
            help="拖拽或点击上传多个文件",
        )
        if uploaded_files:
            known_names = agent.load_file_name()
            new_files = [f for f in uploaded_files if f.name not in known_names]
            if new_files:
                _ingest_files(agent, new_files, [f.name for f in new_files], "导入失败:")

    elif selected_index == "路径导入":
        # Import from paths typed by the user.
        raw_paths = st.text_area(
            "从路径导入数据 (每行一个文件路径)",
            placeholder="C:\\data\\iris.names\nC:\\data\\iris.data",
            height=100,
        )
        if st.button("从路径加载文件", use_container_width=True):
            _import_from_paths(agent, raw_paths)

    dfs = agent.load_dfs()
    if dfs is not None and len(dfs) >= 2:
        load_concat_file(dfs, agent)
93
+
94
+
95
def loading_basic_info(agent, preview_rows: int = 10):
    """Show size metrics, a per-column summary and a row preview.

    Nothing is rendered while ``agent.load_df()`` returns ``None``.

    Args:
        agent: Data-loading agent providing ``load_df``.
        preview_rows: Maximum number of rows shown in the preview tab, both
            for the head view and for the random sample.
    """
    df = agent.load_df()
    if df is None:
        return

    n_rows, n_cols = df.shape
    missing = int(df.isnull().sum().sum())
    rows_col, cols_col, missing_col = st.columns(3)
    rows_col.metric("行数", n_rows)
    cols_col.metric("列数", n_cols)
    missing_col.metric("缺失值总数", missing)

    dtype_info = pd.DataFrame({
        "列名": df.columns,
        "类型": df.dtypes.astype(str),
        "非空": df.count().values,
        "缺失%": (df.isnull().mean() * 100).round(2).values,
    }).reset_index(drop=True)

    selected_index = sac.tabs([
        sac.TabsItem(label='数据类型概览'),
        sac.TabsItem(label='数据预览'),
    ], color='#5980AE',)

    if selected_index == "数据类型概览":
        st.dataframe(dtype_info, use_container_width=True)
    elif selected_index == "数据预览":
        if st.button("🎲 随机抽样"):
            # Cap the sample size at the frame length: asking ``sample`` for
            # more rows than exist (without replacement) raises ValueError.
            display_df = df.sample(min(preview_rows, len(df)))
            st.dataframe(display_df, use_container_width=True)
        else:
            st.dataframe(df.head(preview_rows), use_container_width=True)
126
+
127
+
128
def loading_chat(agent, auto=False) -> None:
    """Render the assistant chat panel for the loaded dataset.

    Does nothing while ``agent.load_df()`` returns ``None``. Otherwise it
    shows a greeting with an "explain the data" button, replays the stored
    conversation, and serves two kinds of requests: the data description
    (triggered by the button or by ``auto``) and free-form questions typed
    into the chat box. Each request appends the exchange to the agent memory
    and reruns the page.

    Args:
        agent: Data-loading agent providing ``load_df``, ``load_memory``,
            ``add_memory``, ``do_data_description`` and ``finish_auto``.
        auto: When truthy, the description is requested without a click, but
            only while no stored assistant message contains "含义".
    """
    df = agent.load_df()
    if df is None:
        # NOTE(review): the greeting below asks the user to upload data first,
        # yet it is only reachable once data is loaded - confirm whether the
        # greeting was meant to be shown before this early return.
        return

    with st.chat_message("assistant"):
        st.write(
            "我是 Autostat 数据分析助手,很高兴为您服务!\n\n"
            "请先上传您的数据文件,上传完成后,您可以在下方和我对话,也可以直接点击按钮解析数据含义。"
        )
        analyze_btn = st.button("🔍 解析含义")
        # Placeholder slot kept so the rendered layout is unchanged; nothing
        # in this function writes to it.
        st.empty()

    # Replay the stored conversation. Entries whose content is not a string
    # are skipped entirely so they do not leave an empty bubble behind.
    chat_history = agent.load_memory() or []
    for entry in chat_history:
        content = entry["content"]
        if isinstance(content, str):
            st.chat_message(entry["role"]).write(content)

    already_generated = any(
        entry["role"] == "assistant" and "含义" in str(entry["content"])
        for entry in chat_history
    )

    if analyze_btn or (auto and not already_generated):
        st.chat_message("user").write("请帮我解析数据含义")
        agent.add_memory({"role": "user", "content": "请帮我解析数据含义"})
        with st.spinner("分析中..."):
            desc = agent.do_data_description(df)

        agent.finish_auto()
        st.chat_message("assistant").write(desc)
        agent.add_memory({"role": "assistant", "content": desc})
        st.rerun()

    # Free-form request typed by the user.
    user_input = st.chat_input("请输入需求,例如“帮我分析xx列”")
    if user_input:
        st.chat_message("user").write(user_input)
        agent.add_memory({"role": "user", "content": user_input})
        with st.spinner("处理中…"):
            reply = agent.do_data_description(df, user_input)

        st.chat_message("assistant").write(reply)
        agent.add_memory({"role": "assistant", "content": reply})
        st.rerun()
178
+
179
+
180
if __name__ == "__main__":
    # Page entry point: fetch the agents kept in the Streamlit session.
    agent = st.session_state.data_loading_agent
    planner = st.session_state.planner_agent
    auto = planner.loading_auto

    # In auto mode, hand over to the preprocessing page once the automatic
    # loading task is finished (and the switch has not happened yet) or when
    # automatic loading is turned off. The explicit ``== True`` / ``== False``
    # comparisons are kept as written because the flag types are not visible
    # here.
    if st.session_state.auto_mode == True:
        loading_done = agent.finish_auto_task == True and planner.switched_prep == False
        if loading_done or planner.loading_auto == False:
            planner.finish_loading_auto()
            st.switch_page("workflow/preprocessing/preprocessing_render.py")

    # Header row: page title on the left, project links on the right.
    title_col, links_col = st.columns(2)
    with title_col:
        st.title("数据导入")
    with links_col:
        # Two empty writes placed before the buttons as vertical spacing.
        st.write("")
        st.write("")
        link_items = [
            sac.ButtonsItem(label='Github', icon='github', href='https://github.com/Automated-Statistician/AutoSTAT'),
            sac.ButtonsItem(label='Doc', icon=sac.BsIcon(name='bi bi-file-earmark-post-fill', size=16), href='https://automated-statistician.github.io/autostatdoc.github.io/'),
        ]
        sac.buttons(link_items, align='end', color='dark', variant='filled', index=None)
    st.markdown("---")

    # Body: upload and data display in the left column, chat in the right.
    left_col, right_col = st.columns(2)
    with left_col.expander('数据上传', True):
        loading_data_file(agent)
    with right_col.expander('数据建议', True):
        loading_chat(agent, auto)
    with left_col.expander('数据展示', True):
        loading_basic_info(agent)
210
+