AlauStone committed on
Commit
df8e3ca
·
verified ·
1 Parent(s): 8d51928

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -3
app.py CHANGED
@@ -5,7 +5,6 @@ import time
5
  import logging
6
  import hashlib
7
  import uuid
8
- import threading
9
  from datetime import datetime
10
 
11
  logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
@@ -1026,29 +1025,35 @@ def process_upload(uploaded_files, target_prefix, scope):
1026
  try:
1027
  all_new_chunks = []
1028
  all_new_sources = []
1029
-
1030
- with st.spinner("正在处理文档,请稍候..."):
1031
  for f in uploaded_files:
1032
  try:
 
1033
  f.seek(0)
1034
  _save_uploaded_file_to_storage(scope, f)
 
 
1035
  f.seek(0)
1036
  raw_text = extract_text(f)
1037
  if not raw_text.strip():
 
1038
  continue
1039
  chunks = _get_text_splitter().split_text(raw_text)
1040
  all_new_chunks.extend(chunks)
1041
  all_new_sources.extend([f.name] * len(chunks))
1042
  except Exception as file_err:
1043
  logger.error(f"文件 {f.name} 处理失败: {file_err}", exc_info=True)
 
1044
 
1045
  if all_new_chunks:
 
1046
  batch_size = 64
1047
  all_vecs = []
1048
  for i in range(0, len(all_new_chunks), batch_size):
1049
  batch = all_new_chunks[i:i + batch_size]
1050
  all_vecs.extend(encode_texts(batch))
1051
 
 
1052
  file_groups = {}
1053
  for chunk, vec, src in zip(all_new_chunks, all_vecs, all_new_sources):
1054
  file_groups.setdefault(src, ([], []))
@@ -1058,9 +1063,12 @@ def process_upload(uploaded_files, target_prefix, scope):
1058
  for src_file, (chunks, vecs) in file_groups.items():
1059
  _save_chunks_to_db(scope, chunks, vecs, src_file)
1060
 
 
1061
  ukey = f"_upload_ver_{target_prefix}"
1062
  st.session_state[ukey] = st.session_state.get(ukey, 0) + 1
 
1063
  st.session_state[f"_sync_count_{target_prefix}"] = _count_chunks(scope)
 
1064
  _list_uploaded_files_db.clear()
1065
  st.toast(f"✅ 导入 {len(all_new_chunks)} 个切片")
1066
  st.rerun()
@@ -1075,6 +1083,7 @@ def process_upload(uploaded_files, target_prefix, scope):
1075
  # =========================
1076
  # 6.5 聊天记录持久化(Supabase chat_history 表)
1077
  # =========================
 
1078
 
1079
 
1080
  def _async_run(fn, *args):
@@ -1273,6 +1282,7 @@ with st.sidebar:
1273
  col_del.button("🗑", key=f"delpriv_{fname}")
1274
 
1275
  priv_upload_key = f"upload_private_{st.session_state.get('_upload_ver_private', 0)}"
 
1276
  def _on_priv_upload_change():
1277
  st.session_state["_pending_upload_private"] = True
1278
  priv_files = st.file_uploader(
@@ -1283,6 +1293,7 @@ with st.sidebar:
1283
  key=priv_upload_key,
1284
  on_change=_on_priv_upload_change,
1285
  )
 
1286
  if priv_files and st.session_state.pop("_pending_upload_private", False):
1287
  process_upload(priv_files, "private", PRIVATE_SCOPE)
1288
 
 
5
  import logging
6
  import hashlib
7
  import uuid
 
8
  from datetime import datetime
9
 
10
  logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 
1025
  try:
1026
  all_new_chunks = []
1027
  all_new_sources = []
1028
+ with st.spinner("正在自动解析文档并更新索引..."):
 
1029
  for f in uploaded_files:
1030
  try:
1031
+ # 保存原始文件到 Supabase Storage
1032
  f.seek(0)
1033
  _save_uploaded_file_to_storage(scope, f)
1034
+
1035
+ # 解析文本
1036
  f.seek(0)
1037
  raw_text = extract_text(f)
1038
  if not raw_text.strip():
1039
+ st.warning(f"文件 {f.name} 内容为空,已跳过。")
1040
  continue
1041
  chunks = _get_text_splitter().split_text(raw_text)
1042
  all_new_chunks.extend(chunks)
1043
  all_new_sources.extend([f.name] * len(chunks))
1044
  except Exception as file_err:
1045
  logger.error(f"文件 {f.name} 处理失败: {file_err}", exc_info=True)
1046
+ st.warning(f"⚠️ 文件 {f.name} 处理失败:{str(file_err)[:100]},已跳过。")
1047
 
1048
  if all_new_chunks:
1049
+ # 分批编码
1050
  batch_size = 64
1051
  all_vecs = []
1052
  for i in range(0, len(all_new_chunks), batch_size):
1053
  batch = all_new_chunks[i:i + batch_size]
1054
  all_vecs.extend(encode_texts(batch))
1055
 
1056
+ # 按 source_file 分组写入 Supabase
1057
  file_groups = {}
1058
  for chunk, vec, src in zip(all_new_chunks, all_vecs, all_new_sources):
1059
  file_groups.setdefault(src, ([], []))
 
1063
  for src_file, (chunks, vecs) in file_groups.items():
1064
  _save_chunks_to_db(scope, chunks, vecs, src_file)
1065
 
1066
+ # 递增上传组件 key
1067
  ukey = f"_upload_ver_{target_prefix}"
1068
  st.session_state[ukey] = st.session_state.get(ukey, 0) + 1
1069
+ # 立即刷新缓存的切片计数
1070
  st.session_state[f"_sync_count_{target_prefix}"] = _count_chunks(scope)
1071
+ # 清除文件列表缓存
1072
  _list_uploaded_files_db.clear()
1073
  st.toast(f"✅ 导入 {len(all_new_chunks)} 个切片")
1074
  st.rerun()
 
1083
  # =========================
1084
  # 6.5 聊天记录持久化(Supabase chat_history 表)
1085
  # =========================
1086
+ import threading
1087
 
1088
 
1089
  def _async_run(fn, *args):
 
1282
  col_del.button("🗑", key=f"delpriv_{fname}")
1283
 
1284
  priv_upload_key = f"upload_private_{st.session_state.get('_upload_ver_private', 0)}"
1285
+ # 使用 on_change 回调标记有新文件上传
1286
  def _on_priv_upload_change():
1287
  st.session_state["_pending_upload_private"] = True
1288
  priv_files = st.file_uploader(
 
1293
  key=priv_upload_key,
1294
  on_change=_on_priv_upload_change,
1295
  )
1296
+ # 检查是否有待处理的上传(通过 on_change 标记)
1297
  if priv_files and st.session_state.pop("_pending_upload_private", False):
1298
  process_upload(priv_files, "private", PRIVATE_SCOPE)
1299