Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,6 @@ import time
|
|
| 5 |
import logging
|
| 6 |
import hashlib
|
| 7 |
import uuid
|
| 8 |
-
import threading
|
| 9 |
from datetime import datetime
|
| 10 |
|
| 11 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
|
@@ -1026,29 +1025,35 @@ def process_upload(uploaded_files, target_prefix, scope):
|
|
| 1026 |
try:
|
| 1027 |
all_new_chunks = []
|
| 1028 |
all_new_sources = []
|
| 1029 |
-
|
| 1030 |
-
with st.spinner("正在处理文档,请稍候..."):
|
| 1031 |
for f in uploaded_files:
|
| 1032 |
try:
|
|
|
|
| 1033 |
f.seek(0)
|
| 1034 |
_save_uploaded_file_to_storage(scope, f)
|
|
|
|
|
|
|
| 1035 |
f.seek(0)
|
| 1036 |
raw_text = extract_text(f)
|
| 1037 |
if not raw_text.strip():
|
|
|
|
| 1038 |
continue
|
| 1039 |
chunks = _get_text_splitter().split_text(raw_text)
|
| 1040 |
all_new_chunks.extend(chunks)
|
| 1041 |
all_new_sources.extend([f.name] * len(chunks))
|
| 1042 |
except Exception as file_err:
|
| 1043 |
logger.error(f"文件 {f.name} 处理失败: {file_err}", exc_info=True)
|
|
|
|
| 1044 |
|
| 1045 |
if all_new_chunks:
|
|
|
|
| 1046 |
batch_size = 64
|
| 1047 |
all_vecs = []
|
| 1048 |
for i in range(0, len(all_new_chunks), batch_size):
|
| 1049 |
batch = all_new_chunks[i:i + batch_size]
|
| 1050 |
all_vecs.extend(encode_texts(batch))
|
| 1051 |
|
|
|
|
| 1052 |
file_groups = {}
|
| 1053 |
for chunk, vec, src in zip(all_new_chunks, all_vecs, all_new_sources):
|
| 1054 |
file_groups.setdefault(src, ([], []))
|
|
@@ -1058,9 +1063,12 @@ def process_upload(uploaded_files, target_prefix, scope):
|
|
| 1058 |
for src_file, (chunks, vecs) in file_groups.items():
|
| 1059 |
_save_chunks_to_db(scope, chunks, vecs, src_file)
|
| 1060 |
|
|
|
|
| 1061 |
ukey = f"_upload_ver_{target_prefix}"
|
| 1062 |
st.session_state[ukey] = st.session_state.get(ukey, 0) + 1
|
|
|
|
| 1063 |
st.session_state[f"_sync_count_{target_prefix}"] = _count_chunks(scope)
|
|
|
|
| 1064 |
_list_uploaded_files_db.clear()
|
| 1065 |
st.toast(f"✅ 导入 {len(all_new_chunks)} 个切片")
|
| 1066 |
st.rerun()
|
|
@@ -1075,6 +1083,7 @@ def process_upload(uploaded_files, target_prefix, scope):
|
|
| 1075 |
# =========================
|
| 1076 |
# 6.5 聊天记录持久化(Supabase chat_history 表)
|
| 1077 |
# =========================
|
|
|
|
| 1078 |
|
| 1079 |
|
| 1080 |
def _async_run(fn, *args):
|
|
@@ -1273,6 +1282,7 @@ with st.sidebar:
|
|
| 1273 |
col_del.button("🗑", key=f"delpriv_{fname}")
|
| 1274 |
|
| 1275 |
priv_upload_key = f"upload_private_{st.session_state.get('_upload_ver_private', 0)}"
|
|
|
|
| 1276 |
def _on_priv_upload_change():
|
| 1277 |
st.session_state["_pending_upload_private"] = True
|
| 1278 |
priv_files = st.file_uploader(
|
|
@@ -1283,6 +1293,7 @@ with st.sidebar:
|
|
| 1283 |
key=priv_upload_key,
|
| 1284 |
on_change=_on_priv_upload_change,
|
| 1285 |
)
|
|
|
|
| 1286 |
if priv_files and st.session_state.pop("_pending_upload_private", False):
|
| 1287 |
process_upload(priv_files, "private", PRIVATE_SCOPE)
|
| 1288 |
|
|
|
|
| 5 |
import logging
|
| 6 |
import hashlib
|
| 7 |
import uuid
|
|
|
|
| 8 |
from datetime import datetime
|
| 9 |
|
| 10 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
|
|
|
| 1025 |
try:
|
| 1026 |
all_new_chunks = []
|
| 1027 |
all_new_sources = []
|
| 1028 |
+
with st.spinner("正在自动解析文档并更新索引..."):
|
|
|
|
| 1029 |
for f in uploaded_files:
|
| 1030 |
try:
|
| 1031 |
+
# 保存原始文件到 Supabase Storage
|
| 1032 |
f.seek(0)
|
| 1033 |
_save_uploaded_file_to_storage(scope, f)
|
| 1034 |
+
|
| 1035 |
+
# 解析文本
|
| 1036 |
f.seek(0)
|
| 1037 |
raw_text = extract_text(f)
|
| 1038 |
if not raw_text.strip():
|
| 1039 |
+
st.warning(f"文件 {f.name} 内容为空,已跳过。")
|
| 1040 |
continue
|
| 1041 |
chunks = _get_text_splitter().split_text(raw_text)
|
| 1042 |
all_new_chunks.extend(chunks)
|
| 1043 |
all_new_sources.extend([f.name] * len(chunks))
|
| 1044 |
except Exception as file_err:
|
| 1045 |
logger.error(f"文件 {f.name} 处理失败: {file_err}", exc_info=True)
|
| 1046 |
+
st.warning(f"⚠️ 文件 {f.name} 处理失败:{str(file_err)[:100]},已跳过。")
|
| 1047 |
|
| 1048 |
if all_new_chunks:
|
| 1049 |
+
# 分批编码
|
| 1050 |
batch_size = 64
|
| 1051 |
all_vecs = []
|
| 1052 |
for i in range(0, len(all_new_chunks), batch_size):
|
| 1053 |
batch = all_new_chunks[i:i + batch_size]
|
| 1054 |
all_vecs.extend(encode_texts(batch))
|
| 1055 |
|
| 1056 |
+
# 按 source_file 分组写入 Supabase
|
| 1057 |
file_groups = {}
|
| 1058 |
for chunk, vec, src in zip(all_new_chunks, all_vecs, all_new_sources):
|
| 1059 |
file_groups.setdefault(src, ([], []))
|
|
|
|
| 1063 |
for src_file, (chunks, vecs) in file_groups.items():
|
| 1064 |
_save_chunks_to_db(scope, chunks, vecs, src_file)
|
| 1065 |
|
| 1066 |
+
# 递增上传组件 key
|
| 1067 |
ukey = f"_upload_ver_{target_prefix}"
|
| 1068 |
st.session_state[ukey] = st.session_state.get(ukey, 0) + 1
|
| 1069 |
+
# 立即刷新缓存的切片计数
|
| 1070 |
st.session_state[f"_sync_count_{target_prefix}"] = _count_chunks(scope)
|
| 1071 |
+
# 清除文件列表缓存
|
| 1072 |
_list_uploaded_files_db.clear()
|
| 1073 |
st.toast(f"✅ 导入 {len(all_new_chunks)} 个切片")
|
| 1074 |
st.rerun()
|
|
|
|
| 1083 |
# =========================
|
| 1084 |
# 6.5 聊天记录持久化(Supabase chat_history 表)
|
| 1085 |
# =========================
|
| 1086 |
+
import threading
|
| 1087 |
|
| 1088 |
|
| 1089 |
def _async_run(fn, *args):
|
|
|
|
| 1282 |
col_del.button("🗑", key=f"delpriv_{fname}")
|
| 1283 |
|
| 1284 |
priv_upload_key = f"upload_private_{st.session_state.get('_upload_ver_private', 0)}"
|
| 1285 |
+
# 使用 on_change 回调标记有新文件上传
|
| 1286 |
def _on_priv_upload_change():
|
| 1287 |
st.session_state["_pending_upload_private"] = True
|
| 1288 |
priv_files = st.file_uploader(
|
|
|
|
| 1293 |
key=priv_upload_key,
|
| 1294 |
on_change=_on_priv_upload_change,
|
| 1295 |
)
|
| 1296 |
+
# 检查是否有待处理的上传(通过 on_change 标记)
|
| 1297 |
if priv_files and st.session_state.pop("_pending_upload_private", False):
|
| 1298 |
process_upload(priv_files, "private", PRIVATE_SCOPE)
|
| 1299 |
|