Spaces:

AlauStone
/

rag-assistant

Sleeping

App Files Files Community

AlauStone committed on Mar 31

Commit

ec2e567

verified ·

1 Parent(s): 5ea08e1

Upload app.py

Browse files

Files changed (1) hide show

app.py +137 -136

app.py CHANGED Viewed

@@ -1,6 +1,4 @@
 import streamlit as st
-import time as _time
-_BOOT = _time.time()
 import json
 import time
 import logging
@@ -11,11 +9,6 @@ from datetime import datetime
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 logger = logging.getLogger(__name__)
-def _perf(label):
-    logger.info(f"[PERF] {label}: {_time.time()-_BOOT:.2f}s")
-_perf("stdlib imports done")
 # numpy 延迟导入
 _np_module = None
 def _get_np():
@@ -23,14 +16,13 @@ def _get_np():
     if _np_module is None:
         import numpy
         _np_module = numpy
-        _perf("numpy loaded")
     return _np_module
 # =========================
 # 1. 页面配置 & 样式注入
 # =========================
 st.set_page_config(page_title="RAG 知识库助手 v3 (HF+Supabase)", page_icon="🛡️", layout="wide")
-_perf("page_config done")
 def inject_custom_css():
@@ -59,7 +51,6 @@ def inject_custom_css():
 inject_custom_css()
 st.title("🛡️ 智能知识库助手 v3")
-_perf("CSS + title done")
 # =========================
 # 1.5 Supabase 客户端初始化
@@ -85,8 +76,6 @@ def _sb():
     return _get_supabase()
-_perf("supabase client ready")
 # =========================
 # 2. 用户管理（Supabase users 表）
 # =========================
@@ -199,10 +188,10 @@ def verify_user(username, password):
     users = _load_users()
     user_info = users.get(username)
     if not user_info or not isinstance(user_info, dict):
-        return False, None
     if user_info.get("password_hash") != _hash_password(password):
-        return False, None
-    return True, user_info.get("role", "user")
 # --- 认证 UI ---
@@ -238,11 +227,14 @@ with st.sidebar:
             if input_username == "" or input_password == "":
                 st.stop()
-            ok, role = verify_user(input_username, input_password)
             if not ok:
                 st.session_state.login_attempts += 1
                 remaining = MAX_LOGIN_ATTEMPTS - st.session_state.login_attempts
-                st.warning(f"⚠️ 用户名或密码错误（剩余 {remaining} 次）")
                 st.stop()
             else:
                 st.session_state.login_attempts = 0
@@ -263,7 +255,10 @@ with st.sidebar:
                 else:
                     ok, msg = register_user(reg_user, reg_pass, reg_code)
                     if ok:
-                        st.success(f"✅ {msg}")
                         time.sleep(1)
                         st.rerun()
                     else:
@@ -272,7 +267,7 @@ with st.sidebar:
 CURRENT_USER = st.session_state.current_user
 IS_ADMIN = st.session_state.current_role == "admin"
-_perf("auth done")
 # =========================
 # 3. 安全配置与 Embedding 策略
@@ -355,32 +350,6 @@ def encode_query(text):
 # 4. Supabase 索引管理（替代本地文件）
 # =========================
-def _load_library(scope):
-    """从 Supabase documents 表加载指定 scope 的所有文档切片。
-    返回 (docs, embeddings, sources)。"""
-    np = _get_np()
-    try:
-        resp = _sb().table("documents").select(
-            "content, embedding, source_file"
-        ).eq("scope", scope).execute()
-        docs = []
-        embeddings = []
-        sources = []
-        for row in resp.data:
-            docs.append(row["content"])
-            emb = row["embedding"]
-            # Supabase REST API 可能返回字符串格式的向量，需要解析
-            if isinstance(emb, str):
-                emb = json.loads(emb)
-            embeddings.append(np.array(emb, dtype=np.float32))
-            sources.append(row["source_file"])
-        return docs, embeddings, sources
-    except Exception as e:
-        logger.error(f"加载索引失败 [scope={scope}]: {e}")
-        return [], [], []
 def _save_chunks_to_db(scope, chunks, vectors, source_file):
     """将新切片批量写入 Supabase documents 表。"""
     rows = []
@@ -523,52 +492,29 @@ def _clear_uploaded_files_storage(scope):
         logger.warning(f"清空文件失败 [scope={scope}]: {e}")
-# --- 初始化 session_state 中的缓存 ---
-def _init_library(key_prefix, scope):
-    """加载 Supabase 中的索引到 session_state。"""
-    docs_key = f"{key_prefix}_docs"
-    emb_key = f"{key_prefix}_embeddings"
-    src_key = f"{key_prefix}_sources"
-    loaded_key = f"{key_prefix}_loaded"
-    if docs_key not in st.session_state or not st.session_state.get(loaded_key):
-        docs, embeddings, sources = _load_library(scope)
-        st.session_state[docs_key] = docs
-        st.session_state[emb_key] = embeddings
-        st.session_state[src_key] = sources
-        st.session_state[loaded_key] = True
-def _refresh_library(key_prefix, scope):
-    """强制从 Supabase 重新加载索引到 session_state。"""
-    docs, embeddings, sources = _load_library(scope)
-    st.session_state[f"{key_prefix}_docs"] = docs
-    st.session_state[f"{key_prefix}_embeddings"] = embeddings
-    st.session_state[f"{key_prefix}_sources"] = sources
-_perf("before init_library")
 PUBLIC_SCOPE = "public"
-_init_library("public", PUBLIC_SCOPE)
 PRIVATE_SCOPE = CURRENT_USER  # 私有库 scope = 用户名
-_init_library("private", PRIVATE_SCOPE)
-_perf("init_library done")
-def _get_embeddings_np(key_prefix):
-    np = _get_np()
-    np_key = f"{key_prefix}_embeddings_np"
-    ver_key = f"{key_prefix}_emb_version"
-    emb_key = f"{key_prefix}_embeddings"
-    emb_list = st.session_state.get(emb_key, [])
-    current_ver = id(emb_list)
-    if np_key not in st.session_state or st.session_state.get(ver_key) != current_ver:
-        if emb_list:
-            st.session_state[np_key] = np.array(emb_list)
-        else:
-            st.session_state[np_key] = np.array([])
-        st.session_state[ver_key] = current_ver
-    return st.session_state[np_key]
 # =========================
@@ -605,7 +551,6 @@ def _get_text_splitter():
     if _text_splitter_cache is None:
         from langchain_text_splitters import RecursiveCharacterTextSplitter
         _text_splitter_cache = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
-        _perf("text_splitter loaded")
     return _text_splitter_cache
 SYSTEM_PROMPT = (
@@ -730,9 +675,6 @@ def process_upload(uploaded_files, target_prefix, scope):
                 for src_file, (chunks, vecs) in file_groups.items():
                     _save_chunks_to_db(scope, chunks, vecs, src_file)
-                # 刷新 session_state 缓存
-                _refresh_library(target_prefix, scope)
                 st.session_state[fp_key] = file_fingerprint
                 # 递增上传组件 key
                 ukey = f"_upload_ver_{target_prefix}"
@@ -772,9 +714,9 @@ model_mapping = {
     "🏢 百度文心 (官方)": "ernie-3.5-8k",
 }
-_perf("before sidebar UI")
 with st.sidebar:
-    pub_chunk_count = len(st.session_state.get("public_docs", []))
     with st.expander(f"📚 公共知识库（{pub_chunk_count} 切片）"):
         st.caption("所有人可搜索")
@@ -789,7 +731,6 @@ with st.sidebar:
                     if col_del.button("🗑", key=f"delpub_{fname}", help=f"删除 {fname}"):
                         _delete_chunks_by_file(PUBLIC_SCOPE, fname)
                         _delete_uploaded_file_from_storage(PUBLIC_SCOPE, fname)
-                        _refresh_library("public", PUBLIC_SCOPE)
                         st.success(f"已删除 {fname}")
                         time.sleep(0.5)
                         st.rerun()
@@ -812,7 +753,6 @@ with st.sidebar:
                 if st.button("🗑️ 清空公共库", use_container_width=True, type="secondary", key="clear_pub"):
                     _clear_all_chunks(PUBLIC_SCOPE)
                     _clear_uploaded_files_storage(PUBLIC_SCOPE)
-                    _refresh_library("public", PUBLIC_SCOPE)
                     st.success("公共知识库已清空。")
                     time.sleep(0.5)
                     st.rerun()
@@ -820,7 +760,7 @@ with st.sidebar:
             st.caption("*仅管理员可维护公共库*")
     # --- 私有知识库 ---
-    priv_chunk_count = len(st.session_state.get("private_docs", []))
     with st.expander(f"🔒 我的私有库（{priv_chunk_count} 切片）"):
         st.caption(f"用户：{CURRENT_USER}，仅自己可见")
@@ -833,7 +773,6 @@ with st.sidebar:
                 if col_del.button("🗑", key=f"delpriv_{fname}", help=f"删除 {fname}"):
                     _delete_chunks_by_file(PRIVATE_SCOPE, fname)
                     _delete_uploaded_file_from_storage(PRIVATE_SCOPE, fname)
-                    _refresh_library("private", PRIVATE_SCOPE)
                     st.success(f"已删除 {fname}")
                     time.sleep(0.5)
                     st.rerun()
@@ -853,7 +792,6 @@ with st.sidebar:
             if st.button("🗑️ 清空我的私有库", use_container_width=True, type="secondary", key="clear_priv"):
                 _clear_all_chunks(PRIVATE_SCOPE)
                 _clear_uploaded_files_storage(PRIVATE_SCOPE)
-                _refresh_library("private", PRIVATE_SCOPE)
                 st.success("私有知识库已清空。")
                 time.sleep(0.5)
                 st.rerun()
@@ -878,7 +816,7 @@ with st.sidebar:
         new_pass1 = st.text_input("新密码", type="password", key="self_new_pass1")
         new_pass2 = st.text_input("确认新密码", type="password", key="self_new_pass2")
         if st.button("✅ 确认修改", key="btn_change_pass"):
-            ok, _ = verify_user(CURRENT_USER, old_pass)
             if not ok:
                 st.error("当前密码错误")
             elif len(new_pass1) < 4:
@@ -971,49 +909,59 @@ with st.sidebar:
                     display_users[k] = v
             st.json(display_users)
-    # --- 清空聊天记录 ---
-    with st.expander("🧹 清空聊天记录"):
-        st.caption("清空后不可恢复")
-        if st.button("确认清空", use_container_width=True, type="secondary", key="btn_clear_chat"):
-            st.session_state.messages = []
-            st.rerun()
 # =========================
 # 8. 核心搜索逻辑（合并公共库 + 私有库）
 # =========================
-def _cosine_scores(query_vec, matrix):
-    np = _get_np()
-    query_norm = np.linalg.norm(query_vec)
-    if query_norm < 1e-10:
-        return np.zeros(matrix.shape[0])
-    mat_norms = np.linalg.norm(matrix, axis=1)
-    mat_norms = np.maximum(mat_norms, 1e-10)
-    return (matrix @ query_vec) / (mat_norms * query_norm)
 def search_local(query, top_k, threshold):
     query_vec = encode_query(query)
-    all_results = []
-    pub_docs = st.session_state.get("public_docs", [])
-    pub_np = _get_embeddings_np("public")
-    if pub_docs and pub_np.size > 0:
-        scores = _cosine_scores(query_vec, pub_np)
-        for i, s in enumerate(scores):
-            if s > threshold:
-                all_results.append((float(s), pub_docs[i]))
-    priv_docs = st.session_state.get("private_docs", [])
-    priv_np = _get_embeddings_np("private")
-    if priv_docs and priv_np.size > 0:
-        scores = _cosine_scores(query_vec, priv_np)
-        for i, s in enumerate(scores):
-            if s > threshold:
-                all_results.append((float(s), priv_docs[i]))
-    all_results.sort(key=lambda x: x[0], reverse=True)
-    return [doc for _, doc in all_results[:top_k]]
 # =========================
@@ -1106,11 +1054,63 @@ def llm_answer(query, context_docs, selected_display_name, web_enabled):
     yield "❌ 抱歉，所有免费和收费线路均暂时不可用。"
 # =========================
 # 10. 聊天渲染
 # =========================
 if "messages" not in st.session_state:
-    st.session_state.messages = []
 for m in st.session_state.messages:
     with st.chat_message(m["role"]):
@@ -1120,6 +1120,7 @@ for m in st.session_state.messages:
 if q := st.chat_input("输入问题...", key="chat_input_v3"):
     st.session_state.messages.append({"role": "user", "content": q})
     with st.chat_message("user"):
         st.markdown(q)
@@ -1142,8 +1143,8 @@ if q := st.chat_input("输入问题...", key="chat_input_v3"):
             st.session_state.messages.append(
                 {"role": "assistant", "content": full_response, "meta": meta_info}
             )
         except Exception as e:
             logger.error(f"模型调用异常: {e}")
             container.error(f"❌ 抱歉，连接模型时出错了: {str(e)}")
-_perf("script execution complete")

 import streamlit as st
 import json
 import time
 import logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
 logger = logging.getLogger(__name__)
 # numpy 延迟导入
 _np_module = None
 def _get_np():
     if _np_module is None:
         import numpy
         _np_module = numpy
     return _np_module
 # =========================
 # 1. 页面配置 & 样式注入
 # =========================
 st.set_page_config(page_title="RAG 知识库助手 v3 (HF+Supabase)", page_icon="🛡️", layout="wide")
 def inject_custom_css():
 inject_custom_css()
 st.title("🛡️ 智能知识库助手 v3")
 # =========================
 # 1.5 Supabase 客户端初始化
     return _get_supabase()
 # =========================
 # 2. 用户管理（Supabase users 表）
 # =========================
     users = _load_users()
     user_info = users.get(username)
     if not user_info or not isinstance(user_info, dict):
+        return False, None, "not_found"
     if user_info.get("password_hash") != _hash_password(password):
+        return False, None, "wrong_password"
+    return True, user_info.get("role", "user"), ""
 # --- 认证 UI ---
             if input_username == "" or input_password == "":
                 st.stop()
+            ok, role, reason = verify_user(input_username, input_password)
             if not ok:
                 st.session_state.login_attempts += 1
                 remaining = MAX_LOGIN_ATTEMPTS - st.session_state.login_attempts
+                if reason == "not_found":
+                    st.warning(f"⚠️ 用户不存在，请先注册（剩余 {remaining} 次）")
+                else:
+                    st.warning(f"⚠️ 密码错误（剩余 {remaining} 次）")
                 st.stop()
             else:
                 st.session_state.login_attempts = 0
                 else:
                     ok, msg = register_user(reg_user, reg_pass, reg_code)
                     if ok:
+                        st.session_state.current_user = reg_user
+                        st.session_state.current_role = "user"
+                        st.session_state.login_attempts = 0
+                        st.success(f"✅ 注册成功，已自动登录")
                         time.sleep(1)
                         st.rerun()
                     else:
 CURRENT_USER = st.session_state.current_user
 IS_ADMIN = st.session_state.current_role == "admin"
 # =========================
 # 3. 安全配置与 Embedding 策略
 # 4. Supabase 索引管理（替代本地文件）
 # =========================
 def _save_chunks_to_db(scope, chunks, vectors, source_file):
     """将新切片批量写入 Supabase documents 表。"""
     rows = []
         logger.warning(f"清空文件失败 [scope={scope}]: {e}")
 PUBLIC_SCOPE = "public"
 PRIVATE_SCOPE = CURRENT_USER  # 私有库 scope = 用户名
+# --- 定时同步：检测其他用户对文档库的修改 ---
+_SYNC_INTERVAL = 30  # 每 30 秒检查一次
+def _check_and_sync():
+    """检测文档数量变化，用于多用户同步感知。"""
+    now = time.time()
+    last_check = st.session_state.get("_sync_last_check", 0)
+    if now - last_check < _SYNC_INTERVAL:
+        return
+    st.session_state["_sync_last_check"] = now
+    for scope, label in [(PUBLIC_SCOPE, "public"), (PRIVATE_SCOPE, "private")]:
+        count_key = f"_sync_count_{label}"
+        current_count = _count_chunks(scope)
+        prev_count = st.session_state.get(count_key, -1)
+        if prev_count >= 0 and current_count != prev_count:
+            logger.info(f"[SYNC] {label} 库变更: {prev_count} -> {current_count}")
+        st.session_state[count_key] = current_count
+_check_and_sync()
 # =========================
     if _text_splitter_cache is None:
         from langchain_text_splitters import RecursiveCharacterTextSplitter
         _text_splitter_cache = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
     return _text_splitter_cache
 SYSTEM_PROMPT = (
                 for src_file, (chunks, vecs) in file_groups.items():
                     _save_chunks_to_db(scope, chunks, vecs, src_file)
                 st.session_state[fp_key] = file_fingerprint
                 # 递增上传组件 key
                 ukey = f"_upload_ver_{target_prefix}"
     "🏢 百度文心 (官方)": "ernie-3.5-8k",
 }
 with st.sidebar:
+    pub_chunk_count = st.session_state.get("_sync_count_public", _count_chunks(PUBLIC_SCOPE))
     with st.expander(f"📚 公共知识库（{pub_chunk_count} 切片）"):
         st.caption("所有人可搜索")
                     if col_del.button("🗑", key=f"delpub_{fname}", help=f"删除 {fname}"):
                         _delete_chunks_by_file(PUBLIC_SCOPE, fname)
                         _delete_uploaded_file_from_storage(PUBLIC_SCOPE, fname)
                         st.success(f"已删除 {fname}")
                         time.sleep(0.5)
                         st.rerun()
                 if st.button("🗑️ 清空公共库", use_container_width=True, type="secondary", key="clear_pub"):
                     _clear_all_chunks(PUBLIC_SCOPE)
                     _clear_uploaded_files_storage(PUBLIC_SCOPE)
                     st.success("公共知识库已清空。")
                     time.sleep(0.5)
                     st.rerun()
             st.caption("*仅管理员可维护公共库*")
     # --- 私有知识库 ---
+    priv_chunk_count = st.session_state.get("_sync_count_private", _count_chunks(PRIVATE_SCOPE))
     with st.expander(f"🔒 我的私有库（{priv_chunk_count} 切片）"):
         st.caption(f"用户：{CURRENT_USER}，仅自己可见")
                 if col_del.button("🗑", key=f"delpriv_{fname}", help=f"删除 {fname}"):
                     _delete_chunks_by_file(PRIVATE_SCOPE, fname)
                     _delete_uploaded_file_from_storage(PRIVATE_SCOPE, fname)
                     st.success(f"已删除 {fname}")
                     time.sleep(0.5)
                     st.rerun()
             if st.button("🗑️ 清空我的私有库", use_container_width=True, type="secondary", key="clear_priv"):
                 _clear_all_chunks(PRIVATE_SCOPE)
                 _clear_uploaded_files_storage(PRIVATE_SCOPE)
                 st.success("私有知识库已清空。")
                 time.sleep(0.5)
                 st.rerun()
         new_pass1 = st.text_input("新密码", type="password", key="self_new_pass1")
         new_pass2 = st.text_input("确认新密码", type="password", key="self_new_pass2")
         if st.button("✅ 确认修改", key="btn_change_pass"):
+            ok, _, _ = verify_user(CURRENT_USER, old_pass)
             if not ok:
                 st.error("当前密码错误")
             elif len(new_pass1) < 4:
                     display_users[k] = v
             st.json(display_users)
+    # --- 聊天记录管理 ---
+    with st.expander("💬 聊天记录"):
+        hist_tab_new, hist_tab_history = st.tabs(["当前对话", "历史记录"])
+        with hist_tab_new:
+            st.caption("清空当前对话（数据库记录保留）")
+            if st.button("🧹 清空当前对话", use_container_width=True, type="secondary", key="btn_clear_chat"):
+                st.session_state.messages = []
+                st.rerun()
+            st.caption("清空所有历史记录（不可恢复）")
+            if st.button("🗑️ 清空全部记录", use_container_width=True, type="secondary", key="btn_clear_all_hist"):
+                _clear_chat_history_db(CURRENT_USER)
+                st.session_state.messages = []
+                st.success("所有聊天记录已清空")
+                time.sleep(0.5)
+                st.rerun()
+        with hist_tab_history:
+            if st.button("🔄 加载历史记录", use_container_width=True, key="btn_load_hist"):
+                st.session_state["_show_history"] = True
+            if st.session_state.get("_show_history"):
+                history = _load_chat_history(CURRENT_USER, limit=100)
+                if not history:
+                    st.info("暂无历史记录")
+                else:
+                    st.caption(f"共 {len(history)} 条记录")
+                    for msg in history:
+                        ts = msg.get("created_at", "")[:16].replace("T", " ")
+                        icon = "🧑" if msg["role"] == "user" else "🤖"
+                        preview = msg["content"][:80].replace("\n", " ")
+                        st.text(f"{icon} [{ts}] {preview}{'...' if len(msg['content']) > 80 else ''}")
 # =========================
 # 8. 核心搜索逻辑（合并公共库 + 私有库）
 # =========================
 def search_local(query, top_k, threshold):
+    """使用 pgvector 数据库端向量搜索（替代内存计算）。"""
     query_vec = encode_query(query)
+    scopes = [PUBLIC_SCOPE, PRIVATE_SCOPE]
+    try:
+        resp = _sb().rpc("match_documents", {
+            "query_embedding": query_vec.tolist() if hasattr(query_vec, 'tolist') else list(query_vec),
+            "match_scopes": scopes,
+            "match_threshold": float(threshold),
+            "match_count": int(top_k),
+        }).execute()
+        return [row["content"] for row in resp.data] if resp.data else []
+    except Exception as e:
+        logger.error(f"pgvector 搜索失败: {e}")
+        return []
 # =========================
     yield "❌ 抱歉，所有免费和收费线路均暂时不可用。"
+# =========================
+# 9.5 聊天记录持久化（Supabase chat_history 表）
+# =========================
+def _save_chat_message(username, role, content, meta=""):
+    """保存单条聊天消息到数据库。"""
+    try:
+        _sb().table("chat_history").insert({
+            "username": username,
+            "role": role,
+            "content": content,
+            "meta": meta or "",
+        }).execute()
+    except Exception as e:
+        logger.warning(f"保存聊天记录失败: {e}")
+def _load_chat_history(username, limit=50):
+    """加载用户最近的聊天记录。"""
+    try:
+        resp = _sb().table("chat_history").select(
+            "role, content, meta, created_at"
+        ).eq("username", username).order(
+            "created_at", desc=True
+        ).limit(limit).execute()
+        if not resp.data:
+            return []
+        # 反转回时间正序
+        rows = list(reversed(resp.data))
+        return [
+            {"role": r["role"], "content": r["content"],
+             "meta": r.get("meta", ""), "created_at": r.get("created_at", "")}
+            for r in rows
+        ]
+    except Exception as e:
+        logger.warning(f"加载聊天记录失败: {e}")
+        return []
+def _clear_chat_history_db(username):
+    """清空用户在数据库中的所有聊天记录。"""
+    try:
+        _sb().table("chat_history").delete().eq("username", username).execute()
+    except Exception as e:
+        logger.warning(f"清空聊天记录失败: {e}")
 # =========================
 # 10. 聊天渲染
 # =========================
 if "messages" not in st.session_state:
+    # 首次加载时从数据库恢复最近对话
+    saved = _load_chat_history(CURRENT_USER, limit=50)
+    st.session_state.messages = [
+        {"role": m["role"], "content": m["content"],
+         **({"meta": m["meta"]} if m.get("meta") else {})}
+        for m in saved
+    ]
 for m in st.session_state.messages:
     with st.chat_message(m["role"]):
 if q := st.chat_input("输入问题...", key="chat_input_v3"):
     st.session_state.messages.append({"role": "user", "content": q})
+    _save_chat_message(CURRENT_USER, "user", q)
     with st.chat_message("user"):
         st.markdown(q)
             st.session_state.messages.append(
                 {"role": "assistant", "content": full_response, "meta": meta_info}
             )
+            _save_chat_message(CURRENT_USER, "assistant", full_response, meta_info)
         except Exception as e:
             logger.error(f"模型调用异常: {e}")
             container.error(f"❌ 抱歉，连接模型时出错了: {str(e)}")