Spaces:

DrPie
/

eGoV_chatbot

Sleeping

App Files Files Community

DrPie committed on Aug 28, 2025

Commit

25ffee2

verified ·

1 Parent(s): ea8833b

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -71

app.py CHANGED Viewed

@@ -1,15 +1,8 @@
-# app.py
-import os, shutil, gzip, pickle, re, json
-import numpy as np
-import faiss
-from sentence_transformers import SentenceTransformer
-from rank_bm25 import BM25Okapi
-import google.generativeai as genai
-from flask import Flask, request, jsonify
-from flask_cors import CORS
-from huggingface_hub import login, hf_hub_download
-# --- CACHE CONFIG (HF Spaces chỉ ghi vào /tmp) ---
 os.environ["HF_HOME"] = "/tmp/hf_home"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
@@ -19,93 +12,86 @@ os.makedirs("/tmp/.cache", exist_ok=True)
 shutil.rmtree("/.cache", ignore_errors=True)
 # --- LOGIN HF HUB ---
 HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
 if HF_TOKEN:
     login(HF_TOKEN)
 else:
-    print("⚠️ Warning: HF token not found. Chỉ truy cập public repo được thôi.")
 # --- LOAD DỮ LIỆU ---
-HF_REPO_ID = "DrPie/eGoV_Data"
 REPO_TYPE = "dataset"
 print("--- KHỞI ĐỘNG MÁY CHỦ CHATBOT ---")
 # Startup: download the data artefacts, configure Gemini, and load the
 # embedding model, FAISS index, chunk metadata, BM25 index and procedure map.
 #
 # NOTE(review): any failure below is only printed. Names assigned after the
 # failing statement stay undefined, so code that later reads them will raise
 # NameError instead of reporting the original startup error.
 try:
     _hub_kwargs = {"repo_id": HF_REPO_ID, "repo_type": REPO_TYPE}
     RAW_PATH = hf_hub_download(filename="toan_bo_du_lieu_final.json", **_hub_kwargs)
     FAISS_PATH = hf_hub_download(filename="index.faiss", **_hub_kwargs)
     METAS_PATH = hf_hub_download(filename="metas.pkl.gz", **_hub_kwargs)
     BM25_PATH = hf_hub_download(filename="bm25.pkl.gz", **_hub_kwargs)
     IDMAP_PATH = hf_hub_download(filename="id_to_record.pkl", **_hub_kwargs)
     print("✅ Đã tải file dữ liệu từ HF Hub!")

     # The Gemini key must come from the environment (Space secrets).
     API_KEY = os.environ.get("GOOGLE_API_KEY")
     if not API_KEY:
         raise ValueError("GOOGLE_API_KEY chưa có trong Secrets")
     genai.configure(api_key=API_KEY)
     generation_model = genai.GenerativeModel("gemini-2.5-flash")

     # Dense retrieval side: sentence embedder plus prebuilt FAISS index.
     embedding_model = SentenceTransformer("AITeamVN/Vietnamese_Embedding")
     faiss_index = faiss.read_index(FAISS_PATH)

     # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
     # file; this is only acceptable while the dataset repo is fully trusted.
     with gzip.open(METAS_PATH, "rb") as f:
         metadatas = pickle.load(f)
     with gzip.open(BM25_PATH, "rb") as f:
         bm25 = pickle.load(f)
     with open(IDMAP_PATH, "rb") as f:
         procedure_map = pickle.load(f)

     print(f"✅ Sẵn có {faiss_index.ntotal} chunks kiến thức.")
     print(f"✅ Có {len(procedure_map)} thủ tục hành chính.")
 except Exception as e:
     print(f"❌ Lỗi khi tải tài nguyên: {e}")
-# --- HÀM XỬ LÝ ---
 def classify_followup(text: str):
     """Classify a user message as a follow-up (0) or a new question (1).

     The message is scored with regex heuristics: back-references to the
     previous turn (-3), detail questions about time/cost/place/requirements
     (-2), an explicitly named service (+3), and very short messages of four
     words or fewer (-1). A negative total yields 0, anything else yields 1.

     Args:
         text: Raw user message (Vietnamese).

     Returns:
         0 if the message looks like a follow-up, otherwise 1.
     """
     import unicodedata  # local import keeps this block self-contained

     # Vietnamese text may arrive decomposed (NFD: base letter + combining
     # marks). The patterns below are precomposed, so normalise to NFC first;
     # otherwise no accented pattern can match and `\b` misbehaves.
     text = unicodedata.normalize("NFC", text).lower().strip()

     strong_followup = [
         r"\b(nó|cái (này|đó|ấy)|thủ tục (này|đó|ấy))\b",
         r"\b(vừa (nói|hỏi)|trước đó|ở trên|phía trên)\b",
         r"\b(tiếp theo|tiếp|còn nữa|ngoài ra)\b",
         r"\b(thế (thì|à)|vậy (thì|à)|như vậy)\b",
     ]
     detail_qs = [
         r"\b(mất bao lâu|thời gian|bao nhiêu tiền|chi phí|phí)\b",
         r"\b(ở đâu|tại đâu|chỗ nào|địa chỉ)\b",
         r"\b(cần (gì|những gì)|yêu cầu|điều kiện)\b",
     ]
     specific_services = [
         r"\b(làm|cấp|gia hạn|đổi|đăng ký)\s+(căn cước|cmnd|cccd)\b",
         r"\b(làm|cấp|gia hạn|đổi)\s+hộ chiếu\b",
         r"\b(đăng ký)\s+(kết hôn|sinh|tử|hộ khẩu)\b",
     ]

     # Each pattern group contributes its weight at most once.
     weighted_groups = (
         (strong_followup, -3),
         (detail_qs, -2),
         (specific_services, 3),
     )
     score = sum(
         weight
         for patterns, weight in weighted_groups
         if any(re.search(p, text) for p in patterns)
     )
     if len(text.split()) <= 4:
         score -= 1
     return 0 if score < 0 else 1
def minmax_scale(arr):
    """Rescale values linearly into [0, 1] as a float32 array.

    Args:
        arr: Array-like of numbers (any shape, including empty or scalar).

    Returns:
        A float32 array of the same shape. Empty input and input whose
        values are all equal yield zeros.
    """
    arr = np.array(arr, dtype="float32")
    # `size` (not `len`) also covers 0-d input and zero-size N-d input,
    # for which `len()` or the min/max reductions would raise.
    if arr.size == 0:
        return np.zeros_like(arr)
    lo, hi = arr.min(), arr.max()
    if hi == lo:
        # Constant input: avoid dividing by zero.
        return np.zeros_like(arr)
    return (arr - lo) / (hi - lo)
def hybrid_retrieve(query: str, top_k=3, alpha=0.7):
    """Hybrid search: fuse semantic (FAISS) and lexical (BM25) scores.

    Each retriever contributes up to ``top_k * 5`` candidates. Their scores
    are min-max scaled over the union of candidates and blended as
    ``alpha * semantic + (1 - alpha) * lexical``.

    Args:
        query: User query text.
        top_k: Number of indices to return.
        alpha: Weight of the semantic score in the blend.

    Returns:
        Up to ``top_k`` corpus indices, best fused score first.
    """
    n_candidates = top_k * 5

    qv = embedding_model.encode([query], normalize_embeddings=True).astype("float32")
    D, I = faiss_index.search(qv, n_candidates)
    # `1 - D` treats D as a distance (smaller = closer).
    # NOTE(review): if the index uses inner product, D is already a
    # similarity and this inverts the ranking — confirm the index metric.
    #
    # FAISS pads the result with label -1 when fewer than k neighbours
    # exist. Drop those: -1 would otherwise join the candidate union and
    # index the *last* BM25 row below.
    vec_hits = [
        (idx, score)
        for idx, score in zip(I[0].tolist(), (1 - D[0]).tolist())
        if idx >= 0
    ]
    vec_idx = [idx for idx, _ in vec_hits]
    vec_map = dict(vec_hits)

    tokenized_query = query.split()
    bm25_scores_all = bm25.get_scores(tokenized_query)
    bm25_top_idx = np.argsort(-bm25_scores_all)[:n_candidates].tolist()

    # Merge candidates (order-preserving de-duplication) and normalise scores.
    union_idx = list(dict.fromkeys(vec_idx + bm25_top_idx))
    vec_list = [vec_map.get(i, 0.0) for i in union_idx]
    bm25_list = [bm25_scores_all[i] for i in union_idx]
    fused = alpha * minmax_scale(vec_list) + (1 - alpha) * minmax_scale(bm25_list)
    order = np.argsort(-fused)
    return [union_idx[i] for i in order[:top_k]]
 def get_full_procedure_text(parent_id):
     procedure = procedure_map.get(parent_id)
@@ -121,7 +107,7 @@ def get_full_procedure_text(parent_id):
         "thu_tuc_lien_quan": "Thủ tục liên quan",
         "nguon": "Nguồn"
     }
-    parts = [f"{field_map[k]}:\n{str(v).strip()}" for k, v in procedure.items() if v and k in field_map]
     return "\n\n".join(parts)
 # --- FLASK APP ---
@@ -139,6 +125,7 @@ def chat():
     data = request.json
     user_query = data.get('question')
     session_id = data.get('session_id', 'default')
     if not user_query:
         return jsonify({"error": "Không có câu hỏi"}), 400
@@ -150,23 +137,27 @@ def chat():
     if classify_followup(user_query) == 0 and current_history:
         context = current_history[-1].get('context', '')
     else:
-        retrieved_indices = hybrid_retrieve(user_query)
         if retrieved_indices:
-            parent_id = metadatas[retrieved_indices[0]]["parent_id"]
             context = get_full_procedure_text(parent_id)
     history_str = "\n".join([f"{item['role']}: {item['content']}" for item in current_history])
-    prompt = f"""Bạn là trợ lý eGov-Bot. Trả lời tiếng Việt chính xác dựa vào DỮ LIỆU sau.
 Nếu thiếu dữ liệu, hãy nói "Mình chưa có thông tin" và đưa link nguồn trong dữ liệu để tham khảo.
-Lịch sử trò chuyện:
-{history_str}
-DỮ LIỆU: --- {context} ---
 CÂU HỎI: {user_query}"""
     response = generation_model.generate_content(prompt)
     answer = response.text
     current_history.append({'role': 'user', 'content': user_query})
     current_history.append({'role': 'model', 'content': answer, 'context': context})
     return jsonify({"answer": answer})
 if __name__ == '__main__':

+# --- MUST BE AT THE TOP ---
+import os
+import shutil
+# Đặt cache vào /tmp (HF Space cho phép ghi vào /tmp)
 os.environ["HF_HOME"] = "/tmp/hf_home"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
 shutil.rmtree("/.cache", ignore_errors=True)
 # --- LOGIN HF HUB ---
+from huggingface_hub import login, hf_hub_download
 HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
 if HF_TOKEN:
     login(HF_TOKEN)
 else:
+    print("Warning: HF token not found. Only public repos will be accessible.")
 # --- LOAD DỮ LIỆU ---
+HF_REPO_ID = "DrPie/eGoV_Data"  # dataset repo chứa dữ liệu
 REPO_TYPE = "dataset"
+import pickle, gzip, re, json
+import numpy as np
+from rank_bm25 import BM25Okapi
+import google.generativeai as genai
+from flask import Flask, request, jsonify
+from flask_cors import CORS
 print("--- KHỞI ĐỘNG MÁY CHỦ CHATBOT ---")
 # Startup: download the data artefacts, configure Gemini, and load the BM25
 # index and procedure map.
 #
 # NOTE(review): any failure below is only printed. Names assigned after the
 # failing statement stay undefined, so code that later reads them will raise
 # NameError instead of reporting the original startup error.
 try:
     print("Đang tải các tài nguyên cần thiết từ Hugging Face Hub...")
     _hub_kwargs = {"repo_id": HF_REPO_ID, "repo_type": REPO_TYPE}
     RAW_PATH = hf_hub_download(filename="toan_bo_du_lieu_final.json", **_hub_kwargs)
     BM25_PATH = hf_hub_download(filename="bm25.pkl.gz", **_hub_kwargs)
     IDMAP_PATH = hf_hub_download(filename="id_to_record.pkl", **_hub_kwargs)
     print("✅ Tải file dữ liệu thành công!")

     # The Gemini key must come from the environment (Space secrets).
     API_KEY = os.environ.get("GOOGLE_API_KEY")
     if not API_KEY:
         raise ValueError("Lỗi: GOOGLE_API_KEY chưa được thiết lập trong Secrets của Space")
     genai.configure(api_key=API_KEY)
     generation_model = genai.GenerativeModel("gemini-2.5-flash")

     # Embedding model and FAISS index are no longer loaded; retrieval is BM25-only.
     # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
     # file; this is only acceptable while the dataset repo is fully trusted.
     with gzip.open(BM25_PATH, "rb") as f:
         bm25 = pickle.load(f)
     with open(IDMAP_PATH, "rb") as f:
         procedure_map = pickle.load(f)

     print(f"✅ BM25 loaded, tổng {len(procedure_map)} thủ tục hành chính.")
 except Exception as e:
     print(f"❌ Lỗi khi tải tài nguyên: {e}")
+# --- LOGIC XỬ LÝ ---
 def classify_followup(text: str):
     """Classify a user message as a follow-up (0) or a new question (1).

     The message is scored with regex heuristics: back-references to the
     previous turn (-3), detail questions about time/cost/place/requirements
     (-2), an explicitly named service (+3), and very short messages of four
     words or fewer (-1). A negative total yields 0, anything else yields 1.

     Args:
         text: Raw user message (Vietnamese).

     Returns:
         0 if the message looks like a follow-up, otherwise 1.
     """
     import unicodedata  # local import keeps this block self-contained

     # Vietnamese text may arrive decomposed (NFD: base letter + combining
     # marks). The patterns below are precomposed, so normalise to NFC first;
     # otherwise no accented pattern can match and `\b` misbehaves.
     text = unicodedata.normalize("NFC", text).lower().strip()

     strong_followup = [
         r"\b(nó|cái (này|đó|ấy)|thủ tục (này|đó|ấy))\b",
         r"\b(vừa (nói|hỏi)|trước đó|ở trên|phía trên)\b",
         r"\b(tiếp theo|tiếp|còn nữa|ngoài ra)\b",
         r"\b(thế (thì|à)|vậy (thì|à)|như vậy)\b",
     ]
     detail_qs = [
         r"\b(mất bao lâu|thời gian|bao nhiêu tiền|chi phí|phí)\b",
         r"\b(ở đâu|tại đâu|chỗ nào|địa chỉ)\b",
         r"\b(cần (gì|những gì)|yêu cầu|điều kiện)\b",
     ]
     specific_services = [
         r"\b(làm|cấp|gia hạn|đổi|đăng ký)\s+(căn cước|cmnd|cccd)\b",
         r"\b(làm|cấp|gia hạn|đổi)\s+hộ chiếu\b",
         r"\b(đăng ký)\s+(kết hôn|sinh|tử|hộ khẩu)\b",
     ]

     # Each pattern group contributes its weight at most once.
     weighted_groups = (
         (strong_followup, -3),
         (detail_qs, -2),
         (specific_services, 3),
     )
     score = sum(
         weight
         for patterns, weight in weighted_groups
         if any(re.search(p, text) for p in patterns)
     )
     if len(text.split()) <= 4:
         score -= 1
     return 0 if score < 0 else 1
def retrieve(query: str, top_k=3):
    """Return the positions of the ``top_k`` best BM25 matches for ``query``.

    The query is split on whitespace and scored against the module-level
    ``bm25`` index; positions are ordered from highest to lowest score.

    NOTE(review): the visible caller uses the first returned position
    directly as a ``procedure_map`` key — confirm that BM25 row positions
    and ``procedure_map`` keys really coincide.
    """
    # BM25 only: no embedding model or vector index involved.
    scores = bm25.get_scores(query.split())
    ranked = np.argsort(-scores)  # negate to sort descending
    return ranked[:top_k].tolist()
 def get_full_procedure_text(parent_id):
     procedure = procedure_map.get(parent_id)
         "thu_tuc_lien_quan": "Thủ tục liên quan",
         "nguon": "Nguồn"
     }
+    parts = [f"{field_map[k]}:\n{str(v).strip()}" for k,v in procedure.items() if v and k in field_map]
     return "\n\n".join(parts)
 # --- FLASK APP ---
     data = request.json
     user_query = data.get('question')
     session_id = data.get('session_id', 'default')
     if not user_query:
         return jsonify({"error": "Không có câu hỏi"}), 400
     if classify_followup(user_query) == 0 and current_history:
         context = current_history[-1].get('context', '')
     else:
+        retrieved_indices = retrieve(user_query)
         if retrieved_indices:
+            parent_id = retrieved_indices[0]
             context = get_full_procedure_text(parent_id)
     history_str = "\n".join([f"{item['role']}: {item['content']}" for item in current_history])
+    prompt = f"""Bạn là trợ lý eGov-Bot. Trả lời tiếng Việt, chính xác, dựa vào DỮ LIỆU sau.
 Nếu thiếu dữ liệu, hãy nói "Mình chưa có thông tin" và đưa link nguồn trong dữ liệu để tham khảo.
+Lịch sử trò chuyện: {history_str}
+DỮ LIỆU:
+---
+{context}
+---
 CÂU HỎI: {user_query}"""
     response = generation_model.generate_content(prompt)
     answer = response.text
     current_history.append({'role': 'user', 'content': user_query})
     current_history.append({'role': 'model', 'content': answer, 'context': context})
     return jsonify({"answer": answer})
 if __name__ == '__main__':