Spaces:

DrPie
/

eGoV_chatbot

Sleeping

App Files Files Community

DrPie committed on Aug 28, 2025

Commit

ea8833b

verified ·

1 Parent(s): 4998819

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -43

app.py CHANGED Viewed

@@ -1,11 +1,17 @@
 # app.py
-# --- MUST BE AT THE TOP ---
-import os
-import shutil
-# Đặt cache vào /tmp (HF Space cho phép ghi vào /tmp)
 os.environ["HF_HOME"] = "/tmp/hf_home"
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"   # warning deprecated OK
 os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
 os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
 os.environ["HOME"] = "/tmp"
@@ -13,40 +19,29 @@ os.makedirs("/tmp/.cache", exist_ok=True)
 shutil.rmtree("/.cache", ignore_errors=True)
 # --- LOGIN HF HUB ---
-from huggingface_hub import login, hf_hub_download
 HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
 if HF_TOKEN:
     login(HF_TOKEN)
 else:
-    print("Warning: HF token not found. Only public repos will be accessible.")
 # --- LOAD DỮ LIỆU ---
-HF_REPO_ID = "DrPie/eGoV_Data"  # dataset repo chứa dữ liệu
 REPO_TYPE = "dataset"
-import pickle, gzip, re, json
-import numpy as np
-import faiss
-from sentence_transformers import SentenceTransformer
-from rank_bm25 import BM25Okapi
-import google.generativeai as genai
-from flask import Flask, request, jsonify
-from flask_cors import CORS
 print("--- KHỞI ĐỘNG MÁY CHỦ CHATBOT ---")
 try:
-    print("Đang tải các tài nguyên cần thiết từ Hugging Face Hub...")
     RAW_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="toan_bo_du_lieu_final.json", repo_type=REPO_TYPE)
     FAISS_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="index.faiss", repo_type=REPO_TYPE)
     METAS_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="metas.pkl.gz", repo_type=REPO_TYPE)
     BM25_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="bm25.pkl.gz", repo_type=REPO_TYPE)
     IDMAP_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="id_to_record.pkl", repo_type=REPO_TYPE)
-    print("✅ Tải file dữ liệu thành công!")
     API_KEY = os.environ.get("GOOGLE_API_KEY")
     if not API_KEY:
-        raise ValueError("Lỗi: GOOGLE_API_KEY chưa được thiết lập trong Secrets của Space")
     genai.configure(api_key=API_KEY)
     generation_model = genai.GenerativeModel('gemini-2.5-flash')
@@ -62,11 +57,10 @@ try:
     print(f"✅ Sẵn có {faiss_index.ntotal} chunks kiến thức.")
     print(f"✅ Có {len(procedure_map)} thủ tục hành chính.")
 except Exception as e:
     print(f"❌ Lỗi khi tải tài nguyên: {e}")
-# --- LOGIC XỬ LÝ ---
 def classify_followup(text: str):
     text = text.lower().strip()
     score = 0
@@ -88,40 +82,52 @@ def classify_followup(text: str):
 def minmax_scale(arr):
     arr = np.array(arr, dtype="float32")
-    if len(arr) == 0 or np.max(arr) == np.min(arr): return np.zeros_like(arr)
     return (arr - np.min(arr)) / (np.max(arr) - np.min(arr))
-def retrieve(query: str, top_k=3):
     qv = embedding_model.encode([query], normalize_embeddings=True).astype("float32")
-    D, I = faiss_index.search(qv, top_k*5)
     vec_scores = (1 - D[0]).tolist()
     vec_idx = I[0].tolist()
     tokenized_query = query.split()
     bm25_scores_all = bm25.get_scores(tokenized_query)
-    bm25_top_idx = np.argsort(-bm25_scores_all)[:top_k*5].tolist()
     union_idx = list(dict.fromkeys(vec_idx + bm25_top_idx))
-    vec_map = {i: s for i,s in zip(vec_idx, vec_scores)}
-    vec_list = [vec_map.get(i,0.0) for i in union_idx]
     bm25_list = [bm25_scores_all[i] for i in union_idx]
-    fused = 0.7 * minmax_scale(vec_list) + 0.3 * minmax_scale(bm25_list)
     order = np.argsort(-fused)
     return [union_idx[i] for i in order[:top_k]]
 def get_full_procedure_text(parent_id):
     procedure = procedure_map.get(parent_id)
-    if not procedure: return "Không tìm thấy thủ tục."
-    field_map = {"ten_thu_tuc": "Tên thủ tục", "cach_thuc_thuc_hien": "Cách thức thực hiện",
-                 "thanh_phan_ho_so": "Thành phần hồ sơ", "trinh_tu_thuc_hien": "Trình tự thực hiện",
-                 "co_quan_thuc_hien": "Cơ quan thực hiện", "yeu_cau_dieu_kien": "Yêu cầu, điều kiện",
-                 "thu_tuc_lien_quan": "Thủ tục liên quan", "nguon": "Nguồn"}
-    parts = [f"{field_map[k]}:\n{str(v).strip()}" for k,v in procedure.items() if v and k in field_map]
     return "\n\n".join(parts)
-# --- FLASK APP & ROUTE /chat ---
 app = Flask(__name__)
 CORS(app)
-# Route test
 @app.route('/', methods=['GET'])
 def home():
     return "eGov-Bot backend is running!", 200
@@ -135,30 +141,32 @@ def chat():
     session_id = data.get('session_id', 'default')
     if not user_query:
         return jsonify({"error": "Không có câu hỏi"}), 400
     if session_id not in chat_histories:
         chat_histories[session_id] = []
     current_history = chat_histories[session_id]
     context = ""
     if classify_followup(user_query) == 0 and current_history:
         context = current_history[-1].get('context', '')
     else:
-        retrieved_indices = retrieve(user_query)
         if retrieved_indices:
             parent_id = metadatas[retrieved_indices[0]]["parent_id"]
             context = get_full_procedure_text(parent_id)
     history_str = "\n".join([f"{item['role']}: {item['content']}" for item in current_history])
-    prompt = f"""Bạn là trợ lý eGov-Bot. Trả lời tiếng Việt, chính xác, dựa vào DỮ LIỆU sau.
 Nếu thiếu dữ liệu, hãy nói "Mình chưa có thông tin" và đưa link nguồn trong dữ liệu để tham khảo.
 Lịch sử trò chuyện:
 {history_str}
 DỮ LIỆU: --- {context} ---
 CÂU HỎI: {user_query}"""
     response = generation_model.generate_content(prompt)
     answer = response.text
-    current_history.append({'role':'user','content':user_query})
-    current_history.append({'role':'model','content':answer,'context':context})
     return jsonify({"answer": answer})
 if __name__ == '__main__':

 # app.py
+import os, shutil, gzip, pickle, re, json
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from rank_bm25 import BM25Okapi
+import google.generativeai as genai
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from huggingface_hub import login, hf_hub_download
+# --- CACHE CONFIG (HF Spaces chỉ ghi vào /tmp) ---
 os.environ["HF_HOME"] = "/tmp/hf_home"
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
 os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
 os.environ["HOME"] = "/tmp"
 shutil.rmtree("/.cache", ignore_errors=True)
 # --- LOGIN HF HUB ---
 HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
 if HF_TOKEN:
     login(HF_TOKEN)
 else:
+    print("⚠️ Warning: HF token not found. Chỉ truy cập public repo được thôi.")
 # --- LOAD DỮ LIỆU ---
+HF_REPO_ID = "DrPie/eGoV_Data"
 REPO_TYPE = "dataset"
 print("--- KHỞI ĐỘNG MÁY CHỦ CHATBOT ---")
 try:
     RAW_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="toan_bo_du_lieu_final.json", repo_type=REPO_TYPE)
     FAISS_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="index.faiss", repo_type=REPO_TYPE)
     METAS_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="metas.pkl.gz", repo_type=REPO_TYPE)
     BM25_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="bm25.pkl.gz", repo_type=REPO_TYPE)
     IDMAP_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="id_to_record.pkl", repo_type=REPO_TYPE)
+    print("✅ Đã tải file dữ liệu từ HF Hub!")
     API_KEY = os.environ.get("GOOGLE_API_KEY")
     if not API_KEY:
+        raise ValueError("GOOGLE_API_KEY chưa có trong Secrets")
     genai.configure(api_key=API_KEY)
     generation_model = genai.GenerativeModel('gemini-2.5-flash')
     print(f"✅ Sẵn có {faiss_index.ntotal} chunks kiến thức.")
     print(f"✅ Có {len(procedure_map)} thủ tục hành chính.")
 except Exception as e:
     print(f"❌ Lỗi khi tải tài nguyên: {e}")
+# --- HÀM XỬ LÝ ---
 def classify_followup(text: str):
     text = text.lower().strip()
     score = 0
 def minmax_scale(arr):
     """Rescale values linearly into the range [0, 1].

     Args:
         arr: Array-like of numbers; it is converted to a float32 array.

     Returns:
         A float32 array with the same shape as the converted input. The
         result is all zeros when the input is empty or when every value is
         identical (there is no spread to scale by).
     """
     values = np.asarray(arr, dtype="float32")
     # `size` (rather than `len`) also catches multi-dimensional inputs whose
     # inner axis is empty, for which min()/max() would raise ValueError.
     if values.size == 0:
         return np.zeros_like(values)
     # Compute each reduction once instead of twice.
     lo = values.min()
     hi = values.max()
     if hi == lo:
         return np.zeros_like(values)
     return (values - lo) / (hi - lo)
+def hybrid_retrieve(query: str, top_k=3, alpha=0.7):
+    """Hybrid search: kết hợp semantic (FAISS) và lexical (BM25)."""
     qv = embedding_model.encode([query], normalize_embeddings=True).astype("float32")
+    D, I = faiss_index.search(qv, top_k * 5)
     vec_scores = (1 - D[0]).tolist()
     vec_idx = I[0].tolist()
     tokenized_query = query.split()
     bm25_scores_all = bm25.get_scores(tokenized_query)
+    bm25_top_idx = np.argsort(-bm25_scores_all)[:top_k * 5].tolist()
+    # Gộp và chuẩn hóa điểm
     union_idx = list(dict.fromkeys(vec_idx + bm25_top_idx))
+    vec_map = {i: s for i, s in zip(vec_idx, vec_scores)}
+    vec_list = [vec_map.get(i, 0.0) for i in union_idx]
     bm25_list = [bm25_scores_all[i] for i in union_idx]
+    fused = alpha * minmax_scale(vec_list) + (1 - alpha) * minmax_scale(bm25_list)
     order = np.argsort(-fused)
     return [union_idx[i] for i in order[:top_k]]
 def get_full_procedure_text(parent_id):
     """Render one procedure record as labelled plain-text sections.

     The record is looked up by ``parent_id`` in the module-level
     ``procedure_map``. Fields are emitted in the record's own key order;
     keys without a display label and fields with falsy values are skipped.

     Returns:
         The sections joined by blank lines, or a fixed "not found" message
         when the lookup yields nothing.
     """
     record = procedure_map.get(parent_id)
     if not record:
         return "Không tìm thấy thủ tục."
     # Record key -> human-readable section heading.
     labels = {
         "ten_thu_tuc": "Tên thủ tục",
         "cach_thuc_thuc_hien": "Cách thức thực hiện",
         "thanh_phan_ho_so": "Thành phần hồ sơ",
         "trinh_tu_thuc_hien": "Trình tự thực hiện",
         "co_quan_thuc_hien": "Cơ quan thực hiện",
         "yeu_cau_dieu_kien": "Yêu cầu, điều kiện",
         "thu_tuc_lien_quan": "Thủ tục liên quan",
         "nguon": "Nguồn",
     }
     sections = []
     for key, value in record.items():
         if not value or key not in labels:
             continue
         sections.append(f"{labels[key]}:\n{str(value).strip()}")
     return "\n\n".join(sections)
+# --- FLASK APP ---
 app = Flask(__name__)
 CORS(app)
 @app.route('/', methods=['GET'])
 def home():
     """Health-check endpoint: report that the backend is up (HTTP 200)."""
     status_message = "eGov-Bot backend is running!"
     return status_message, 200
     session_id = data.get('session_id', 'default')
     if not user_query:
         return jsonify({"error": "Không có câu hỏi"}), 400
     if session_id not in chat_histories:
         chat_histories[session_id] = []
     current_history = chat_histories[session_id]
     context = ""
     if classify_followup(user_query) == 0 and current_history:
         context = current_history[-1].get('context', '')
     else:
+        retrieved_indices = hybrid_retrieve(user_query)
         if retrieved_indices:
             parent_id = metadatas[retrieved_indices[0]]["parent_id"]
             context = get_full_procedure_text(parent_id)
     history_str = "\n".join([f"{item['role']}: {item['content']}" for item in current_history])
+    prompt = f"""Bạn là trợ lý eGov-Bot. Trả lời tiếng Việt chính xác dựa vào DỮ LIỆU sau.
 Nếu thiếu dữ liệu, hãy nói "Mình chưa có thông tin" và đưa link nguồn trong dữ liệu để tham khảo.
 Lịch sử trò chuyện:
 {history_str}
 DỮ LIỆU: --- {context} ---
 CÂU HỎI: {user_query}"""
     response = generation_model.generate_content(prompt)
     answer = response.text
+    current_history.append({'role': 'user', 'content': user_query})
+    current_history.append({'role': 'model', 'content': answer, 'context': context})
     return jsonify({"answer": answer})
 if __name__ == '__main__':