Spaces:

DrPie
/

eGoV_chatbot

Sleeping

App Files Files Community

DrPie commited on Aug 28, 2025

Commit

76c06ab

verified ·

1 Parent(s): 25ffee2

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -152

app.py CHANGED Viewed

@@ -1,164 +1,103 @@
-# --- MUST BE AT THE TOP ---
 import os
-import shutil
-# Đặt cache vào /tmp (HF Space cho phép ghi vào /tmp)
-os.environ["HF_HOME"] = "/tmp/hf_home"
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
-os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
-os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
-os.environ["HOME"] = "/tmp"
-os.makedirs("/tmp/.cache", exist_ok=True)
-shutil.rmtree("/.cache", ignore_errors=True)
-# --- LOGIN HF HUB ---
-from huggingface_hub import login, hf_hub_download
-HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
-if HF_TOKEN:
-    login(HF_TOKEN)
-else:
-    print("Warning: HF token not found. Only public repos will be accessible.")
-# --- LOAD DỮ LIỆU ---
-HF_REPO_ID = "DrPie/eGoV_Data"  # dataset repo chứa dữ liệu
-REPO_TYPE = "dataset"
-import pickle, gzip, re, json
 import numpy as np
-from rank_bm25 import BM25Okapi
-import google.generativeai as genai
 from flask import Flask, request, jsonify
-from flask_cors import CORS
-print("--- KHỞI ĐỘNG MÁY CHỦ CHATBOT ---")
-try:
-    print("Đang tải các tài nguyên cần thiết từ Hugging Face Hub...")
-    RAW_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="toan_bo_du_lieu_final.json", repo_type=REPO_TYPE)
-    BM25_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="bm25.pkl.gz", repo_type=REPO_TYPE)
-    IDMAP_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="id_to_record.pkl", repo_type=REPO_TYPE)
-    print("✅ Tải file dữ liệu thành công!")
-    API_KEY = os.environ.get("GOOGLE_API_KEY")
-    if not API_KEY:
-        raise ValueError("Lỗi: GOOGLE_API_KEY chưa được thiết lập trong Secrets của Space")
-    genai.configure(api_key=API_KEY)
-    generation_model = genai.GenerativeModel('gemini-2.5-flash')
-    # Không còn embedding và FAISS
-    with gzip.open(BM25_PATH, "rb") as f:
-        bm25 = pickle.load(f)
-    with open(IDMAP_PATH, "rb") as f:
-        procedure_map = pickle.load(f)
-    print(f"✅ BM25 loaded, tổng {len(procedure_map)} thủ tục hành chính.")
-except Exception as e:
-    print(f"❌ Lỗi khi tải tài nguyên: {e}")
-# --- LOGIC XỬ LÝ ---
-def classify_followup(text: str):
-    text = text.lower().strip()
-    score = 0
-    strong_followup = [
-        r"\b(nó|cái (này|đó|ấy)|thủ tục (này|đó|ấy))\b",
-        r"\b(vừa (nói|hỏi)|trước đó|ở trên|phía trên)\b",
-        r"\b(tiếp theo|tiếp|còn nữa|ngoài ra)\b",
-        r"\b(thế (thì|à)|vậy (thì|à)|như vậy)\b"
-    ]
-    detail_qs = [
-        r"\b(mất bao lâu|thời gian|bao nhiêu tiền|chi phí|phí)\b",
-        r"\b(ở đâu|tại đâu|chỗ nào|địa chỉ)\b",
-        r"\b(cần (gì|những gì)|yêu cầu|điều kiện)\b"
-    ]
-    specific_services = [
-        r"\b(làm|cấp|gia hạn|đổi|đăng ký)\s+(căn cước|cmnd|cccd)\b",
-        r"\b(làm|cấp|gia hạn|đổi)\s+hộ chiếu\b",
-        r"\b(đăng ký)\s+(kết hôn|sinh|tử|hộ khẩu)\b"
-    ]
-    if any(re.search(p, text) for p in strong_followup):
-        score -= 3
-    if any(re.search(p, text) for p in detail_qs):
-        score -= 2
-    if any(re.search(p, text) for p in specific_services):
-        score += 3
-    if len(text.split()) <= 4:
-        score -= 1
-    return 0 if score < 0 else 1
-def retrieve(query: str, top_k=3):
-    # Chỉ dùng BM25
-    tokenized_query = query.split()
-    bm25_scores = bm25.get_scores(tokenized_query)
-    top_idx = np.argsort(-bm25_scores)[:top_k].tolist()
-    return top_idx
-def get_full_procedure_text(parent_id):
-    procedure = procedure_map.get(parent_id)
-    if not procedure:
-        return "Không tìm thấy thủ tục."
-    field_map = {
-        "ten_thu_tuc": "Tên thủ tục",
-        "cach_thuc_thuc_hien": "Cách thức thực hiện",
-        "thanh_phan_ho_so": "Thành phần hồ sơ",
-        "trinh_tu_thuc_hien": "Trình tự thực hiện",
-        "co_quan_thuc_hien": "Cơ quan thực hiện",
-        "yeu_cau_dieu_kien": "Yêu cầu, điều kiện",
-        "thu_tuc_lien_quan": "Thủ tục liên quan",
-        "nguon": "Nguồn"
-    }
-    parts = [f"{field_map[k]}:\n{str(v).strip()}" for k,v in procedure.items() if v and k in field_map]
-    return "\n\n".join(parts)
-# --- FLASK APP ---
-app = Flask(__name__)
-CORS(app)
-@app.route('/', methods=['GET'])
-def home():
-    return "eGov-Bot backend is running!", 200
-chat_histories = {}
-@app.route('/chat', methods=['POST'])
 def chat():
-    data = request.json
-    user_query = data.get('question')
-    session_id = data.get('session_id', 'default')
-    if not user_query:
-        return jsonify({"error": "Không có câu hỏi"}), 400
-    if session_id not in chat_histories:
-        chat_histories[session_id] = []
-    current_history = chat_histories[session_id]
-    context = ""
-    if classify_followup(user_query) == 0 and current_history:
-        context = current_history[-1].get('context', '')
-    else:
-        retrieved_indices = retrieve(user_query)
-        if retrieved_indices:
-            parent_id = retrieved_indices[0]
-            context = get_full_procedure_text(parent_id)
-    history_str = "\n".join([f"{item['role']}: {item['content']}" for item in current_history])
-    prompt = f"""Bạn là trợ lý eGov-Bot. Trả lời tiếng Việt, chính xác, dựa vào DỮ LIỆU sau.
-Nếu thiếu dữ liệu, hãy nói "Mình chưa có thông tin" và đưa link nguồn trong dữ liệu để tham khảo.
-Lịch sử trò chuyện: {history_str}
 DỮ LIỆU:
----
-{context}
----
-CÂU HỎI: {user_query}"""
-    response = generation_model.generate_content(prompt)
-    answer = response.text
-    current_history.append({'role': 'user', 'content': user_query})
-    current_history.append({'role': 'model', 'content': answer, 'context': context})
-    return jsonify({"answer": answer})
-if __name__ == '__main__':
-    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))

 import os
+import re
+import unicodedata
+import pickle
 import numpy as np
 from flask import Flask, request, jsonify
+from rank_bm25 import BM25Okapi
+from huggingface_hub import InferenceClient
+# ===================== #
+#   TIỀN XỬ LÝ VĂN BẢN  #
+# ===================== #
+def normalize_text(text: str) -> str:
+    text = text.lower()
+    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')  # bỏ dấu tiếng Việt
+    text = re.sub(r'[^a-z0-9\s]', ' ', text)  # bỏ ký tự đặc biệt
+    return text
+def tokenize(text: str):
+    return normalize_text(text).split()
+# ===================== #
+#   LOAD DỮ LIỆU       #
+# ===================== #
+# File id_to_record.pkl chứa dict: id -> {ten_thu_tuc, mo_ta, yeu_cau, co_quan, link ...}
+with open("id_to_record.pkl", "rb") as f:
+    id_to_record = pickle.load(f)
+# Tạo corpus cho BM25: mỗi record nối các trường thành 1 text
+corpus = []
+for rid, rec in id_to_record.items():
+    fields = [str(rec.get(k, "")) for k in ["ten_thu_tuc", "mo_ta", "yeu_cau", "co_quan", "linh_vuc"]]
+    text = " ".join(fields)
+    corpus.append(tokenize(text))
+bm25 = BM25Okapi(corpus)
+# ===================== #
+#   KHỞI TẠO FLASK APP  #
+# ===================== #
+app = Flask(__name__)
+HF_TOKEN = os.getenv("HF_TOKEN")
+HF_MODEL = os.getenv("HF_MODEL", "gemini-pro")  # đổi sang model bạn dùng
+client = InferenceClient(token=HF_TOKEN)
+# ===================== #
+#   HÀM LẤY CONTEXT     #
+# ===================== #
+def retrieve_context(query: str, top_k: int = 5):
+    tokens = tokenize(query)
+    scores = bm25.get_scores(tokens)
+    top_idx = np.argsort(-scores)[:top_k]
+    context_parts = []
+    for idx in top_idx:
+        if scores[idx] > 0:  # chỉ lấy nếu score > 0
+            rid = list(id_to_record.keys())[idx]
+            rec = id_to_record[rid]
+            # context gồm tên, mô tả, yêu cầu và link nếu có
+            ctx = f"Tên: {rec.get('ten_thu_tuc','')}\nMô tả: {rec.get('mo_ta','')}\nYêu cầu: {rec.get('yeu_cau','')}\nCơ quan: {rec.get('co_quan','')}\nLink: {rec.get('link','')}"
+            context_parts.append(ctx)
+    return "\n\n".join(context_parts)
+# ===================== #
+#   ROUTE /chat         #
+# ===================== #
+@app.route("/chat", methods=["POST"])
 def chat():
+    user_query = request.json.get("query", "")
+    if not user_query.strip():
+        return jsonify({"answer": "Bạn chưa nhập câu hỏi."})
+    context = retrieve_context(user_query)
+    prompt = f"""
+Bạn là trợ lý eGov-Bot, trả lời bằng tiếng Việt.
+Ưu tiên dùng thông tin từ DỮ LIỆU dưới đây để trả lời.
+Nếu dữ liệu không đủ, có thể suy luận hợp lý hoặc trả lời rằng chưa có đủ thông tin.
+Nếu có link nguồn trong dữ liệu, hãy cung cấp.
 DỮ LIỆU:
+{context if context.strip() else "Không tìm thấy thông tin nào khớp trực tiếp."}
+CÂU HỎI: {user_query}
+"""
+    try:
+        response = client.text_generation(model=HF_MODEL, prompt=prompt, max_new_tokens=512)
+        return jsonify({"answer": response.strip()})
+    except Exception as e:
+        return jsonify({"answer": f"Lỗi khi gọi model: {e}"})
+# ===================== #
+#   MAIN APP            #
+# ===================== #
+if __name__ == "__main__":
+    # Debug mode cho dev, production có thể bỏ
+    app.run(host="0.0.0.0", port=7860)