Spaces:

DrPie
/

eGoV_chatbot

Sleeping

App Files Files Community

DrPie committed on Aug 28, 2025

Commit

d80e6d2

verified ·

1 Parent(s): 76c06ab

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -95

app.py CHANGED Viewed

@@ -1,103 +1,204 @@
-import os
-import re
-import unicodedata
-import pickle
-import numpy as np
 from flask import Flask, request, jsonify
 from rank_bm25 import BM25Okapi
-from huggingface_hub import InferenceClient
-# ===================== #
-#   TIỀN XỬ LÝ VĂN BẢN  #
-# ===================== #
-def normalize_text(text: str) -> str:
-    text = text.lower()
-    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')  # bỏ dấu tiếng Việt
-    text = re.sub(r'[^a-z0-9\s]', ' ', text)  # bỏ ký tự đặc biệt
-    return text
-def tokenize(text: str):
-    return normalize_text(text).split()
-# ===================== #
-#   LOAD DỮ LIỆU       #
-# ===================== #
-# File id_to_record.pkl chứa dict: id -> {ten_thu_tuc, mo_ta, yeu_cau, co_quan, link ...}
-with open("id_to_record.pkl", "rb") as f:
-    id_to_record = pickle.load(f)
-# Tạo corpus cho BM25: mỗi record nối các trường thành 1 text
-corpus = []
-for rid, rec in id_to_record.items():
-    fields = [str(rec.get(k, "")) for k in ["ten_thu_tuc", "mo_ta", "yeu_cau", "co_quan", "linh_vuc"]]
-    text = " ".join(fields)
-    corpus.append(tokenize(text))
-bm25 = BM25Okapi(corpus)
-# ===================== #
-#   KHỞI TẠO FLASK APP  #
-# ===================== #
-app = Flask(__name__)
-HF_TOKEN = os.getenv("HF_TOKEN")
-HF_MODEL = os.getenv("HF_MODEL", "gemini-pro")  # đổi sang model bạn dùng
-client = InferenceClient(token=HF_TOKEN)
-# ===================== #
-#   HÀM LẤY CONTEXT     #
-# ===================== #
-def retrieve_context(query: str, top_k: int = 5):
-    tokens = tokenize(query)
-    scores = bm25.get_scores(tokens)
-    top_idx = np.argsort(-scores)[:top_k]
-    context_parts = []
-    for idx in top_idx:
-        if scores[idx] > 0:  # chỉ lấy nếu score > 0
-            rid = list(id_to_record.keys())[idx]
-            rec = id_to_record[rid]
-            # context gồm tên, mô tả, yêu cầu và link nếu có
-            ctx = f"Tên: {rec.get('ten_thu_tuc','')}\nMô tả: {rec.get('mo_ta','')}\nYêu cầu: {rec.get('yeu_cau','')}\nCơ quan: {rec.get('co_quan','')}\nLink: {rec.get('link','')}"
-            context_parts.append(ctx)
-    return "\n\n".join(context_parts)
-# ===================== #
-#   ROUTE /chat         #
-# ===================== #
-@app.route("/chat", methods=["POST"])
 def chat():
-    user_query = request.json.get("query", "")
-    if not user_query.strip():
-        return jsonify({"answer": "Bạn chưa nhập câu hỏi."})
-    context = retrieve_context(user_query)
-    prompt = f"""
-Bạn là trợ lý eGov-Bot, trả lời bằng tiếng Việt.
-Ưu tiên dùng thông tin từ DỮ LIỆU dưới đây để trả lời.
-Nếu dữ liệu không đủ, có thể suy luận hợp lý hoặc trả lời rằng chưa có đủ thông tin.
-Nếu có link nguồn trong dữ liệu, hãy cung cấp.
 DỮ LIỆU:
-{context if context.strip() else "Không tìm thấy thông tin nào khớp trực tiếp."}
-CÂU HỎI: {user_query}
-"""
     try:
-        response = client.text_generation(model=HF_MODEL, prompt=prompt, max_new_tokens=512)
-        return jsonify({"answer": response.strip()})
     except Exception as e:
-        return jsonify({"answer": f"Lỗi khi gọi model: {e}"})
-# ===================== #
-#   MAIN APP            #
-# ===================== #
-if __name__ == "__main__":
-    # Debug mode cho dev, production có thể bỏ
-    app.run(host="0.0.0.0", port=7860)

+# =================== #
+#   Cache + Env setup #
+# =================== #
+import os, shutil
+# Redirect every cache location to /tmp to avoid permission errors on Spaces.
+# This runs before the Hugging Face libraries are imported further down the file.
+os.environ["HF_HOME"] = "/tmp/hf_home"
+os.environ["HF_HUB_CACHE"] = "/tmp/hf_cache"
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
+os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
+os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
+os.environ["HOME"] = "/tmp"
+# Create the redirected cache directories up front (no error if they already exist).
+for p in ["/tmp/hf_home","/tmp/hf_cache","/tmp/hf_datasets","/tmp/.cache"]:
+    os.makedirs(p, exist_ok=True)
+# Remove the old cache directory if it exists; any failure is ignored.
+shutil.rmtree("/.cache", ignore_errors=True)
+# =================== #
+#   Import thư viện   #
+# =================== #
+import time, hashlib, gzip, pickle, json, traceback, re
 from flask import Flask, request, jsonify
+from flask_cors import CORS
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
+import google.generativeai as genai
+from cachetools import TTLCache
+from huggingface_hub import login, hf_hub_download
+# ================ #
+#   Load ENV & HF  #
+# ================ #
+# Use the first non-empty token among the three supported environment variable names.
+HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
+if HF_TOKEN:
+    try:
+        login(HF_TOKEN)
+        print("HF login successful")
+    except Exception as e:
+        # A failed login is reported but does not stop startup.
+        print("Warning: HF login failed:", e)
+else:
+    print("Warning: HF_TOKEN not found")
+# Runtime settings; each one can be overridden by the environment variable of the same name.
+HF_REPO_ID = os.environ.get("HF_REPO_ID", "DrPie/eGoV_Data")
+REPO_TYPE = os.environ.get("REPO_TYPE", "dataset")
+EMB_MODEL = os.environ.get("EMB_MODEL", "AITeamVN/Vietnamese_Embedding")
+GENAI_MODEL = os.environ.get("GENAI_MODEL", "gemini-2.5-flash")
+TOP_K = int(os.environ.get("TOP_K", "3"))
+# Default FAISS candidate count is the larger of 10 and five times TOP_K.
+FAISS_CANDIDATES = int(os.environ.get("FAISS_CANDIDATES", str(max(10, TOP_K*5))))
+BM25_PREFILTER = int(os.environ.get("BM25_PREFILTER", "200"))
+# TTL and maximum size passed to the TTLCache answer cache built later in the file.
+CACHE_TTL = int(os.environ.get("CACHE_TTL", "3600"))
+CACHE_MAX = int(os.environ.get("CACHE_MAX", "2000"))
+print("--- KHỞI ĐỘNG MÁY CHỦ CHATBOT (optimized & id_to_record) ---")
+# ================ #
+#   Download data  #
+# ================ #
+# Fetch the required data files from the Hub repo; the returned values are used as local file paths below.
+# These four downloads are not wrapped in try/except, so a failure here stops startup.
+RAW_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="toan_bo_du_lieu_final.json", repo_type=REPO_TYPE)
+FAISS_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="index.faiss", repo_type=REPO_TYPE)
+BM25_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="bm25.pkl.gz", repo_type=REPO_TYPE)
+METAS_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="metas.pkl.gz", repo_type=REPO_TYPE)
+# Load id_to_record.pkl if it is available
+try:
+    ID2REC_PATH = hf_hub_download(repo_id=HF_REPO_ID, filename="id_to_record.pkl", repo_type=REPO_TYPE)
+    with open(ID2REC_PATH,"rb") as f:
+        id_to_record = pickle.load(f)
+except Exception as e:
+    # Optional file: report the problem and continue with an empty mapping.
+    print("⚠️ Không tải được id_to_record.pkl:", e)
+    id_to_record = {}
+# ================ #
+#   Load resources #
+# ================ #
+faiss_index = faiss.read_index(FAISS_PATH)
+# NOTE(review): pickle.load can execute arbitrary code from the file; both pickles come from the downloaded repo.
+with gzip.open(BM25_PATH,"rb") as f: bm25 = pickle.load(f)
+with gzip.open(METAS_PATH,"rb") as f:
+    metas = pickle.load(f)
+# metas may be a dict that wraps the records under "corpus"; otherwise metas itself is used as the corpus.
+if isinstance(metas,dict) and "corpus" in metas:
+    corpus = metas["corpus"]
+else:
+    corpus = metas
+# Keep the list of keys so it is not rebuilt repeatedly
+meta_keys = list(range(len(corpus)))
+# Load embedding model (device comes from the DEVICE environment variable, "cpu" by default)
+device = os.environ.get("DEVICE","cpu")
+embedding_model = SentenceTransformer(EMB_MODEL, device=device)
+# Load raw_data as a fallback source for building procedure_map.
+# Each record is keyed by its 'nguon', else its 'parent_id', else its list position as a string.
+try:
+    with open(RAW_PATH,"r",encoding="utf-8") as f:
+        raw_data = json.load(f)
+    procedure_map = {item.get('nguon') or item.get('parent_id') or str(i): item for i,item in enumerate(raw_data)}
+except Exception as e:
+    # Best-effort: keep the server booting, but report why the fallback map is empty
+    # (the other loaders in this file print a warning instead of failing silently).
+    print("Warning: cannot build procedure_map from raw data:", e)
+    procedure_map = {}
+# GenAI init
+API_KEY = os.environ.get("GOOGLE_API_KEY")
+# Stays None when no API key is set or initialisation fails; the /chat handler checks for that.
+generation_model = None
+if API_KEY:
+    try:
+        genai.configure(api_key=API_KEY)
+        generation_model = genai.GenerativeModel(GENAI_MODEL)
+    except Exception as e:
+        print("Warning: cannot init GenAI:", e)
+# NOTE(review): answer_cache is created here but is not read or written anywhere else in the visible code.
+answer_cache = TTLCache(maxsize=CACHE_MAX, ttl=CACHE_TTL)
+# =================== #
+# Utility / Retrieve  #
+# =================== #
+def minmax_scale(arr):
+    """Rescale arr linearly into [0, 1] as a float32 array.
+
+    An empty input, or one whose values are all equal, yields an array of
+    zeros with the same shape.
+    """
+    values = np.array(arr, dtype="float32")
+    if len(values) == 0:
+        return np.zeros_like(values)
+    lowest = np.min(values)
+    highest = np.max(values)
+    if highest == lowest:
+        # No spread to scale by; avoid dividing by zero.
+        return np.zeros_like(values)
+    return (values - lowest) / (highest - lowest)
+def classify_followup(text:str)->int:
+    """Return 0 when text looks like a follow-up question, otherwise 1.
+
+    Same idea as the original code, with the heavy regexes dropped for speed:
+    a message counts as a follow-up when it has at most four whitespace-separated
+    words or contains one of a few Vietnamese referring phrases.
+    """
+    normalized = text.lower().strip()
+    is_short = len(normalized.split()) <= 4
+    has_reference = re.search(r"\b(nó|cái này|thế thì|vậy thì)\b", normalized) is not None
+    return 0 if (is_short or has_reference) else 1
+def retrieve(query:str, top_k=TOP_K):
+    """Return up to top_k candidate indices for query, ranked by a fused score.
+
+    Candidates are the union of the FAISS neighbours and the BM25_PREFILTER
+    best BM25 hits. Each signal is min-max scaled and combined as
+    0.7 * vector + 0.3 * BM25; a candidate missing from the FAISS result gets
+    a vector score of 0.0 before scaling.
+
+    Args:
+        query: Raw user question.
+        top_k: Maximum number of indices to return.
+
+    Returns:
+        List of integer positions, best first (empty when nothing was found).
+    """
+    qv = embedding_model.encode([query],normalize_embeddings=True).astype("float32")
+    D,I = faiss_index.search(qv, max(FAISS_CANDIDATES, top_k*5))
+    # FAISS pads the result with label -1 when it finds fewer hits than requested;
+    # keeping those would silently address the last row through negative indexing.
+    # NOTE(review): 1-D treats D as a distance (smaller is better); confirm the
+    # metric of index.faiss, since an inner-product index returns similarities.
+    vec_map = {i:s for i,s in zip(I[0].tolist(),(1-D[0]).tolist()) if i>=0}
+    vec_idx = list(vec_map)
+    # BM25 prefilter
+    try:
+        bm25_scores_all = bm25.get_scores(query.split())
+        bm25_top_idx = np.argsort(-bm25_scores_all)[:BM25_PREFILTER].tolist()
+    except Exception as e:
+        # Degrade to vector-only ranking; both names must exist for the code below.
+        print("Warning: BM25 scoring failed, falling back to vector scores only:", e)
+        bm25_scores_all = []
+        bm25_top_idx = []
+    # dict.fromkeys de-duplicates while preserving first-seen order.
+    union_idx = list(dict.fromkeys(vec_idx+bm25_top_idx))
+    vec_list = [vec_map.get(i,0.0) for i in union_idx]
+    bm25_list = [bm25_scores_all[i] if i<len(bm25_scores_all) else 0.0 for i in union_idx]
+    fused = 0.7*minmax_scale(vec_list)+0.3*minmax_scale(bm25_list)
+    order = np.argsort(-fused)
+    return [union_idx[i] for i in order[:top_k]]
+def get_full_procedure_text_by_parent(pid):
+    """Render the procedure record identified by pid as labelled text sections.
+
+    The record is looked up in id_to_record first (when that mapping is
+    non-empty) and then in procedure_map. Only the known fields with a truthy
+    value are included, in the record's own key order, separated by blank
+    lines. A fixed "not found" message is returned when no record matches.
+    """
+    record = id_to_record.get(pid) if id_to_record else None
+    record = record or procedure_map.get(pid)
+    if not record:
+        return "Không tìm thấy thủ tục."
+    labels = {
+        "ten_thu_tuc":"Tên thủ tục",
+        "cach_thuc_thuc_hien":"Cách thức thực hiện",
+        "thanh_phan_ho_so":"Thành phần hồ sơ",
+        "trinh_tu_thuc_hien":"Trình tự thực hiện",
+        "co_quan_thuc_hien":"Cơ quan thực hiện",
+        "yeu_cau_dieu_kien":"Yêu cầu, điều kiện",
+        "nguon":"Nguồn",
+    }
+    sections = []
+    for key, value in record.items():
+        if key in labels and value:
+            sections.append(f"{labels[key]}:\n{value}")
+    return "\n\n".join(sections)
+# ================ #
+# Flask endpoints  #
+# ================ #
+app=Flask(__name__)
+CORS(app)
+# Per-session message lists keyed by session_id, held in a plain in-process dict.
+# NOTE(review): nothing in the visible code ever removes entries from this dict.
+chat_histories={}
+@app.route("/health")
+def health():
+    """Liveness probe that always returns a status of ok."""
+    return dict(status="ok")
+@app.route("/chat",methods=["POST"])
 def chat():
+    """Answer one user question, grounded in the best-matching procedure record.
+
+    Expects a JSON object with "question" (required, non-empty string) and
+    "session_id" (optional, defaults to "default"). A follow-up question reuses
+    the context stored with the previous answer of the same session; any other
+    question triggers a fresh retrieval.
+
+    Returns:
+        JSON {"answer": ...} on success, {"error": ...} with status 400 when no
+        usable question was supplied, or {"error", "detail"} with status 200
+        when the LLM call fails.
+    """
+    # silent=True yields None for a malformed body instead of raising, so every
+    # bad request gets the same JSON 400 below; non-object JSON is treated alike.
+    data=request.get_json(force=True,silent=True)
+    if not isinstance(data,dict): data={}
+    user_query=data.get("question")
+    sid=data.get("session_id","default")
+    # A non-string question would crash classify_followup with an AttributeError.
+    if not isinstance(user_query,str) or not user_query.strip():
+        return jsonify({"error":"No question provided"}),400
+    # Session ids are used as dict keys; coerce so unhashable JSON values cannot crash.
+    if not isinstance(sid,str): sid=str(sid)
+    hist=chat_histories.setdefault(sid,[])
+    if classify_followup(user_query)==0 and hist:
+        context=hist[-1].get("context","")
+    else:
+        idxs=retrieve(user_query,TOP_K)
+        if idxs:
+            # Index the unwrapped record list: metas itself may be a dict that
+            # holds the records under "corpus", where an integer key would fail.
+            meta=corpus[idxs[0]]
+            pid=meta.get("parent_id") or meta.get("nguon")
+            context=get_full_procedure_text_by_parent(pid)
+        else: context=""
+    # NOTE(review): the whole session history is sent on every call, so the prompt grows with the session.
+    history_str="\n".join([f"{m['role']}: {m['content']}" for m in hist])
+    prompt=f"""Bạn là trợ lý eGov-Bot dịch vụ công Việt Nam.
+Trả lời tiếng Việt, chính xác, dựa dữ liệu nếu có.
+Nếu thiếu dữ liệu, nói "Mình chưa có thông tin" và đưa link nguồn trong dữ liệu.
+Lịch sử: {history_str}
 DỮ LIỆU:
+{context}
+CÂU HỎI: {user_query}"""
+    if not generation_model:
+        return jsonify({"answer":"LLM model chưa sẵn sàng (kiểm tra GOOGLE_API_KEY)."})
     try:
+        resp=generation_model.generate_content(prompt)
+        ans=getattr(resp,"text",str(resp))
     except Exception as e:
+        # Status 200 is kept on purpose for existing clients; the error is carried in the body.
+        return jsonify({"error":"LLM call failed","detail":str(e)}),200
+    # Store the context with the model turn so a follow-up question can reuse it.
+    hist.append({'role':'user','content':user_query})
+    hist.append({'role':'model','content':ans,'context':context})
+    return jsonify({"answer":ans})
+if __name__=="__main__":
+    # Listen on all interfaces; the PORT environment variable overrides the default 7860.
+    listen_port=int(os.environ.get("PORT",7860))
+    app.run(host="0.0.0.0",port=listen_port)