__import__("pysqlite3")
import sys
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
import os
import logging
import traceback
import gradio as gr
import pandas as pd
import docx2txt
import chromadb
from chromadb.config import Settings
from shutil import rmtree
# --- CÁC THƯ VIỆN LANGCHAIN ---
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers.ensemble import EnsembleRetriever
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
# --- THƯ VIỆN TỐI ƯU TỐC ĐỘ (CACHE & RERANK) ---
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain.globals import set_llm_cache
from langchain_community.cache import SQLiteCache
# --- SYSTEM CONFIGURATION ---
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")  # Gemini API key; bot refuses to start without it
DATA_PATH = "medical_data"        # Folder scanned for source documents
DB_PATH = "chroma_db"             # Persistent Chroma vector store directory
CACHE_DB_PATH = "llm_cache.db"    # SQLite file holding the LLM response cache
MAX_HISTORY_TURNS = 6             # Chat-history turns kept for context rewriting
FORCE_REBUILD_DB = False          # Set True to wipe and re-index the vector store
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# --- ENABLE LLM CACHING ---
# Answers are persisted to the SQLite cache file; a byte-identical question
# later is served straight from the cache without calling the LLM again.
if not os.path.exists(CACHE_DB_PATH):
    logging.info("Khởi tạo file cache mới.")
set_llm_cache(SQLiteCache(database_path=CACHE_DB_PATH))
def process_excel_file(file_path: str, filename: str) -> list[Document]:
    """Turn each non-empty row of an Excel/CSV file into one Document.

    Args:
        file_path: Path used to read the file (.csv routes to read_csv,
            anything else to read_excel).
        filename: Display name stored in each Document's metadata and
            embedded in the page content header.

    Returns:
        One Document per row that has at least one non-empty cell;
        an empty list if reading or parsing fails (error is logged).
    """
    docs: list[Document] = []
    try:
        if file_path.endswith(".csv"):
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)
        # Drop fully-empty rows, then fill remaining gaps with a marker
        # so every cell stringifies cleanly.
        df.dropna(how='all', inplace=True)
        df.fillna("Không có thông tin", inplace=True)
        for idx, row in df.iterrows():
            content_parts = []
            for col_name, val in row.items():
                clean_val = str(val).strip()
                # Skip blanks and stringified NaN leftovers.
                if clean_val and clean_val.lower() != "nan":
                    content_parts.append(f"{col_name}: {clean_val}")
            if content_parts:
                # FIX: header previously printed the literal "(unknown)"
                # instead of the actual file name passed in `filename`.
                page_content = f"Dữ liệu từ file {filename} (Dòng {idx+1}):\n" + "\n".join(content_parts)
                metadata = {"source": filename, "row": idx + 1, "type": "excel_record"}
                docs.append(Document(page_content=page_content, metadata=metadata))
    except Exception as e:
        # FIX: include the offending file name in the error log.
        logging.error(f"Lỗi xử lý Excel {filename}: {e}")
    return docs
def load_documents_from_folder(folder_path: str) -> list[Document]:
    """Recursively scan *folder_path* and load every supported file.

    Supported extensions: .pdf (PyPDFLoader), .docx (docx2txt),
    .xlsx/.xls/.csv (process_excel_file), .txt/.md (plain UTF-8 read).
    Unreadable files are logged and skipped; the folder is created (and an
    empty list returned) when it does not exist yet.

    Returns:
        All loaded Documents, each carrying the bare file name in
        metadata["source"].
    """
    logging.info(f"--- Bắt đầu quét thư mục: {folder_path} ---")
    documents: list[Document] = []
    if not os.path.exists(folder_path):
        os.makedirs(folder_path, exist_ok=True)
        return []
    for root, _, files in os.walk(folder_path):
        for filename in files:
            file_path = os.path.join(root, filename)
            filename_lower = filename.lower()
            try:
                if filename_lower.endswith(".pdf"):
                    docs = PyPDFLoader(file_path).load()
                    # PyPDFLoader records the full path; keep just the name
                    # so citations match the other loaders.
                    for d in docs:
                        d.metadata["source"] = filename
                    documents.extend(docs)
                elif filename_lower.endswith(".docx"):
                    text = docx2txt.process(file_path)
                    if text.strip():
                        documents.append(Document(page_content=text, metadata={"source": filename}))
                elif filename_lower.endswith((".xlsx", ".xls", ".csv")):
                    documents.extend(process_excel_file(file_path, filename))
                elif filename_lower.endswith((".txt", ".md")):
                    with open(file_path, "r", encoding="utf-8") as f:
                        text = f.read()
                    if text.strip():
                        documents.append(Document(page_content=text, metadata={"source": filename}))
            except Exception as e:
                # FIX: previously logged the literal "(unknown)"; include
                # the actual file name so failures are traceable.
                logging.error(f"Lỗi đọc file {filename}: {e}")
    logging.info(f"Tổng cộng đã load: {len(documents)} tài liệu gốc.")
    return documents
def get_retriever_chain():
    """Build the retrieval pipeline: embeddings -> Chroma -> FlashRank rerank.

    Reuses the persisted Chroma store when it exists and is non-empty;
    otherwise indexes everything under DATA_PATH from scratch.

    Returns:
        A ContextualCompressionRetriever, or None when there are no
        source documents to index.
    """
    logging.info("--- Tải Embedding Model ---")
    # Runs on CPU to keep resource usage low; use 'cuda' when a GPU exists.
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        model_kwargs={'device': 'cpu'},
    )
    chroma_settings = Settings(anonymized_telemetry=False)

    if FORCE_REBUILD_DB and os.path.exists(DB_PATH):
        rmtree(DB_PATH, ignore_errors=True)

    vectorstore = None
    # Cheap validity check: count() instead of loading every record.
    if os.path.exists(DB_PATH) and os.listdir(DB_PATH):
        try:
            candidate = Chroma(
                persist_directory=DB_PATH,
                embedding_function=embedding_model,
                client_settings=chroma_settings,
            )
            if candidate._collection.count() > 0:
                logging.info(f"Đã kết nối DB cũ. Size: {candidate._collection.count()}")
                vectorstore = candidate
        except Exception as e:
            # Corrupt store: wipe it and fall through to a full rebuild.
            logging.error(f"DB lỗi: {e}. Reset DB...")
            rmtree(DB_PATH, ignore_errors=True)
            vectorstore = None

    if not vectorstore:
        logging.info("--- Tạo Index dữ liệu mới ---")
        raw_docs = load_documents_from_folder(DATA_PATH)
        if not raw_docs:
            logging.warning("Không có dữ liệu trong thư mục medical_data.")
            return None
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        vectorstore = Chroma.from_documents(
            documents=splitter.split_documents(raw_docs),
            embedding=embedding_model,
            persist_directory=DB_PATH,
            client_settings=chroma_settings,
        )
        logging.info("Đã lưu VectorStore thành công.")

    # k=6 keeps the initial vector search cheap before the reranker prunes it.
    base_retriever = vectorstore.as_retriever(search_kwargs={"k": 6})
    logging.info("--- Tải Reranker Model (FlashRank) ---")
    reranker = FlashrankRerank(model="ms-marco-MiniLM-L-12-v2")  # ~40MB model
    return ContextualCompressionRetriever(
        base_compressor=reranker,
        base_retriever=base_retriever,
    )
class DeepMedBot:
def __init__(self):
self.rag_chain = None
self.ready = False
if not GOOGLE_API_KEY:
logging.error("⚠️ Thiếu GOOGLE_API_KEY!")
return
try:
self.retr2.5-flash",
temperature=0.3,
google_api_key=GOOGLE_API_KEY
)
self._build_chains()
self.ready = True
logging.info("✅ Bot DeepMed đã sẵn sàng!")
except Exception as e:
logging.error(f"🔥 Lỗi khởi tạo bot: {e}")
logging.debug(traceback.format_exc())
def _build_chains(self):
context_system_prompt = (
"Viết lại câu hỏi của người dùng thành câu đầy đủ ngữ cảnh. "
"KHÔNG trả lời, chỉ viết lại."
)
context_prompt = ChatPromptTemplate.from_messages([
Ba)")
chat_interface = gr.ChatInterface(
fn=gradio_chat_stream,
)
if __name__ == "__main__":
demo.launch() |