change commit

Browse files

Files changed (16) hide show

core/gradio/{gradio_rag_qwen.py → gradio_rag.py} +33 -24
core/hash_file/hash_data_goc.py +76 -95
core/hash_file/hash_file.py +14 -3
core/preprocessing/docling_processor.py +69 -9
core/preprocessing/pdf_parser.py +11 -8
core/rag/chunk.py +67 -54
core/rag/embedding_model.py +24 -10
core/rag/generator.py +16 -7
core/rag/retrival.py +61 -57
core/rag/vector_store.py +52 -30
evaluation/eval_utils.py +19 -4
evaluation/ragas_eval.py +32 -57
scripts/build_data.py +43 -41
scripts/run_eval.py +7 -5
test/parse_data_hash_test.py +0 -102
test/test_chunk.py +17 -7

core/gradio/{gradio_rag_qwen.py → gradio_rag.py} RENAMED Viewed

@@ -1,3 +1,8 @@
 from __future__ import annotations
 import os
 import sys
@@ -8,6 +13,7 @@ import gradio as gr
 from dotenv import find_dotenv, load_dotenv
 from openai import OpenAI
 REPO_ROOT = Path(__file__).resolve().parents[2]
 if str(REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(REPO_ROOT))
@@ -15,14 +21,18 @@ if str(REPO_ROOT) not in sys.path:
 @dataclass
 class GradioConfig:
     server_host: str = "127.0.0.1"
     server_port: int = 7860
 def _load_env() -> None:
     dotenv_path = find_dotenv(usecwd=True) or ""
     load_dotenv(dotenv_path=dotenv_path or None, override=False)
 from core.rag.embedding_model import EmbeddingConfig, QwenEmbeddings
 from core.rag.vector_store import ChromaConfig, ChromaVectorDB
 from core.rag.retrival import Retriever, RetrievalMode, get_retrieval_config
@@ -30,19 +40,20 @@ from core.rag.generator import RAGContextBuilder, build_context, build_prompt, S
 _load_env()
-RETRIEVAL_MODE = RetrievalMode.HYBRID_RERANK  # Test with debug logs
-# LLM Config
-LLM_MODEL = os.getenv("LLM_MODEL", "qwen/qwen3-32b")
-LLM_API_BASE = "https://api.groq.com/openai/v1"
-LLM_API_KEY_ENV = "GROQ_API_KEY"
-# Load retrieval config
 GRADIO_CFG = GradioConfig()
 RETRIEVAL_CFG = get_retrieval_config()
 class AppState:
     def __init__(self) -> None:
         self.db: Optional[ChromaVectorDB] = None
         self.retriever: Optional[Retriever] = None
@@ -50,39 +61,38 @@ class AppState:
         self.client: Optional[OpenAI] = None
-STATE = AppState()
 def _init_resources() -> None:
     if STATE.db is not None:
         return
     print(f" Đang khởi tạo Database & Re-ranker...")
     print(f" Retrieval Mode: {RETRIEVAL_MODE.value}")
     emb = QwenEmbeddings(EmbeddingConfig())
     db_cfg = ChromaConfig()
-    STATE.db = ChromaVectorDB(
-        embedder=emb,
-        config=db_cfg,
-    )
     STATE.retriever = Retriever(vector_db=STATE.db)
-    # LLM Client
     api_key = (os.getenv(LLM_API_KEY_ENV) or "").strip()
     if not api_key:
         raise RuntimeError(f"Missing {LLM_API_KEY_ENV}")
     STATE.client = OpenAI(api_key=api_key, base_url=LLM_API_BASE)
-    # RAGContextBuilder - chỉ retrieve
     STATE.rag_builder = RAGContextBuilder(retriever=STATE.retriever)
     print(" Đã sẵn sàng!")
 def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
     _init_resources()
     assert STATE.db is not None
@@ -90,7 +100,7 @@ def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
     assert STATE.retriever is not None
     assert STATE.rag_builder is not None
-    # Bước 1: Retrieve và prepare context
     prepared = STATE.rag_builder.retrieve_and_prepare(
         message,
         k=RETRIEVAL_CFG.top_k,
@@ -103,7 +113,7 @@ def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
         yield "Xin lỗi, tôi không tìm thấy thông tin phù hợp trong dữ liệu."
         return
-    # Bước 2: Gọi LLM streaming để generate answer
     completion = STATE.client.chat.completions.create(
         model=LLM_MODEL,
         messages=[{"role": "user", "content": prepared["prompt"]}],
@@ -112,6 +122,7 @@ def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
         stream=True,
     )
     acc = ""
     for chunk in completion:
         delta = getattr(chunk.choices[0].delta, "content", "") or ""
@@ -119,7 +130,7 @@ def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
             acc += delta
             yield acc
-    # Debug info with mode indicator
     debug_info = f"\n\n---\n\n**Retrieved (Top {len(results)} | Mode: {RETRIEVAL_MODE.value})**\n\n"
     for i, r in enumerate(results, 1):
         md = r.get("metadata", {})
@@ -127,7 +138,7 @@ def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
         rerank_score = r.get("rerank_score")
         distance = r.get("distance")
-        # Extract metadata
         source = md.get("source_file", "N/A")
         doc_type = md.get("document_type", "N/A")
         header = md.get("header_path", "")
@@ -135,7 +146,7 @@ def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
         program = md.get("program_name", "")
         issued_year = md.get("issued_year", "")
-        # Show relevant scores based on mode
         score_info = ""
         if rerank_score is not None:
             score_info += f"Rerank: `{rerank_score:.4f}` "
@@ -144,7 +155,7 @@ def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
         if not score_info:
             score_info = f"Rank: `{r.get('final_rank', i)}`"
-        # Build metadata line
         meta_parts = [f"**Nguồn:** {source}", f"**Loại:** {doc_type}"]
         if issued_year:
             meta_parts.append(f"**Năm:** {issued_year}")
@@ -162,9 +173,7 @@ def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
     yield acc + debug_info
-# Create Gradio interface
 demo = gr.ChatInterface(
     fn=rag_chat,
     title=f"HUST RAG Assistant",

+"""
+Giao diện Gradio cho hệ thống RAG - Trợ lý học vụ HUST.
+Cho phép người dùng đặt câu hỏi và nhận câu trả lời từ hệ thống RAG.
+"""
 from __future__ import annotations
 import os
 import sys
 from dotenv import find_dotenv, load_dotenv
 from openai import OpenAI
+# Thêm thư mục gốc vào Python path
 REPO_ROOT = Path(__file__).resolve().parents[2]
 if str(REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(REPO_ROOT))
 @dataclass
 class GradioConfig:
+    """Cấu hình Gradio server: host và port."""
     server_host: str = "127.0.0.1"
     server_port: int = 7860
 def _load_env() -> None:
+    """Tải biến môi trường từ file .env."""
     dotenv_path = find_dotenv(usecwd=True) or ""
     load_dotenv(dotenv_path=dotenv_path or None, override=False)
+# Import các module RAG
 from core.rag.embedding_model import EmbeddingConfig, QwenEmbeddings
 from core.rag.vector_store import ChromaConfig, ChromaVectorDB
 from core.rag.retrival import Retriever, RetrievalMode, get_retrieval_config
 _load_env()
+# Cấu hình retrieval và LLM
+RETRIEVAL_MODE = RetrievalMode.HYBRID_RERANK  # Chế độ tìm kiếm
+LLM_MODEL = os.getenv("LLM_MODEL", "qwen/qwen3-32b")  # Model LLM
+LLM_API_BASE = "https://api.groq.com/openai/v1"  # Groq API endpoint
+LLM_API_KEY_ENV = "GROQ_API_KEY"  # Biến môi trường chứa API key
+# Khởi tạo cấu hình
 GRADIO_CFG = GradioConfig()
 RETRIEVAL_CFG = get_retrieval_config()
 class AppState:
+    """Quản lý trạng thái ứng dụng: database, retriever, LLM client."""
     def __init__(self) -> None:
         self.db: Optional[ChromaVectorDB] = None
         self.retriever: Optional[Retriever] = None
         self.client: Optional[OpenAI] = None
+STATE = AppState()  # Singleton state
 def _init_resources() -> None:
+    """Khởi tạo các tài nguyên: DB, Retriever, LLM client (lazy init)."""
     if STATE.db is not None:
         return
     print(f" Đang khởi tạo Database & Re-ranker...")
     print(f" Retrieval Mode: {RETRIEVAL_MODE.value}")
+    # Khởi tạo embedding và database
     emb = QwenEmbeddings(EmbeddingConfig())
     db_cfg = ChromaConfig()
+    STATE.db = ChromaVectorDB(embedder=emb, config=db_cfg)
     STATE.retriever = Retriever(vector_db=STATE.db)
+    # Khởi tạo LLM client
     api_key = (os.getenv(LLM_API_KEY_ENV) or "").strip()
     if not api_key:
         raise RuntimeError(f"Missing {LLM_API_KEY_ENV}")
     STATE.client = OpenAI(api_key=api_key, base_url=LLM_API_BASE)
+    # Khởi tạo RAG builder
     STATE.rag_builder = RAGContextBuilder(retriever=STATE.retriever)
     print(" Đã sẵn sàng!")
 def rag_chat(message: str, history: List[Dict[str, str]] | None = None):
+    """Xử lý chat: retrieve documents -> gọi LLM -> stream response"""
     _init_resources()
     assert STATE.db is not None
     assert STATE.retriever is not None
     assert STATE.rag_builder is not None
+    # Retrieve và chuẩn bị context
     prepared = STATE.rag_builder.retrieve_and_prepare(
         message,
         k=RETRIEVAL_CFG.top_k,
         yield "Xin lỗi, tôi không tìm thấy thông tin phù hợp trong dữ liệu."
         return
+    # Gọi LLM với streaming
     completion = STATE.client.chat.completions.create(
         model=LLM_MODEL,
         messages=[{"role": "user", "content": prepared["prompt"]}],
         stream=True,
     )
+    # Stream response
     acc = ""
     for chunk in completion:
         delta = getattr(chunk.choices[0].delta, "content", "") or ""
             acc += delta
             yield acc
+    # Thêm debug info về các documents đã retrieve
     debug_info = f"\n\n---\n\n**Retrieved (Top {len(results)} | Mode: {RETRIEVAL_MODE.value})**\n\n"
     for i, r in enumerate(results, 1):
         md = r.get("metadata", {})
         rerank_score = r.get("rerank_score")
         distance = r.get("distance")
+        # Trích xuất metadata
         source = md.get("source_file", "N/A")
         doc_type = md.get("document_type", "N/A")
         header = md.get("header_path", "")
         program = md.get("program_name", "")
         issued_year = md.get("issued_year", "")
+        # Format score
         score_info = ""
         if rerank_score is not None:
             score_info += f"Rerank: `{rerank_score:.4f}` "
         if not score_info:
             score_info = f"Rank: `{r.get('final_rank', i)}`"
+        # Format metadata
         meta_parts = [f"**Nguồn:** {source}", f"**Loại:** {doc_type}"]
         if issued_year:
             meta_parts.append(f"**Năm:** {issued_year}")
     yield acc + debug_info
+# Tạo giao diện Gradio
 demo = gr.ChatInterface(
     fn=rag_chat,
     title=f"HUST RAG Assistant",

core/hash_file/hash_data_goc.py CHANGED Viewed

@@ -1,13 +1,11 @@
 import sys
-import os
 import json
 import shutil
 from pathlib import Path
-current_file = Path(__file__).resolve()
-project_root = current_file.parent.parent.parent
-if str(project_root) not in sys.path:
-    sys.path.insert(0, str(project_root))
 from core.hash_file.hash_file import HashProcessor
@@ -16,130 +14,113 @@ HF_RAW_PDF_REPO = "hungnha/Do_An_Dataset"
 def download_from_hf(cache_dir: Path) -> Path:
-    try:
-        from huggingface_hub import snapshot_download
-    except ImportError:
-        print("Installing huggingface_hub...")
-        os.system("pip install huggingface_hub")
-        from huggingface_hub import snapshot_download
     if cache_dir.exists() and any(cache_dir.iterdir()):
         print(f"Cache đã tồn tại: {cache_dir}")
         return cache_dir / "data_rag"
-    print(f"Đang tải PDF từ HuggingFace: {HF_RAW_PDF_REPO}")
     snapshot_download(
         repo_id=HF_RAW_PDF_REPO,
         repo_type="dataset",
         local_dir=str(cache_dir),
         local_dir_use_symlinks=False,
     )
-    print("Tải xong!")
     return cache_dir / "data_rag"
 def main():
     import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--source", type=str, help="Đường dẫn local tới PDF (bỏ qua để tải từ HF)")
     parser.add_argument("--download-only", action="store_true", help="Chỉ tải về, không copy")
     args = parser.parse_args()
-    data_dir = project_root / "data"
     files_dir = data_dir / "files"
     files_dir.mkdir(parents=True, exist_ok=True)
-    # Xác định nguồn PDF
     if args.source:
         source_root = Path(args.source)
         if not source_root.exists():
-            print(f"Thư mục nguồn không tồn tại: {source_root}")
-            return
     else:
         # Tải từ HuggingFace
-        cache_dir = data_dir / "raw_pdf_cache"
-        source_root = download_from_hf(cache_dir)
         if args.download_only:
-            print(f"PDF đã cache tại: {source_root}")
-            return
     if not source_root.exists():
-        print(f"Không tìm thấy thư mục PDF: {source_root}")
-        return
-    hash_processor = HashProcessor(verbose=False)
-    hash_file_path = data_dir / "hash_data_goc_index.json"
-    existing_hashes = {}
-    if hash_file_path.exists():
-        with open(hash_file_path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
-            existing_hashes = {item['filename']: item['hash'] for item in data.get('train', [])}
-        print(f"Đã tải {len(existing_hashes)} hash từ index cũ")
-    print(f"Đang quét file từ: {source_root}")
-    pdf_files = list(source_root.rglob("*.pdf"))
-    print(f"Tìm thấy {len(pdf_files)} files PDF\n")
-    hash_results = []
-    skipped = 0
-    processed = 0
-    for idx, source_path in enumerate(pdf_files):
-        relative_path = source_path.relative_to(source_root)
-        filename = str(relative_path)
-        dest_path = files_dir / relative_path
-        dest_path.parent.mkdir(parents=True, exist_ok=True)
-        # Kiểm tra file đã tồn tại và hash khớp chưa
-        if dest_path.exists() and filename in existing_hashes:
-            current_hash = hash_processor.get_file_hash(str(dest_path))
-            if current_hash == existing_hashes[filename]:
-                hash_results.append({
-                    'filename': filename,
-                    'hash': current_hash,
-                    'index': idx
-                })
-                skipped += 1
-                continue
-        try:
-            shutil.copy2(source_path, dest_path)
-            file_hash = hash_processor.get_file_hash(str(dest_path))
-            if file_hash is None:
-                print(f"Lỗi tính hash cho file {filename}")
-                continue
-            hash_results.append({
-                'filename': filename,
-                'hash': file_hash,
-                'index': idx
-            })
-            processed += 1
-            if (idx + 1) % 10 == 0:
-                print(f"Processed {idx + 1}/{len(pdf_files)} files")
-        except Exception as e:
-            print(f"Lỗi khi xử lý file {filename}: {e}")
-            continue
-    output_data = {
-        'train': hash_results,
-        'total_files': len(hash_results)
-    }
-    with open(hash_file_path, 'w', encoding='utf-8') as f:
-        json.dump(output_data, f, ensure_ascii=False, indent=2)
-    print(f"\nHoàn tất!")
-    print(f"Tổng số file: {len(hash_results)}")
-    print(f"Đã xử lý mới: {processed}")
-    print(f"Đã bỏ qua (trùng hash): {skipped}")
-    print(f"File index: {hash_file_path}")
 if __name__ == "__main__":

 import sys
 import json
 import shutil
 from pathlib import Path
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
 from core.hash_file.hash_file import HashProcessor
 def download_from_hf(cache_dir: Path) -> Path:
+    """Tải PDF từ HuggingFace, trả về đường dẫn tới folder data_rag."""
+    from huggingface_hub import snapshot_download
+    # Kiểm tra cache đã tồn tại chưa
     if cache_dir.exists() and any(cache_dir.iterdir()):
         print(f"Cache đã tồn tại: {cache_dir}")
         return cache_dir / "data_rag"
+    print(f"Đang tải từ HuggingFace: {HF_RAW_PDF_REPO}")
     snapshot_download(
         repo_id=HF_RAW_PDF_REPO,
         repo_type="dataset",
         local_dir=str(cache_dir),
         local_dir_use_symlinks=False,
     )
     return cache_dir / "data_rag"
+def load_existing_hashes(path: Path) -> dict:
+    """Load the previous hash index from a JSON file.
+
+    The file is read as a JSON object whose ``train`` list holds entries
+    with ``filename`` and ``hash`` keys.
+
+    Args:
+        path: Location of the JSON index file.
+
+    Returns:
+        Mapping of filename to hash. Empty when the file does not exist,
+        cannot be read or parsed, or does not have the expected shape.
+    """
+    if not path.exists():
+        return {}
+    try:
+        data = json.loads(path.read_text(encoding='utf-8'))
+        return {item['filename']: item['hash'] for item in data.get('train', [])}
+    except Exception as e:
+        # Best-effort: a damaged index only means every file is treated as
+        # new, so fall back to an empty mapping, but report the problem
+        # instead of hiding it.
+        print(f"Không đọc được hash index {path}: {e}")
+        return {}
+def process_pdfs(source_root: Path, dest_dir: Path, existing_hashes: dict) -> tuple:
+    """Copy every PDF under ``source_root`` into ``dest_dir`` and hash the copies.
+
+    The directory layout below ``source_root`` is reproduced below
+    ``dest_dir``. A file is skipped (not copied again) when its destination
+    already exists and the destination's hash equals the one recorded in
+    ``existing_hashes`` for the same relative path.
+
+    Args:
+        source_root: Directory searched recursively for ``*.pdf`` files.
+        dest_dir: Directory that receives the copies.
+        existing_hashes: Mapping of relative path to previously recorded hash.
+
+    Returns:
+        A ``(results, processed, skipped)`` tuple. ``results`` is a list of
+        dicts with ``filename``, ``hash`` and ``index`` keys (``index`` is the
+        file's position in the discovered list); ``processed`` counts files
+        copied and hashed in this run; ``skipped`` counts unchanged files.
+    """
+    hasher = HashProcessor(verbose=False)
+    pdf_files = list(source_root.rglob("*.pdf"))
+    print(f"Tìm thấy {len(pdf_files)} file PDF\n")
+    results, processed, skipped = [], 0, 0
+    for idx, src in enumerate(pdf_files):
+        rel_path = str(src.relative_to(source_root))
+        dest = dest_dir / rel_path
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        # Skip files whose existing copy still matches the recorded hash.
+        if dest.exists() and rel_path in existing_hashes:
+            current_hash = hasher.get_file_hash(str(dest))
+            if current_hash == existing_hashes[rel_path]:
+                results.append({'filename': rel_path, 'hash': current_hash, 'index': idx})
+                skipped += 1
+                continue
+        # Copy the file, then hash the copy.
+        try:
+            shutil.copy2(src, dest)
+            file_hash = hasher.get_file_hash(str(dest))
+            if file_hash:
+                results.append({'filename': rel_path, 'hash': file_hash, 'index': idx})
+                processed += 1
+            else:
+                # Without a hash the file is left out of the index; say so
+                # rather than dropping it silently.
+                print(f"Lỗi tính hash cho file {rel_path}")
+        except Exception as e:
+            # One failing file must not abort the whole run.
+            print(f"Lỗi: {rel_path} - {e}")
+        # Progress report; not reached for skipped files because of the
+        # `continue` above.
+        if (idx + 1) % 20 == 0:
+            print(f"Tiến độ: {idx + 1}/{len(pdf_files)}")
+    return results, processed, skipped
 def main():
     import argparse
+    parser = argparse.ArgumentParser(description="Tải PDF và tạo hash index")
+    parser.add_argument("--source", type=str, help="Đường dẫn local tới PDFs (bỏ qua tải HF)")
     parser.add_argument("--download-only", action="store_true", help="Chỉ tải về, không copy")
     args = parser.parse_args()
+    data_dir = PROJECT_ROOT / "data"
     files_dir = data_dir / "files"
     files_dir.mkdir(parents=True, exist_ok=True)
+    hash_file = data_dir / "hash_data_goc_index.json"
+    # Xác định thư mục nguồn
     if args.source:
         source_root = Path(args.source)
         if not source_root.exists():
+            return print(f"Không tìm thấy thư mục nguồn: {source_root}")
     else:
         # Tải từ HuggingFace
+        source_root = download_from_hf(data_dir / "raw_pdf_cache")
         if args.download_only:
+            return print(f"PDF đã cache tại: {source_root}")
     if not source_root.exists():
+        return print(f"Không tìm thấy thư mục PDF: {source_root}")
+    # Xử lý
+    existing = load_existing_hashes(hash_file)
+    print(f"Đã tải {len(existing)} hash từ index cũ")
+    results, processed, skipped = process_pdfs(source_root, files_dir, existing)
+    # Lưu kết quả
+    hash_file.write_text(json.dumps({
+        'train': results,
+        'total_files': len(results)
+    }, ensure_ascii=False, indent=2), encoding='utf-8')
+    print(f"\nHoàn tất! Tổng: {len(results)} | Mới: {processed} | Bỏ qua: {skipped}")
+    print(f"File index: {hash_file}")
 if __name__ == "__main__":

core/hash_file/hash_file.py CHANGED Viewed

@@ -9,19 +9,23 @@ from pathlib import Path
 from typing import Dict, List, Optional
 from datetime import datetime
-# Constants
-CHUNK_SIZE = 8192  # 8KB chunks for reading files
 DEFAULT_FILE_EXTENSION = '.pdf'
 class HashProcessor:
     def __init__(self, verbose: bool = True):
         self.verbose = verbose
         self.logger = logging.getLogger(__name__)
         if not verbose:
             self.logger.setLevel(logging.WARNING)
     def get_file_hash(self, path: str) -> Optional[str]:
         h = hashlib.sha256()
         try:
             with open(path, "rb") as f:
@@ -41,6 +45,7 @@ class HashProcessor:
         file_extension: str = DEFAULT_FILE_EXTENSION,
         recursive: bool = False
     ) -> Dict[str, List[Dict[str, str]]]:
         source_path = Path(source_dir)
         if not source_path.exists():
             raise FileNotFoundError(f"Thư mục không tồn tại: {source_dir}")
@@ -73,6 +78,7 @@ class HashProcessor:
         return hash_to_files
     def load_processed_index(self, index_file: str) -> Dict:
         if os.path.exists(index_file):
             try:
                 with open(index_file, "r", encoding="utf-8") as f:
@@ -86,6 +92,10 @@ class HashProcessor:
         return {}
     def save_processed_index(self, index_file: str, processed_hashes: Dict) -> None:
         temp_name = None
         try:
             os.makedirs(os.path.dirname(index_file), exist_ok=True)
@@ -106,8 +116,9 @@ class HashProcessor:
                 os.remove(temp_name)
     def get_current_timestamp(self) -> str:
         return datetime.now().isoformat()
     def get_string_hash(self, text: str) -> str:
         return hashlib.sha256(text.encode('utf-8')).hexdigest()

 from typing import Dict, List, Optional
 from datetime import datetime
+# Hằng số
+CHUNK_SIZE = 8192  # Đọc file theo chunk 8KB
 DEFAULT_FILE_EXTENSION = '.pdf'
 class HashProcessor:
+    """Lớp xử lý hash cho files - dùng để phát hiện thay đổi và tránh xử lý lại."""
     def __init__(self, verbose: bool = True):
+        """Khởi tạo HashProcessor."""
         self.verbose = verbose
         self.logger = logging.getLogger(__name__)
         if not verbose:
             self.logger.setLevel(logging.WARNING)
     def get_file_hash(self, path: str) -> Optional[str]:
+        """Tính SHA256 hash của một file."""
         h = hashlib.sha256()
         try:
             with open(path, "rb") as f:
         file_extension: str = DEFAULT_FILE_EXTENSION,
         recursive: bool = False
     ) -> Dict[str, List[Dict[str, str]]]:
+        """Quét thư mục và tính hash cho tất cả files."""
         source_path = Path(source_dir)
         if not source_path.exists():
             raise FileNotFoundError(f"Thư mục không tồn tại: {source_dir}")
         return hash_to_files
     def load_processed_index(self, index_file: str) -> Dict:
+        """Đọc file index đã xử lý từ JSON."""
         if os.path.exists(index_file):
             try:
                 with open(index_file, "r", encoding="utf-8") as f:
         return {}
     def save_processed_index(self, index_file: str, processed_hashes: Dict) -> None:
+        """Lưu index đã xử lý vào file JSON (atomic write).
+        Ghi vào file tạm trước, sau đó rename để đảm bảo an toàn.
+        """
         temp_name = None
         try:
             os.makedirs(os.path.dirname(index_file), exist_ok=True)
                 os.remove(temp_name)
     def get_current_timestamp(self) -> str:
+        """Lấy timestamp hiện tại theo định dạng ISO."""
         return datetime.now().isoformat()
     def get_string_hash(self, text: str) -> str:
+        """Tính SHA256 hash của một chuỗi text."""
         return hashlib.sha256(text.encode('utf-8')).hexdigest()

core/preprocessing/docling_processor.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import re
 import gc
 import signal
 import logging
 from datetime import datetime
@@ -12,19 +13,35 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructur
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 class DoclingProcessor:
     def __init__(self, output_dir: str, use_ocr: bool = True, timeout: int = 300, images_scale: float = 3.0):
         self.output_dir = output_dir
         self.timeout = timeout
         self.logger = logging.getLogger(__name__)
         os.makedirs(output_dir, exist_ok=True)
-        # Pipeline options
         opts = PdfPipelineOptions(do_ocr=use_ocr, do_table_structure=True)
         opts.table_structure_options = TableStructureOptions(do_cell_matching=True, mode=TableFormerMode.ACCURATE)
         opts.images_scale = images_scale
         if use_ocr:
             ocr = EasyOcrOptions()
             ocr.lang = ["vi"]
@@ -34,39 +51,69 @@ class DoclingProcessor:
         self.converter = DocumentConverter(format_options={
             InputFormat.PDF: FormatOption(backend=PyPdfiumDocumentBackend, pipeline_cls=StandardPdfPipeline, pipeline_options=opts)
         })
-        self.logger.info(f"🔧 Docling | OCR={use_ocr} | Table=accurate | Scale={images_scale} | timeout={timeout}s")
     def clean_markdown(self, text: str) -> str:
         text = re.sub(r'\n\s*Trang\s+\d+\s*\n', '\n', text)
         return re.sub(r'\n{3,}', '\n\n', text).strip()
     def parse_document(self, file_path: str) -> str | None:
         if not os.path.exists(file_path):
             return None
         filename = os.path.basename(file_path)
         try:
             signal.signal(signal.SIGALRM, lambda s, f: (_ for _ in ()).throw(TimeoutError()))
             signal.alarm(self.timeout)
             result = self.converter.convert(file_path)
             md = result.document.export_to_markdown(image_placeholder="")
             signal.alarm(0)
             md = self.clean_markdown(md)
             return f"---\nfilename: {filename}\nfilepath: {file_path}\npage_count: {len(result.document.pages)}\nprocessed_at: {datetime.now().isoformat()}\n---\n\n{md}"
         except TimeoutError:
-            self.logger.warning(f" Timeout: {filename}")
             signal.alarm(0)
             return None
         except Exception as e:
-            self.logger.error(f" Failed: {filename}: {e}")
             signal.alarm(0)
             return None
     def parse_directory(self, source_dir: str) -> dict:
         source_path = Path(source_dir)
         pdf_files = list(source_path.rglob("*.pdf"))
-        self.logger.info(f" Found {len(pdf_files)} PDFs in {source_dir}")
         results = {"total": len(pdf_files), "parsed": 0, "skipped": 0, "errors": 0}
         for i, fp in enumerate(pdf_files):
             try:
                 rel = fp.relative_to(source_path)
@@ -75,20 +122,33 @@ class DoclingProcessor:
             out = Path(self.output_dir) / rel.with_suffix(".md")
             out.parent.mkdir(parents=True, exist_ok=True)
-            if out.exists():
                 results["skipped"] += 1
                 continue
-            md = self.parse_document(str(fp))
             if md:
                 out.write_text(md, encoding="utf-8")
                 results["parsed"] += 1
             else:
                 results["errors"] += 1
             if (i + 1) % 10 == 0:
                 gc.collect()
-                self.logger.info(f" {i+1}/{len(pdf_files)} (skip: {results['skipped']})")
-        self.logger.info(f" Done: {results['parsed']} parsed, {results['skipped']} skipped, {results['errors']} errors")
         return results

 import os
 import re
 import gc
+import sys
 import signal
 import logging
 from datetime import datetime
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+# Thêm project root vào path để import HashProcessor
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+from core.hash_file.hash_file import HashProcessor
 class DoclingProcessor:
+    """Chuyển đổi PDF sang Markdown bằng Docling."""
     def __init__(self, output_dir: str, use_ocr: bool = True, timeout: int = 300, images_scale: float = 3.0):
+        """Khởi tạo processor với cấu hình OCR và table extraction."""
         self.output_dir = output_dir
         self.timeout = timeout
         self.logger = logging.getLogger(__name__)
+        self.hasher = HashProcessor(verbose=False)
         os.makedirs(output_dir, exist_ok=True)
+        # File lưu hash index
+        self.hash_index_path = Path(output_dir) / "docling_hash_index.json"
+        self.hash_index = self.hasher.load_processed_index(str(self.hash_index_path))
+        # Cấu hình pipeline PDF
         opts = PdfPipelineOptions(do_ocr=use_ocr, do_table_structure=True)
         opts.table_structure_options = TableStructureOptions(do_cell_matching=True, mode=TableFormerMode.ACCURATE)
         opts.images_scale = images_scale
+        # Cấu hình OCR tiếng Việt
         if use_ocr:
             ocr = EasyOcrOptions()
             ocr.lang = ["vi"]
         self.converter = DocumentConverter(format_options={
             InputFormat.PDF: FormatOption(backend=PyPdfiumDocumentBackend, pipeline_cls=StandardPdfPipeline, pipeline_options=opts)
         })
+        self.logger.info(f"Docling | OCR={use_ocr} | Table=accurate | Scale={images_scale} | timeout={timeout}s")
     def clean_markdown(self, text: str) -> str:
+        """Xóa số trang và khoảng trắng thừa."""
         text = re.sub(r'\n\s*Trang\s+\d+\s*\n', '\n', text)
         return re.sub(r'\n{3,}', '\n\n', text).strip()
+    def _should_process(self, pdf_path: str, output_path: Path) -> bool:
+        """Decide whether a PDF has to be converted (again).
+
+        Args:
+            pdf_path: Path of the source PDF; also the key used in the hash index.
+            output_path: Path where the converted output would be written.
+
+        Returns:
+            True when the output does not exist yet, when the PDF's hash
+            cannot be computed, or when the hash differs from the one stored
+            in the index; False when the stored hash matches.
+        """
+        if output_path.exists():
+            digest = self.hasher.get_file_hash(pdf_path)
+            if digest:
+                # Only a matching recorded hash allows the file to be skipped.
+                entry = self.hash_index.get(pdf_path, {})
+                return entry.get("hash") != digest
+        return True
+    def _save_hash(self, pdf_path: str, file_hash: str) -> None:
+        """Record the hash of a processed PDF in the in-memory index.
+
+        The entry is keyed by ``pdf_path`` and stores the hash together with
+        the timestamp supplied by the hasher. Nothing is written to disk here.
+        """
+        stamp = self.hasher.get_current_timestamp()
+        self.hash_index[pdf_path] = {"hash": file_hash, "processed_at": stamp}
     def parse_document(self, file_path: str) -> str | None:
+        """Chuyển đổi 1 file PDF sang Markdown với timeout."""
         if not os.path.exists(file_path):
             return None
         filename = os.path.basename(file_path)
         try:
+            # Đặt timeout để tránh treo
             signal.signal(signal.SIGALRM, lambda s, f: (_ for _ in ()).throw(TimeoutError()))
             signal.alarm(self.timeout)
             result = self.converter.convert(file_path)
             md = result.document.export_to_markdown(image_placeholder="")
             signal.alarm(0)
             md = self.clean_markdown(md)
+            # Thêm frontmatter metadata
             return f"---\nfilename: {filename}\nfilepath: {file_path}\npage_count: {len(result.document.pages)}\nprocessed_at: {datetime.now().isoformat()}\n---\n\n{md}"
         except TimeoutError:
+            self.logger.warning(f"Timeout: {filename}")
             signal.alarm(0)
             return None
         except Exception as e:
+            self.logger.error(f"Lỗi: {filename}: {e}")
             signal.alarm(0)
             return None
     def parse_directory(self, source_dir: str) -> dict:
+        """Xử lý toàn bộ thư mục PDF, bỏ qua file không thay đổi (dựa trên hash)."""
         source_path = Path(source_dir)
         pdf_files = list(source_path.rglob("*.pdf"))
+        self.logger.info(f"Tìm thấy {len(pdf_files)} file PDF trong {source_dir}")
         results = {"total": len(pdf_files), "parsed": 0, "skipped": 0, "errors": 0}
         for i, fp in enumerate(pdf_files):
             try:
                 rel = fp.relative_to(source_path)
             out = Path(self.output_dir) / rel.with_suffix(".md")
             out.parent.mkdir(parents=True, exist_ok=True)
+            pdf_path = str(fp)
+            # Kiểm tra hash để quyết định có cần xử lý không
+            if not self._should_process(pdf_path, out):
                 results["skipped"] += 1
                 continue
+            # Tính hash trước khi xử lý
+            file_hash = self.hasher.get_file_hash(pdf_path)
+            md = self.parse_document(pdf_path)
             if md:
                 out.write_text(md, encoding="utf-8")
                 results["parsed"] += 1
+                # Lưu hash sau khi xử lý thành công
+                if file_hash:
+                    self._save_hash(pdf_path, file_hash)
             else:
                 results["errors"] += 1
+            # Dọn memory mỗi 10 files
             if (i + 1) % 10 == 0:
                 gc.collect()
+                self.logger.info(f"{i+1}/{len(pdf_files)} (bỏ qua: {results['skipped']})")
+        # Lưu hash index sau khi xử lý xong
+        self.hasher.save_processed_index(str(self.hash_index_path), self.hash_index)
+        self.logger.info(f"Xong: {results['parsed']} đã xử lý, {results['skipped']} bỏ qua, {results['errors']} lỗi")
         return results

core/preprocessing/pdf_parser.py CHANGED Viewed

@@ -1,19 +1,22 @@
 from docling_processor import DoclingProcessor
-PDF_FILE = "data/data_raw/quyet_dinh/quy-dinh-chuan-ngoai-ngu-2021.pdf"
-SOURCE_DIR = "data/data_raw"
-OUTPUT_DIR = "data"
-USE_OCR = False
 if __name__ == "__main__":
     processor = DoclingProcessor(OUTPUT_DIR, use_ocr=USE_OCR)
     if PDF_FILE:
-        print(f"Parsing: {PDF_FILE}")
         result = processor.parse_document(PDF_FILE)
-        print(f"Done: {result}" if result else "Skipped/failed")
     else:
-        print(f"Parsing: {SOURCE_DIR}")
         r = processor.parse_directory(SOURCE_DIR)
-        print(f"Total: {r['total']} | OK: {r['parsed']} | Skip: {r['skipped']} | Err: {r['errors']}")

 from docling_processor import DoclingProcessor
+# Cấu hình đường dẫn
+PDF_FILE = ""  # File đơn lẻ (để trống nếu muốn parse cả thư mục)
+SOURCE_DIR = "data/data_raw"  # Thư mục chứa PDFs
+OUTPUT_DIR = "data"           # Thư mục xuất Markdown
+USE_OCR = False               # Bật OCR cho PDF scan
 if __name__ == "__main__":
     # One processor instance serves both the single-file and the directory mode.
     docling = DoclingProcessor(OUTPUT_DIR, use_ocr=USE_OCR)
     if not PDF_FILE:
         # No single file configured: process every PDF under SOURCE_DIR.
         print(f"Đang xử lý thư mục: {SOURCE_DIR}")
         stats = docling.parse_directory(SOURCE_DIR)
         print(f"Tổng: {stats['total']} | Thành công: {stats['parsed']} | Bỏ qua: {stats['skipped']} | Lỗi: {stats['errors']}")
     else:
         # A single file is configured: parse only that file.
         # NOTE(review): the return value is only used as a success flag here and
         # is not written anywhere by this script - confirm parse_document
         # persists its own output.
         print(f"Đang xử lý: {PDF_FILE}")
         parsed = docling.parse_document(PDF_FILE)
         status = "Xong!" if parsed else "Lỗi hoặc bỏ qua"
         print(status)

core/rag/chunk.py CHANGED Viewed

@@ -10,37 +10,41 @@ from llama_index.core import Document
 from llama_index.core.node_parser import MarkdownNodeParser, SentenceSplitter
 from llama_index.core.schema import BaseNode, TextNode
-# Config
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 150
 MIN_CHUNK_SIZE = 200
 TABLE_ROWS_PER_CHUNK = 15
-# Small-to-Big Config
 ENABLE_TABLE_SUMMARY = True
-MIN_TABLE_ROWS_FOR_SUMMARY = 0  # Summarize ALL tables regardless of size
-SUMMARY_MODEL = "nex-agi/DeepSeek-V3.1-Nex-N1"
-SILICONFLOW_BASE_URL = "https://api.siliconflow.com/v1"
-# Regex
 COURSE_PATTERN = re.compile(r"Học\s*phần\s+(.+?)\s*\(\s*m[ãa]\s+([^\)]+)\)", re.I | re.DOTALL)
 TABLE_PLACEHOLDER = re.compile(r"__TBL_(\d+)__")
 HEADER_KEYWORDS = {'TT', 'STT', 'MÃ', 'TÊN', 'KHỐI', 'SỐ', 'ID', 'NO', '#'}
 FRONTMATTER_PATTERN = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.DOTALL)
-# Pattern để trích xuất số bảng và tiêu đề (ví dụ: "Bảng 3.1 Danh mục các học phần...")
 TABLE_TITLE_PATTERN = re.compile(r"(?:^|\n)#+\s*(?:Bảng|BẢNG)\s*(\d+(?:\.\d+)?)\s*[.:]*\s*(.+?)(?:\n|$)", re.IGNORECASE)
 def _is_table_row(line: str) -> bool:
     s = line.strip()
     return s.startswith("|") and s.endswith("|") and s.count("|") >= 2
 def _is_separator(line: str) -> bool:
     if not _is_table_row(line):
         return False
     return not line.strip().replace("|", "").replace("-", "").replace(":", "").replace(" ", "")
 def _is_header(line: str) -> bool:
     if not _is_table_row(line):
         return False
     cells = [c.strip() for c in line.split("|") if c.strip()]
@@ -50,6 +54,7 @@ def _is_header(line: str) -> bool:
 def _extract_tables(text: str) -> Tuple[List[Tuple[str, List[str]]], str]:
     lines, tables, last_header, i = text.split("\n"), [], None, 0
     while i < len(lines) - 1:
@@ -73,7 +78,7 @@ def _extract_tables(text: str) -> Tuple[List[Tuple[str, List[str]]], str]:
         else:
             i += 1
-    # Replace tables with placeholders
     result, tbl_idx, i = [], 0, 0
     while i < len(lines):
         if tbl_idx < len(tables) and i < len(lines) - 1 and _is_table_row(lines[i]) and _is_separator(lines[i + 1]):
@@ -90,6 +95,7 @@ def _extract_tables(text: str) -> Tuple[List[Tuple[str, List[str]]], str]:
 def _split_table(header: str, rows: List[str], max_rows: int = TABLE_ROWS_PER_CHUNK) -> List[str]:
     if len(rows) <= max_rows:
         return [header + "\n".join(rows)]
@@ -98,26 +104,29 @@ def _split_table(header: str, rows: List[str], max_rows: int = TABLE_ROWS_PER_CH
         chunk_rows = rows[i:i + max_rows]
         chunks.append(chunk_rows)
-    # Merge last chunk if too small (< 5 rows)
     if len(chunks) > 1 and len(chunks[-1]) < 5:
         chunks[-2].extend(chunks[-1])
         chunks.pop()
     return [header + "\n".join(r) for r in chunks]
 _summary_client: Optional[OpenAI] = None
 def _get_summary_client() -> Optional[OpenAI]:
     global _summary_client
     if _summary_client is not None:
         return _summary_client
-    api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
     if not api_key:
-        print("SILICONFLOW_API_KEY not set. Table summarization disabled.")
         return None
-    _summary_client = OpenAI(api_key=api_key, base_url=SILICONFLOW_BASE_URL)
     return _summary_client
@@ -130,17 +139,17 @@ def _summarize_table(
     max_retries: int = 5,
     base_delay: float = 2.0
 ) -> str:
-    """Summarize a table with retry logic. Raises exception if all retries fail."""
     import time
     if not ENABLE_TABLE_SUMMARY:
-        raise RuntimeError("Table summarization is disabled but required. Set ENABLE_TABLE_SUMMARY = True")
     client = _get_summary_client()
     if client is None:
-        raise RuntimeError("SILICONFLOW_API_KEY not set. Cannot summarize tables.")
-    # Build table identifier string
     table_id_parts = []
     if table_number:
         table_id_parts.append(f"Bảng {table_number}")
@@ -149,7 +158,7 @@ def _summarize_table(
     if source_file:
         table_id_parts.append(f"từ file {source_file}")
-    table_identifier = " - ".join(table_id_parts) if table_id_parts else "Unknown table"
     prompt = f"""Tóm tắt ngắn gọn nội dung bảng sau bằng tiếng Việt.
@@ -179,20 +188,17 @@ Bảng:
             if summary.strip():
                 return summary.strip()
             else:
-                raise ValueError("Empty summary returned from API")
         except Exception as e:
             last_error = e
-            delay = base_delay * (2 ** attempt)  # Exponential backoff: 2, 4, 8, 16, 32 seconds
-            print(f"⚠️  Retry {attempt + 1}/{max_retries} for {table_identifier}: {e}")
-            print(f"   Waiting {delay:.1f}s before retry...")
             time.sleep(delay)
-    # All retries failed
-    raise RuntimeError(f"Failed to summarize {table_identifier} after {max_retries} retries. Last error: {last_error}")
 def _create_table_nodes(
@@ -203,11 +209,11 @@ def _create_table_nodes(
     table_title: str = "",
     source_file: str = ""
 ) -> List[TextNode]:
-    """Create table nodes. For large tables, creates parent+summary nodes with retry until success."""
-    # Count rows to decide if we should summarize
     row_count = table_text.count("\n")
-    # Add table info to metadata
     table_meta = {**metadata}
     if table_number:
         table_meta["table_number"] = table_number
@@ -215,10 +221,15 @@ def _create_table_nodes(
         table_meta["table_title"] = table_title
     if row_count < MIN_TABLE_ROWS_FOR_SUMMARY:
-        # Table too small, just return as-is (no summary needed)
         return [TextNode(text=table_text, metadata={**table_meta, "is_table": True})]
-    # Generate summary with retry logic (will raise exception if all retries fail)
     summary = _summarize_table(
         table_text,
         context_hint,
@@ -227,37 +238,36 @@ def _create_table_nodes(
         source_file=source_file
     )
-    # Create parent node (raw table - will NOT be embedded)
     parent_id = str(uuid.uuid4())
     parent_node = TextNode(
         text=table_text,
         metadata={
             **table_meta,
             "is_table": True,
-            "is_parent": True,  # Flag to skip embedding
             "node_id": parent_id,
         }
     )
     parent_node.id_ = parent_id
-    # Create summary node (will be embedded for search)
     summary_node = TextNode(
         text=summary,
         metadata={
             **table_meta,
             "is_table_summary": True,
-            "parent_id": parent_id,  # Link to parent
         }
     )
-    table_id = f"Bảng {table_number}" if table_number else "table"
-    print(f"✅ Created summary for {table_id} ({row_count} rows)")
     return [parent_node, summary_node]
 def _enrich_metadata(node: BaseNode, source_path: Path | None) -> None:
     if source_path:
         node.metadata.update({"source_path": str(source_path), "source_file": source_path.name})
     if "Học phần" in (text := node.get_content()) and (m := COURSE_PATTERN.search(text)):
@@ -265,6 +275,7 @@ def _enrich_metadata(node: BaseNode, source_path: Path | None) -> None:
 def _chunk_text(text: str, metadata: dict) -> List[BaseNode]:
     if len(text) <= CHUNK_SIZE:
         return [TextNode(text=text, metadata=metadata.copy())]
     return SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP).get_nodes_from_documents(
@@ -273,6 +284,7 @@ def _chunk_text(text: str, metadata: dict) -> List[BaseNode]:
 def _extract_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
     match = FRONTMATTER_PATTERN.match(text)
     if not match:
         return {}, text
@@ -286,22 +298,23 @@ def _extract_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
 def chunk_markdown(text: str, source_path: str | Path | None = None) -> List[BaseNode]:
     if not text or not text.strip():
         return []
     path = Path(source_path) if source_path else None
-    # Extract YAML frontmatter as metadata (không chunk)
     frontmatter_meta, text = _extract_frontmatter(text)
     tables, text_with_placeholders = _extract_tables(text)
-    # Base metadata from frontmatter + source path
     base_meta = {**frontmatter_meta}
     if path:
         base_meta.update({"source_path": str(path), "source_file": path.name})
-    # Parse by headings
     doc = Document(text=text_with_placeholders, metadata=base_meta.copy())
     heading_nodes = MarkdownNodeParser().get_nodes_from_documents([doc])
@@ -316,14 +329,13 @@ def chunk_markdown(text: str, source_path: str | Path | None = None) -> List[Bas
         last_end = 0
         for match in matches:
-            # Text before table
             before_text = content[last_end:match.start()].strip()
-            # Extract table number and title from text before table
             table_number = ""
             table_title = ""
             if before_text:
-                # Look for patterns like "## Bảng 3.1 Danh mục các học phần..."
                 title_match = TABLE_TITLE_PATTERN.search(before_text)
                 if title_match:
                     table_number = title_match.group(1).strip()
@@ -332,15 +344,15 @@ def chunk_markdown(text: str, source_path: str | Path | None = None) -> List[Bas
             if before_text and len(before_text) >= MIN_CHUNK_SIZE:
                 nodes.extend(_chunk_text(before_text, meta) if len(before_text) > CHUNK_SIZE else [TextNode(text=before_text, metadata=meta.copy())])
-            # Table chunks - using Small-to-Big pattern
             if (idx := int(match.group(1))) < len(tables):
                 header, rows = tables[idx]
                 table_chunks = _split_table(header, rows)
-                # Get context hint from header path
                 context_hint = meta.get("Header 1", "") or meta.get("section", "")
-                # Get source file for summary
                 source_file = meta.get("source_file", "") or (path.name if path else "")
                 for i, chunk in enumerate(table_chunks):
@@ -348,7 +360,7 @@ def chunk_markdown(text: str, source_path: str | Path | None = None) -> List[Bas
                     if len(table_chunks) > 1:
                         chunk_meta["table_part"] = f"{i+1}/{len(table_chunks)}"
-                    # Create parent + summary nodes if applicable
                     table_nodes = _create_table_nodes(
                         chunk,
                         chunk_meta,
@@ -361,11 +373,11 @@ def chunk_markdown(text: str, source_path: str | Path | None = None) -> List[Bas
             last_end = match.end()
-        # Text after table
         if (after := content[last_end:].strip()) and len(after) >= MIN_CHUNK_SIZE:
             nodes.extend(_chunk_text(after, meta) if len(after) > CHUNK_SIZE else [TextNode(text=after, metadata=meta.copy())])
     final: List[BaseNode] = []
     i = 0
     while i < len(nodes):
@@ -373,12 +385,12 @@ def chunk_markdown(text: str, source_path: str | Path | None = None) -> List[Bas
         curr_content = curr.get_content()
         curr_is_table = curr.metadata.get("is_table")
-        # Skip empty or whitespace-only nodes
         if not curr_content.strip():
             i += 1
             continue
-        # If current node is small non-table and there's a next node
         if not curr_is_table and len(curr_content) < MIN_CHUNK_SIZE and i + 1 < len(nodes):
             next_node = nodes[i + 1]
             next_is_table = next_node.metadata.get("is_table")
@@ -405,7 +417,8 @@ def chunk_markdown(text: str, source_path: str | Path | None = None) -> List[Bas
 def chunk_markdown_file(path: str | Path) -> List[BaseNode]:
     p = Path(path)
     if not p.exists():
-        raise FileNotFoundError(f"File not found: {p}")
     return chunk_markdown(p.read_text(encoding="utf-8"), source_path=p)

 from llama_index.core.node_parser import MarkdownNodeParser, SentenceSplitter
 from llama_index.core.schema import BaseNode, TextNode
+# Cấu hình chunking
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 150
 MIN_CHUNK_SIZE = 200
 TABLE_ROWS_PER_CHUNK = 15
+# Cấu hình Small-to-Big
 ENABLE_TABLE_SUMMARY = True
+MIN_TABLE_ROWS_FOR_SUMMARY = 0
+SUMMARY_MODEL = "openai/gpt-oss-120b"
+GROQ_BASE_URL = "https://api.groq.com/openai/v1"
+# Regex patterns
 COURSE_PATTERN = re.compile(r"Học\s*phần\s+(.+?)\s*\(\s*m[ãa]\s+([^\)]+)\)", re.I | re.DOTALL)
 TABLE_PLACEHOLDER = re.compile(r"__TBL_(\d+)__")
 HEADER_KEYWORDS = {'TT', 'STT', 'MÃ', 'TÊN', 'KHỐI', 'SỐ', 'ID', 'NO', '#'}
 FRONTMATTER_PATTERN = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.DOTALL)
 TABLE_TITLE_PATTERN = re.compile(r"(?:^|\n)#+\s*(?:Bảng|BẢNG)\s*(\d+(?:\.\d+)?)\s*[.:]*\s*(.+?)(?:\n|$)", re.IGNORECASE)
 def _is_table_row(line: str) -> bool:
+    """Kiểm tra dòng có phải là hàng trong bảng Markdown không."""
     s = line.strip()
     return s.startswith("|") and s.endswith("|") and s.count("|") >= 2
 def _is_separator(line: str) -> bool:
+    """Kiểm tra dòng có phải là separator của bảng (|---|---|)."""
     if not _is_table_row(line):
         return False
     return not line.strip().replace("|", "").replace("-", "").replace(":", "").replace(" ", "")
 def _is_header(line: str) -> bool:
+    """Kiểm tra dòng có phải là header của bảng không."""
     if not _is_table_row(line):
         return False
     cells = [c.strip() for c in line.split("|") if c.strip()]
 def _extract_tables(text: str) -> Tuple[List[Tuple[str, List[str]]], str]:
+    """Trích xuất bảng từ text và thay bằng placeholder."""
     lines, tables, last_header, i = text.split("\n"), [], None, 0
     while i < len(lines) - 1:
         else:
             i += 1
+    # Thay bảng bằng placeholder
     result, tbl_idx, i = [], 0, 0
     while i < len(lines):
         if tbl_idx < len(tables) and i < len(lines) - 1 and _is_table_row(lines[i]) and _is_separator(lines[i + 1]):
 def _split_table(header: str, rows: List[str], max_rows: int = TABLE_ROWS_PER_CHUNK) -> List[str]:
+    """Chia bảng lớn thành nhiều chunks nhỏ."""
     if len(rows) <= max_rows:
         return [header + "\n".join(rows)]
         chunk_rows = rows[i:i + max_rows]
         chunks.append(chunk_rows)
+    # Gộp chunk cuối nếu quá nhỏ (< 5 dòng)
     if len(chunks) > 1 and len(chunks[-1]) < 5:
         chunks[-2].extend(chunks[-1])
         chunks.pop()
     return [header + "\n".join(r) for r in chunks]
 _summary_client: Optional[OpenAI] = None
 def _get_summary_client() -> Optional[OpenAI]:
     """Return the module-wide client used for table summaries.

     The client is built on first use from the ``GROQ_API_KEY`` environment
     variable and ``GROQ_BASE_URL``, then reused on later calls. When the key is
     missing or blank, a notice is printed and None is returned.
     """
     global _summary_client
     if _summary_client is None:
         groq_key = os.getenv("GROQ_API_KEY", "").strip()
         if not groq_key:
             print("Chưa đặt GROQ_API_KEY. Tắt tính năng tóm tắt bảng.")
             return None
         _summary_client = OpenAI(api_key=groq_key, base_url=GROQ_BASE_URL)
     return _summary_client
     max_retries: int = 5,
     base_delay: float = 2.0
 ) -> str:
+    """Tóm tắt bảng bằng LLM với retry logic."""
     import time
     if not ENABLE_TABLE_SUMMARY:
+        raise RuntimeError("Tính năng tóm tắt bảng đã tắt. Đặt ENABLE_TABLE_SUMMARY = True")
     client = _get_summary_client()
     if client is None:
+        raise RuntimeError("Chưa đặt GROQ_API_KEY. Không thể tóm tắt bảng.")
+    # Tạo chuỗi định danh bảng
     table_id_parts = []
     if table_number:
         table_id_parts.append(f"Bảng {table_number}")
     if source_file:
         table_id_parts.append(f"từ file {source_file}")
+    table_identifier = " - ".join(table_id_parts) if table_id_parts else "Bảng không xác định"
     prompt = f"""Tóm tắt ngắn gọn nội dung bảng sau bằng tiếng Việt.
             if summary.strip():
                 return summary.strip()
             else:
+                raise ValueError("API trả về summary rỗng")
         except Exception as e:
             last_error = e
+            delay = base_delay * (2 ** attempt)  # Exponential backoff: 2, 4, 8, 16, 32 giây
+            print(f"Thử lại {attempt + 1}/{max_retries} cho {table_identifier}: {e}")
+            print(f"   Đợi {delay:.1f}s trước khi thử lại...")
             time.sleep(delay)
+    # Tất cả retry đều thất bại
+    raise RuntimeError(f"Không thể tóm tắt {table_identifier} sau {max_retries} lần thử. Lỗi cuối: {last_error}")
 def _create_table_nodes(
     table_title: str = "",
     source_file: str = ""
 ) -> List[TextNode]:
+    """Tạo nodes cho bảng. Bảng lớn sẽ có parent + summary node."""
+    # Đếm số dòng để quyết định có cần tóm tắt không
     row_count = table_text.count("\n")
+    # Thêm thông tin bảng vào metadata
     table_meta = {**metadata}
     if table_number:
         table_meta["table_number"] = table_number
         table_meta["table_title"] = table_title
     if row_count < MIN_TABLE_ROWS_FOR_SUMMARY:
+        # Bảng quá nhỏ, không cần tóm tắt
+        return [TextNode(text=table_text, metadata={**table_meta, "is_table": True})]
+    # Kiểm tra có thể tóm tắt không (cần API key)
+    if _get_summary_client() is None:
+        # Không có API key -> trả về node bảng đơn giản, không tóm tắt
         return [TextNode(text=table_text, metadata={**table_meta, "is_table": True})]
+    # Tạo summary với retry logic
     summary = _summarize_table(
         table_text,
         context_hint,
         source_file=source_file
     )
+    # Tạo parent node (bảng gốc - KHÔNG embed)
     parent_id = str(uuid.uuid4())
     parent_node = TextNode(
         text=table_text,
         metadata={
             **table_meta,
             "is_table": True,
+            "is_parent": True,  # Flag để bỏ qua embedding
             "node_id": parent_id,
         }
     )
     parent_node.id_ = parent_id
+    # Tạo summary node (SẼ được embed để search)
     summary_node = TextNode(
         text=summary,
         metadata={
             **table_meta,
             "is_table_summary": True,
+            "parent_id": parent_id,  # Link tới parent
         }
     )
+    table_id = f"Bảng {table_number}" if table_number else "bảng"
+    print(f"Đã tạo summary cho {table_id} ({row_count} dòng)")
     return [parent_node, summary_node]
 def _enrich_metadata(node: BaseNode, source_path: Path | None) -> None:
+    """Bổ sung metadata từ source path và trích xuất thông tin học phần."""
     if source_path:
         node.metadata.update({"source_path": str(source_path), "source_file": source_path.name})
     if "Học phần" in (text := node.get_content()) and (m := COURSE_PATTERN.search(text)):
 def _chunk_text(text: str, metadata: dict) -> List[BaseNode]:
+    """Chia text thành chunks theo kích thước cấu hình."""
     if len(text) <= CHUNK_SIZE:
         return [TextNode(text=text, metadata=metadata.copy())]
     return SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP).get_nodes_from_documents(
 def _extract_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
+    """Trích xuất YAML frontmatter từ đầu file."""
     match = FRONTMATTER_PATTERN.match(text)
     if not match:
         return {}, text
 def chunk_markdown(text: str, source_path: str | Path | None = None) -> List[BaseNode]:
+    """Chunk một file Markdown thành các nodes."""
     if not text or not text.strip():
         return []
     path = Path(source_path) if source_path else None
+    # Trích xuất YAML frontmatter làm metadata (không chunk)
     frontmatter_meta, text = _extract_frontmatter(text)
     tables, text_with_placeholders = _extract_tables(text)
+    # Metadata cơ bản từ frontmatter + source path
     base_meta = {**frontmatter_meta}
     if path:
         base_meta.update({"source_path": str(path), "source_file": path.name})
+    # Parse theo headings
     doc = Document(text=text_with_placeholders, metadata=base_meta.copy())
     heading_nodes = MarkdownNodeParser().get_nodes_from_documents([doc])
         last_end = 0
         for match in matches:
+            # Text trước bảng
             before_text = content[last_end:match.start()].strip()
+            # Trích xuất số bảng và tiêu đề từ text trước bảng
             table_number = ""
             table_title = ""
             if before_text:
                 title_match = TABLE_TITLE_PATTERN.search(before_text)
                 if title_match:
                     table_number = title_match.group(1).strip()
             if before_text and len(before_text) >= MIN_CHUNK_SIZE:
                 nodes.extend(_chunk_text(before_text, meta) if len(before_text) > CHUNK_SIZE else [TextNode(text=before_text, metadata=meta.copy())])
+            # Chunk bảng - sử dụng Small-to-Big pattern
             if (idx := int(match.group(1))) < len(tables):
                 header, rows = tables[idx]
                 table_chunks = _split_table(header, rows)
+                # Lấy context hint từ header path
                 context_hint = meta.get("Header 1", "") or meta.get("section", "")
+                # Lấy source file cho summary
                 source_file = meta.get("source_file", "") or (path.name if path else "")
                 for i, chunk in enumerate(table_chunks):
                     if len(table_chunks) > 1:
                         chunk_meta["table_part"] = f"{i+1}/{len(table_chunks)}"
+                    # Tạo parent + summary nodes nếu cần
                     table_nodes = _create_table_nodes(
                         chunk,
                         chunk_meta,
             last_end = match.end()
+        # Text sau bảng
         if (after := content[last_end:].strip()) and len(after) >= MIN_CHUNK_SIZE:
             nodes.extend(_chunk_text(after, meta) if len(after) > CHUNK_SIZE else [TextNode(text=after, metadata=meta.copy())])
+    # Gộp các node nhỏ với node kế tiếp
     final: List[BaseNode] = []
     i = 0
     while i < len(nodes):
         curr_content = curr.get_content()
         curr_is_table = curr.metadata.get("is_table")
+        # Bỏ qua node rỗng
         if not curr_content.strip():
             i += 1
             continue
+        # Nếu node hiện tại nhỏ và không phải bảng -> gộp với node sau
         if not curr_is_table and len(curr_content) < MIN_CHUNK_SIZE and i + 1 < len(nodes):
             next_node = nodes[i + 1]
             next_is_table = next_node.metadata.get("is_table")
 def chunk_markdown_file(path: str | Path) -> List[BaseNode]:
     """Read a Markdown file as UTF-8 and chunk its contents.

     The file's path is forwarded to ``chunk_markdown`` as ``source_path``.

     Raises:
         FileNotFoundError: if ``path`` does not exist.
     """
     md_path = Path(path)
     if md_path.exists():
         markdown = md_path.read_text(encoding="utf-8")
         return chunk_markdown(markdown, source_path=md_path)
     raise FileNotFoundError(f"Không tìm thấy file: {md_path}")

core/rag/embedding_model.py CHANGED Viewed

@@ -1,26 +1,30 @@
 from __future__ import annotations
 import os
 import logging
 from dataclasses import dataclass
 from typing import List, Sequence
 import numpy as np
 from openai import OpenAI
 from langchain_core.embeddings import Embeddings
-import time
 logger = logging.getLogger(__name__)
 @dataclass
 class EmbeddingConfig:
-    api_base_url: str = "https://api.siliconflow.com/v1"
-    model: str = "Qwen/Qwen3-Embedding-4B"
-    dimension: int = 2048
-    batch_size: int = 16
 _embed_config: EmbeddingConfig | None = None
 def get_embedding_config() -> EmbeddingConfig:
     global _embed_config
     if _embed_config is None:
         _embed_config = EmbeddingConfig()
@@ -28,26 +32,32 @@ def get_embedding_config() -> EmbeddingConfig:
 class QwenEmbeddings(Embeddings):
     def __init__(self, config: EmbeddingConfig | None = None):
         self.config = config or get_embedding_config()
         api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
         if not api_key:
-            raise ValueError("SILICONFLOW_API_KEY environment variable not set")
         self._client = OpenAI(
             api_key=api_key,
             base_url=self.config.api_base_url,
         )
-        logger.info(f"QwenEmbeddings initialized: {self.config.model}")
     def embed_query(self, text: str) -> List[float]:
         return self._embed_texts([text])[0]
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
         return self._embed_texts(texts)
     def _embed_texts(self, texts: Sequence[str]) -> List[List[float]]:
         if not texts:
             return []
@@ -55,9 +65,11 @@ class QwenEmbeddings(Embeddings):
         batch_size = self.config.batch_size
         max_retries = 3
         for i in range(0, len(texts), batch_size):
             batch = list(texts[i:i + batch_size])
             for attempt in range(max_retries):
                 try:
                     response = self._client.embeddings.create(
@@ -68,9 +80,10 @@ class QwenEmbeddings(Embeddings):
                         all_embeddings.append(item.embedding)
                     break
                 except Exception as e:
                     if "rate" in str(e).lower() and attempt < max_retries - 1:
-                        wait_time = 2 ** attempt  # 1s, 2s, 4s
-                        logger.warning(f"Rate limit hit, waiting {wait_time}s...")
                         time.sleep(wait_time)
                     else:
                         raise
@@ -78,9 +91,10 @@ class QwenEmbeddings(Embeddings):
         return all_embeddings
     def embed_texts_np(self, texts: Sequence[str]) -> np.ndarray:
         return np.asarray(self._embed_texts(list(texts)), dtype=np.float32)
-# Legacy alias
 SiliconFlowConfig = EmbeddingConfig
 get_config = get_embedding_config

 from __future__ import annotations
 import os
 import logging
+import time
 from dataclasses import dataclass
 from typing import List, Sequence
 import numpy as np
 from openai import OpenAI
 from langchain_core.embeddings import Embeddings
 logger = logging.getLogger(__name__)
 @dataclass
 class EmbeddingConfig:
     """Settings for the embedding model client."""
     # Base URL of the SiliconFlow API endpoint.
     api_base_url: str = "https://api.siliconflow.com/v1"
     # Name of the embedding model to request.
     model: str = "Qwen/Qwen3-Embedding-4B"
     # Number of dimensions of each embedding vector.
     dimension: int = 2048
     # Number of texts sent per batch.
     batch_size: int = 16
 _embed_config: EmbeddingConfig | None = None
 def get_embedding_config() -> EmbeddingConfig:
+    """Lấy cấu hình embedding (singleton pattern)."""
     global _embed_config
     if _embed_config is None:
         _embed_config = EmbeddingConfig()
 class QwenEmbeddings(Embeddings):
+    """Wrapper embedding model Qwen qua SiliconFlow API"""
     def __init__(self, config: EmbeddingConfig | None = None):
         """Create the embedding client.

         Uses ``config`` when given, otherwise the value returned by
         ``get_embedding_config()``. The API key is read from the
         ``SILICONFLOW_API_KEY`` environment variable.

         Raises:
             ValueError: if ``SILICONFLOW_API_KEY`` is unset or blank.
         """
         self.config = config if config else get_embedding_config()
         siliconflow_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
         if siliconflow_key == "":
             raise ValueError("Chưa đặt biến môi trường SILICONFLOW_API_KEY")
         endpoint = self.config.api_base_url
         self._client = OpenAI(api_key=siliconflow_key, base_url=endpoint)
         logger.info(f"Đã khởi tạo QwenEmbeddings: {self.config.model}")
     def embed_query(self, text: str) -> List[float]:
         """Embed a single query string (used for search) and return its vector."""
         vectors = self._embed_texts([text])
         return vectors[0]
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
         """Embed several documents (used when indexing), one vector per text."""
         vectors = self._embed_texts(texts)
         return vectors
     def _embed_texts(self, texts: Sequence[str]) -> List[List[float]]:
+        """Embed danh sách texts theo batch với retry logic."""
         if not texts:
             return []
         batch_size = self.config.batch_size
         max_retries = 3
+        # Xử lý theo batch
         for i in range(0, len(texts), batch_size):
             batch = list(texts[i:i + batch_size])
+            # Retry logic cho rate limit
             for attempt in range(max_retries):
                 try:
                     response = self._client.embeddings.create(
                         all_embeddings.append(item.embedding)
                     break
                 except Exception as e:
+                    # Nếu bị rate limit -> đợi rồi thử lại
                     if "rate" in str(e).lower() and attempt < max_retries - 1:
+                        wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s
+                        logger.warning(f"Bị rate limit, đợi {wait_time}s...")
                         time.sleep(wait_time)
                     else:
                         raise
         return all_embeddings
     def embed_texts_np(self, texts: Sequence[str]) -> np.ndarray:
         """Embed ``texts`` and return the vectors as a float32 NumPy array."""
         # The input is copied into a list before being handed to _embed_texts.
         vectors = self._embed_texts(list(texts))
         return np.asarray(vectors, dtype=np.float32)
+# Alias để tương thích ngược
 SiliconFlowConfig = EmbeddingConfig
 get_config = get_embedding_config

core/rag/generator.py CHANGED Viewed

@@ -5,7 +5,7 @@ if TYPE_CHECKING:
     from core.rag.retrival import Retriever
-# System prompt để sử dụng khi gọi LLM (export cho gradio/eval dùng)
 SYSTEM_PROMPT = """Bạn là Trợ lý học vụ Đại học Bách khoa Hà Nội.
 ## NGUYÊN TẮC:
@@ -16,6 +16,7 @@ SYSTEM_PROMPT = """Bạn là Trợ lý học vụ Đại học Bách khoa Hà N
 def build_context(results: List[Dict[str, Any]], max_chars: int = 8000) -> str:
     parts = []
     for i, r in enumerate(results, 1):
         meta = r.get("metadata", {})
@@ -30,7 +31,7 @@ def build_context(results: List[Dict[str, Any]], max_chars: int = 8000) -> str:
         issued_year = meta.get("issued_year", "")
         content = r.get("content", "").strip()
-        # Build metadata line
         meta_info = f"Nguồn: {source}"
         if header and header != "/":
             meta_info += f" | Mục: {header}"
@@ -53,16 +54,20 @@ def build_context(results: List[Dict[str, Any]], max_chars: int = 8000) -> str:
         parts.append(f"[TÀI LIỆU {i}]\n{meta_info}\n{content}")
     context = "\n---\n".join(parts)
     return context[:max_chars] if len(context) > max_chars else context
 def build_prompt(question: str, context: str) -> str:
     return f"{SYSTEM_PROMPT}\n\n## CONTEXT:\n{context}\n\n## CÂU HỎI: {question}\n\n## TRẢ LỜI:"
 class RAGContextBuilder:
     def __init__(self, retriever: "Retriever", max_context_chars: int = 8000):
         self._retriever = retriever
         self._max_context_chars = max_context_chars
@@ -73,9 +78,11 @@ class RAGContextBuilder:
         initial_k: int = 20,
         mode: str = "hybrid_rerank"
     ) -> Dict[str, Any]:
         results = self._retriever.flexible_search(question, k=k, initial_k=initial_k, mode=mode)
         if not results:
             return {
                 "results": [],
@@ -84,15 +91,17 @@ class RAGContextBuilder:
                 "prompt": "",
             }
         context_text = build_context(results, self._max_context_chars)
         prompt = build_prompt(question, context_text)
         return {
-            "results": results,
-            "contexts": [r.get("content", "")[:1000] for r in results],
-            "context_text": context_text,
-            "prompt": prompt,
         }
 RAGGenerator = RAGContextBuilder

     from core.rag.retrival import Retriever
+# System prompt cho LLM (export để gradio/eval dùng)
 SYSTEM_PROMPT = """Bạn là Trợ lý học vụ Đại học Bách khoa Hà Nội.
 ## NGUYÊN TẮC:
 def build_context(results: List[Dict[str, Any]], max_chars: int = 8000) -> str:
+    """Xây dựng context từ kết quả retrieval để đưa vào prompt."""
     parts = []
     for i, r in enumerate(results, 1):
         meta = r.get("metadata", {})
         issued_year = meta.get("issued_year", "")
         content = r.get("content", "").strip()
+        # Tạo dòng metadata
         meta_info = f"Nguồn: {source}"
         if header and header != "/":
             meta_info += f" | Mục: {header}"
         parts.append(f"[TÀI LIỆU {i}]\n{meta_info}\n{content}")
     context = "\n---\n".join(parts)
+    # Cắt ngắn nếu vượt quá giới hạn
     return context[:max_chars] if len(context) > max_chars else context
 def build_prompt(question: str, context: str) -> str:
+    """Join the system prompt, the context block and the question into one prompt string."""
     return f"{SYSTEM_PROMPT}\n\n## CONTEXT:\n{context}\n\n## CÂU HỎI: {question}\n\n## TRẢ LỜI:"
 class RAGContextBuilder:
+    """Kết hợp retrieval và context building thành một bước."""
     def __init__(self, retriever: "Retriever", max_context_chars: int = 8000):
+        """Khởi tạo với retriever và giới hạn context."""
         self._retriever = retriever
         self._max_context_chars = max_context_chars
         initial_k: int = 20,
         mode: str = "hybrid_rerank"
     ) -> Dict[str, Any]:
+        """Retrieve documents và chuẩn bị context + prompt cho LLM."""
+        # Tìm kiếm documents liên quan
         results = self._retriever.flexible_search(question, k=k, initial_k=initial_k, mode=mode)
+        # Không tìm thấy kết quả
         if not results:
             return {
                 "results": [],
                 "prompt": "",
             }
+        # Xây dựng context và prompt
         context_text = build_context(results, self._max_context_chars)
         prompt = build_prompt(question, context_text)
         return {
+            "results": results,                                          # Kết quả retrieval gốc
+            "contexts": [r.get("content", "")[:1000] for r in results],  # Context rút gọn (cho eval)
+            "context_text": context_text,                                # Context đầy đủ
+            "prompt": prompt,                                            # Prompt hoàn chỉnh
         }
+# Alias để tương thích ngược
 RAGGenerator = RAGContextBuilder

core/rag/retrival.py CHANGED Viewed

@@ -22,29 +22,30 @@ logger = logging.getLogger(__name__)
 class RetrievalMode(str, Enum):
-    """Retrieval modes."""
-    VECTOR_ONLY = "vector_only"
-    BM25_ONLY = "bm25_only"
-    HYBRID = "hybrid"
-    HYBRID_RERANK = "hybrid_rerank"
 @dataclass
 class RetrievalConfig:
-    rerank_api_base_url: str = "https://api.siliconflow.com/v1"
-    rerank_model: str = "Qwen/Qwen3-Reranker-4B"
-    rerank_top_n: int = 10
-    initial_k: int = 25  # Reduced to minimize reranker time
-    top_k: int = 5
-    vector_weight: float = 0.5
-    bm25_weight: float = 0.5
 _retrieval_config: RetrievalConfig | None = None
 def get_retrieval_config() -> RetrievalConfig:
     global _retrieval_config
     if _retrieval_config is None:
         _retrieval_config = RetrievalConfig()
@@ -52,6 +53,7 @@ def get_retrieval_config() -> RetrievalConfig:
 class SiliconFlowReranker(BaseDocumentCompressor):
     api_key: str = Field(default="")
     api_base_url: str = Field(default="")
     model: str = Field(default="")
@@ -66,9 +68,11 @@ class SiliconFlowReranker(BaseDocumentCompressor):
         query: str,
         callbacks: Optional[Callbacks] = None,
     ) -> Sequence[Document]:
         if not documents or not self.api_key:
             return list(documents)
         for attempt in range(3):
             try:
                 response = requests.post(
@@ -91,6 +95,7 @@ class SiliconFlowReranker(BaseDocumentCompressor):
                 if "results" not in data:
                     return list(documents)
                 reranked: List[Document] = []
                 for result in data["results"]:
                     doc = documents[result["index"]]
@@ -101,33 +106,36 @@ class SiliconFlowReranker(BaseDocumentCompressor):
                 return reranked
             except Exception as e:
                 if "rate" in str(e).lower() and attempt < 2:
                     time.sleep(2 ** attempt)
                 else:
-                    logger.error(f"Rerank error: {e}")
                     return list(documents)
         return list(documents)
 class Retriever:
     def __init__(self, vector_db: "ChromaVectorDB", use_reranker: bool = True):
         self._vector_db = vector_db
         self._config = get_retrieval_config()
         self._reranker: Optional[SiliconFlowReranker] = None
         self._vector_retriever = self._vector_db.vectorstore.as_retriever(
             search_kwargs={"k": self._config.initial_k}
         )
-        # Lazy-load BM25 - only initialize when needed
         self._bm25_retriever: Optional[BM25Retriever] = None
         self._bm25_initialized = False
         self._ensemble_retriever: Optional[EnsembleRetriever] = None
-        # BM25 cache path (persist to disk)
         from pathlib import Path
         persist_dir = getattr(self._vector_db.config, 'persist_dir', None)
         if persist_dir:
@@ -138,61 +146,57 @@ class Retriever:
         if use_reranker:
             self._reranker = self._init_reranker()
-        logger.info("Retriever initialized")
     def _save_bm25_cache(self, bm25: BM25Retriever) -> None:
-        """Save BM25 retriever to disk for fast loading."""
         if not self._bm25_cache_path:
             return
         try:
             import pickle
             with open(self._bm25_cache_path, 'wb') as f:
                 pickle.dump(bm25, f)
-            logger.info(f"BM25 cache saved to {self._bm25_cache_path}")
         except Exception as e:
-            logger.warning(f"Failed to save BM25 cache: {e}")
     def _load_bm25_cache(self) -> Optional[BM25Retriever]:
         if not self._bm25_cache_path or not self._bm25_cache_path.exists():
             return None
         try:
             import pickle
-            import time
             start = time.time()
             with open(self._bm25_cache_path, 'rb') as f:
                 bm25 = pickle.load(f)
             bm25.k = self._config.initial_k
-            logger.info(f"BM25 loaded from cache in {time.time() - start:.2f}s")
             return bm25
         except Exception as e:
-            logger.warning(f"Failed to load BM25 cache: {e}")
             return None
     def _init_bm25(self) -> Optional[BM25Retriever]:
         if self._bm25_initialized:
             return self._bm25_retriever
         self._bm25_initialized = True
-        # Try loading from cache first
         cached = self._load_bm25_cache()
         if cached:
             self._bm25_retriever = cached
             return cached
-        # Build from scratch
         try:
-            import time
             start = time.time()
-            logger.info("Building BM25 index from documents...")
             docs = self._vector_db.get_all_documents()
             if not docs:
-                logger.warning("No documents found for BM25")
                 return None
             lc_docs = [
@@ -203,19 +207,18 @@ class Retriever:
             bm25.k = self._config.initial_k
             self._bm25_retriever = bm25
-            logger.info(f"BM25 built with {len(docs)} docs in {time.time() - start:.2f}s")
-            # Save to cache for next time
             self._save_bm25_cache(bm25)
             return bm25
         except Exception as e:
-            logger.error(f"Failed to init BM25: {e}")
             return None
     def _get_ensemble_retriever(self) -> EnsembleRetriever:
-        """Get or create ensemble retriever (lazy-loaded)."""
         if self._ensemble_retriever is not None:
             return self._ensemble_retriever
@@ -226,14 +229,15 @@ class Retriever:
                 weights=[self._config.vector_weight, self._config.bm25_weight]
             )
         else:
             self._ensemble_retriever = EnsembleRetriever(
                 retrievers=[self._vector_retriever],
                 weights=[1.0]
             )
         return self._ensemble_retriever
     def _init_reranker(self) -> Optional[SiliconFlowReranker]:
         api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
         if not api_key:
             return None
@@ -245,7 +249,7 @@ class Retriever:
         )
     def _build_final(self):
-        """Build final retriever with reranker (lazy-loaded)."""
         ensemble = self._get_ensemble_retriever()
         if self._reranker:
             return ContextualCompressionRetriever(
@@ -254,21 +258,22 @@ class Retriever:
             )
         return ensemble
     @property
     def has_reranker(self) -> bool:
         return self._reranker is not None
     def _to_result(self, doc: Document, rank: int, **extra) -> Dict[str, Any]:
         metadata = doc.metadata or {}
         content = doc.page_content
-        # Small-to-Big: If this is a summary node, swap with parent (raw table)
         if metadata.get("is_table_summary") and metadata.get("parent_id"):
             parent = self._vector_db.get_parent_node(metadata["parent_id"])
             if parent:
                 content = parent.get("content", content)
-                # Merge metadata, keeping summary info for debugging
                 metadata = {
                     **parent.get("metadata", {}),
                     "original_summary": doc.page_content[:200],
@@ -283,10 +288,10 @@ class Retriever:
             **extra,
         }
     def vector_search(
         self, text: str, *, k: int | None = None, where: Optional[Dict[str, Any]] = None
     ) -> List[Dict[str, Any]]:
         if not text.strip():
             return []
         k = k or self._config.top_k
@@ -294,13 +299,12 @@ class Retriever:
         return [self._to_result(doc, i + 1, distance=score) for i, (doc, score) in enumerate(results)]
     def bm25_search(self, text: str, *, k: int | None = None) -> List[Dict[str, Any]]:
         if not text.strip():
             return []
         bm25 = self._init_bm25()  # Lazy-load BM25
         if not bm25:
             return self.vector_search(text, k=k)
         k = k or self._config.top_k
         bm25.k = k
         results = bm25.invoke(text)
@@ -309,9 +313,9 @@ class Retriever:
     def hybrid_search(
         self, text: str, *, k: int | None = None, initial_k: int | None = None
     ) -> List[Dict[str, Any]]:
         if not text.strip():
             return []
         k = k or self._config.top_k
         if initial_k:
             self._vector_retriever.search_kwargs["k"] = initial_k
@@ -319,7 +323,6 @@ class Retriever:
             if bm25:
                 bm25.k = initial_k
-        # Dùng ensemble_retriever (lazy-loaded, KHÔNG có reranker)
         ensemble = self._get_ensemble_retriever()
         results = ensemble.invoke(text)
         return [self._to_result(doc, i + 1) for i, doc in enumerate(results[:k])]
@@ -332,11 +335,9 @@ class Retriever:
         where: Optional[Dict[str, Any]] = None,
         initial_k: int | None = None,
     ) -> List[Dict[str, Any]]:
-        import time
         if not text.strip():
             return []
         k = k or self._config.top_k
         initial_k = initial_k or self._config.initial_k
@@ -350,16 +351,18 @@ class Retriever:
                 for i, doc in enumerate(results[:k])
             ]
-        # Build final retriever (lazy-loaded ensemble + reranker)
         if initial_k:
             self._vector_retriever.search_kwargs["k"] = initial_k
             bm25 = self._init_bm25()
             if bm25:
                 bm25.k = initial_k
         ensemble = self._get_ensemble_retriever()
         ensemble_results = ensemble.invoke(text)
         if self._reranker:
             results = self._reranker.compress_documents(ensemble_results, text)
         else:
@@ -370,8 +373,6 @@ class Retriever:
             for i, doc in enumerate(results[:k])
         ]
     def flexible_search(
         self,
         text: str,
@@ -381,9 +382,11 @@ class Retriever:
         initial_k: int | None = None,
         where: Optional[Dict[str, Any]] = None,
     ) -> List[Dict[str, Any]]:
         if not text.strip():
             return []
         if isinstance(mode, str):
             try:
                 mode = RetrievalMode(mode.lower())
@@ -393,6 +396,7 @@ class Retriever:
         k = k or self._config.top_k
         initial_k = initial_k or self._config.initial_k
         if mode == RetrievalMode.VECTOR_ONLY:
             return self.vector_search(text, k=k, where=where)
         elif mode == RetrievalMode.BM25_ONLY:
@@ -404,5 +408,5 @@ class Retriever:
         else:  # HYBRID_RERANK
             return self.search_with_rerank(text, k=k, where=where, initial_k=initial_k)
-    # Legacy alias
     query = vector_search

 class RetrievalMode(str, Enum):
+    """Supported retrieval modes (string-valued, so a mode can be built from its name)."""
+    VECTOR_ONLY = "vector_only"      # Vector similarity search only
+    BM25_ONLY = "bm25_only"          # BM25 keyword search only
+    HYBRID = "hybrid"                 # Vector + BM25 combined
+    HYBRID_RERANK = "hybrid_rerank"   # Hybrid search followed by reranking
 @dataclass
 class RetrievalConfig:
+    """Configuration values for the retrieval system."""
+    rerank_api_base_url: str = "https://api.siliconflow.com/v1"  # Reranker API base URL
+    rerank_model: str = "Qwen/Qwen3-Reranker-4B"                 # Reranker model name
+    rerank_top_n: int = 10                                        # Number of results kept after reranking
+    initial_k: int = 25                                           # Number of docs fetched initially
+    top_k: int = 5                                                # Number of final results
+    vector_weight: float = 0.5                                    # Ensemble weight of vector search
+    bm25_weight: float = 0.5                                      # Ensemble weight of BM25
 _retrieval_config: RetrievalConfig | None = None
 def get_retrieval_config() -> RetrievalConfig:
+    """Lấy cấu hình retrieval (singleton pattern)."""
     global _retrieval_config
     if _retrieval_config is None:
         _retrieval_config = RetrievalConfig()
 class SiliconFlowReranker(BaseDocumentCompressor):
+    """Reranker sử dụng SiliconFlow API để sắp xếp lại kết quả."""
     api_key: str = Field(default="")
     api_base_url: str = Field(default="")
     model: str = Field(default="")
         query: str,
         callbacks: Optional[Callbacks] = None,
     ) -> Sequence[Document]:
+        """Rerank documents dựa trên độ liên quan với query."""
         if not documents or not self.api_key:
             return list(documents)
+        # Retry logic với exponential backoff
         for attempt in range(3):
             try:
                 response = requests.post(
                 if "results" not in data:
                     return list(documents)
+                # Tạo danh sách documents đã rerank với score
                 reranked: List[Document] = []
                 for result in data["results"]:
                     doc = documents[result["index"]]
                 return reranked
             except Exception as e:
+                # Rate limit -> đợi rồi thử lại
                 if "rate" in str(e).lower() and attempt < 2:
                     time.sleep(2 ** attempt)
                 else:
+                    logger.error(f"Lỗi rerank: {e}")
                     return list(documents)
         return list(documents)
 class Retriever:
+    """Retriever chính hỗ trợ nhiều chế độ tìm kiếm."""
     def __init__(self, vector_db: "ChromaVectorDB", use_reranker: bool = True):
+        """Khởi tạo retriever với vector DB và reranker."""
         self._vector_db = vector_db
         self._config = get_retrieval_config()
         self._reranker: Optional[SiliconFlowReranker] = None
+        # Vector retriever từ ChromaDB
         self._vector_retriever = self._vector_db.vectorstore.as_retriever(
             search_kwargs={"k": self._config.initial_k}
         )
+        # Lazy-load BM25 - chỉ khởi tạo khi cần
         self._bm25_retriever: Optional[BM25Retriever] = None
         self._bm25_initialized = False
         self._ensemble_retriever: Optional[EnsembleRetriever] = None
+        # Đường dẫn cache BM25 (lưu vào disk)
         from pathlib import Path
         persist_dir = getattr(self._vector_db.config, 'persist_dir', None)
         if persist_dir:
         if use_reranker:
             self._reranker = self._init_reranker()
+        logger.info("Đã khởi tạo Retriever")
     def _save_bm25_cache(self, bm25: BM25Retriever) -> None:
+        """Pickle the BM25 retriever to the cache file.
+
+        Does nothing when no cache path is set. Any failure is logged as a
+        warning and swallowed, so saving is best-effort.
+        """
         if not self._bm25_cache_path:
             return
         try:
             import pickle
             with open(self._bm25_cache_path, 'wb') as f:
                 pickle.dump(bm25, f)
+            logger.info(f"Đã lưu BM25 cache vào {self._bm25_cache_path}")
         except Exception as e:
+            logger.warning(f"Không thể lưu BM25 cache: {e}")
     def _load_bm25_cache(self) -> Optional[BM25Retriever]:
+        """Load the pickled BM25 retriever from the cache file.
+
+        Returns None when no cache path is set, the file does not exist, or
+        loading fails (the failure is logged as a warning). On success the
+        retriever's ``k`` is reset to the configured ``initial_k``.
+
+        NOTE(review): ``pickle.load`` can execute arbitrary code while
+        unpickling; this presumably only reads files written by
+        ``_save_bm25_cache`` - confirm the cache directory is trusted.
+        """
         if not self._bm25_cache_path or not self._bm25_cache_path.exists():
             return None
         try:
             import pickle
             start = time.time()
             with open(self._bm25_cache_path, 'rb') as f:
                 bm25 = pickle.load(f)
             bm25.k = self._config.initial_k
+            logger.info(f"Đã tải BM25 từ cache trong {time.time() - start:.2f}s")
             return bm25
         except Exception as e:
+            logger.warning(f"Không thể tải BM25 cache: {e}")
             return None
     def _init_bm25(self) -> Optional[BM25Retriever]:
+        """Khởi tạo BM25 retriever (lazy-load với cache)."""
         if self._bm25_initialized:
             return self._bm25_retriever
         self._bm25_initialized = True
+        # Thử tải từ cache trước
         cached = self._load_bm25_cache()
         if cached:
             self._bm25_retriever = cached
             return cached
+        # Build từ đầu nếu không có cache
         try:
             start = time.time()
+            logger.info("Đang xây dựng BM25 index từ documents...")
             docs = self._vector_db.get_all_documents()
             if not docs:
+                logger.warning("Không tìm thấy documents cho BM25")
                 return None
             lc_docs = [
             bm25.k = self._config.initial_k
             self._bm25_retriever = bm25
+            logger.info(f"Đã xây dựng BM25 với {len(docs)} docs trong {time.time() - start:.2f}s")
+            # Lưu vào cache cho lần sau
             self._save_bm25_cache(bm25)
             return bm25
         except Exception as e:
+            logger.error(f"Không thể khởi tạo BM25: {e}")
             return None
     def _get_ensemble_retriever(self) -> EnsembleRetriever:
+        """Lấy ensemble retriever (vector + BM25)."""
         if self._ensemble_retriever is not None:
             return self._ensemble_retriever
                 weights=[self._config.vector_weight, self._config.bm25_weight]
             )
         else:
+            # Fallback về vector only
             self._ensemble_retriever = EnsembleRetriever(
                 retrievers=[self._vector_retriever],
                 weights=[1.0]
             )
         return self._ensemble_retriever
     def _init_reranker(self) -> Optional[SiliconFlowReranker]:
+        """Khởi tạo reranker nếu có API key."""
         api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
         if not api_key:
             return None
         )
     def _build_final(self):
+        """Build retriever cuối cùng (ensemble + reranker nếu có)."""
         ensemble = self._get_ensemble_retriever()
         if self._reranker:
             return ContextualCompressionRetriever(
             )
         return ensemble
     @property
     def has_reranker(self) -> bool:
+        """Return True when a reranker instance is set on this retriever."""
         return self._reranker is not None
     def _to_result(self, doc: Document, rank: int, **extra) -> Dict[str, Any]:
+        """Chuyển Document thành dict result, xử lý Small-to-Big."""
         metadata = doc.metadata or {}
         content = doc.page_content
+        # Small-to-Big: Nếu là summary node -> swap với parent (bảng gốc)
         if metadata.get("is_table_summary") and metadata.get("parent_id"):
             parent = self._vector_db.get_parent_node(metadata["parent_id"])
             if parent:
                 content = parent.get("content", content)
+                # Merge metadata, giữ lại info summary để debug
                 metadata = {
                     **parent.get("metadata", {}),
                     "original_summary": doc.page_content[:200],
             **extra,
         }
     def vector_search(
         self, text: str, *, k: int | None = None, where: Optional[Dict[str, Any]] = None
     ) -> List[Dict[str, Any]]:
+        """Tìm kiếm bằng vector similarity."""
         if not text.strip():
             return []
         k = k or self._config.top_k
         return [self._to_result(doc, i + 1, distance=score) for i, (doc, score) in enumerate(results)]
     def bm25_search(self, text: str, *, k: int | None = None) -> List[Dict[str, Any]]:
+        """Tìm kiếm bằng BM25 keyword matching."""
         if not text.strip():
             return []
         bm25 = self._init_bm25()  # Lazy-load BM25
         if not bm25:
             return self.vector_search(text, k=k)
         k = k or self._config.top_k
         bm25.k = k
         results = bm25.invoke(text)
     def hybrid_search(
         self, text: str, *, k: int | None = None, initial_k: int | None = None
     ) -> List[Dict[str, Any]]:
+        """Tìm kiếm hybrid (vector + BM25) không có rerank."""
         if not text.strip():
             return []
         k = k or self._config.top_k
         if initial_k:
             self._vector_retriever.search_kwargs["k"] = initial_k
             if bm25:
                 bm25.k = initial_k
         ensemble = self._get_ensemble_retriever()
         results = ensemble.invoke(text)
         return [self._to_result(doc, i + 1) for i, doc in enumerate(results[:k])]
         where: Optional[Dict[str, Any]] = None,
         initial_k: int | None = None,
     ) -> List[Dict[str, Any]]:
+        """Tìm kiếm hybrid + reranking để có kết quả tốt nhất."""
         if not text.strip():
             return []
         k = k or self._config.top_k
         initial_k = initial_k or self._config.initial_k
                 for i, doc in enumerate(results[:k])
             ]
+        # Cập nhật k cho initial fetch
         if initial_k:
             self._vector_retriever.search_kwargs["k"] = initial_k
             bm25 = self._init_bm25()
             if bm25:
                 bm25.k = initial_k
+        # Hybrid search
         ensemble = self._get_ensemble_retriever()
         ensemble_results = ensemble.invoke(text)
+        # Rerank nếu có
         if self._reranker:
             results = self._reranker.compress_documents(ensemble_results, text)
         else:
             for i, doc in enumerate(results[:k])
         ]
     def flexible_search(
         self,
         text: str,
         initial_k: int | None = None,
         where: Optional[Dict[str, Any]] = None,
     ) -> List[Dict[str, Any]]:
+        """Tìm kiếm linh hoạt với nhiều chế độ."""
         if not text.strip():
             return []
+        # Parse mode từ string
         if isinstance(mode, str):
             try:
                 mode = RetrievalMode(mode.lower())
         k = k or self._config.top_k
         initial_k = initial_k or self._config.initial_k
+        # Gọi method tương ứng theo mode
         if mode == RetrievalMode.VECTOR_ONLY:
             return self.vector_search(text, k=k, where=where)
         elif mode == RetrievalMode.BM25_ONLY:
         else:  # HYBRID_RERANK
             return self.search_with_rerank(text, k=k, where=where, initial_k=initial_k)
+    # Alias để tương thích ngược
     query = vector_search

core/rag/vector_store.py CHANGED Viewed

@@ -13,66 +13,76 @@ logger = logging.getLogger(__name__)
 @dataclass
 class ChromaConfig:
     def _default_persist_dir() -> str:
         repo_root = Path(__file__).resolve().parents[2]
         return str((repo_root / "data" / "chroma").resolve())
-    persist_dir: str = field(default_factory=_default_persist_dir)
-    collection_name: str = "hust_rag_collection"
 class ChromaVectorDB:
     def __init__(
         self,
         embedder: Any,
         config: ChromaConfig | None = None,
     ):
         self.embedder = embedder
         self.config = config or ChromaConfig()
         self._hasher = HashProcessor(verbose=False)
-        # Storage for parent nodes (not embedded, used for Small-to-Big retrieval)
-        # Persist to JSON file in same directory as ChromaDB
         self._parent_nodes_path = Path(self.config.persist_dir) / "parent_nodes.json"
         self._parent_nodes: Dict[str, Dict[str, Any]] = self._load_parent_nodes()
         self._vs = Chroma(
             collection_name=self.config.collection_name,
             embedding_function=self.embedder,
             persist_directory=self.config.persist_dir,
         )
-        logger.info(f"ChromaVectorDB initialized: {self.config.collection_name}")
     def _load_parent_nodes(self) -> Dict[str, Dict[str, Any]]:
         if self._parent_nodes_path.exists():
             try:
                 with open(self._parent_nodes_path, 'r', encoding='utf-8') as f:
                     data = json.load(f)
-                    logger.info(f"Loaded {len(data)} parent nodes from {self._parent_nodes_path}")
                     return data
             except Exception as e:
-                logger.warning(f"Failed to load parent nodes: {e}")
         return {}
     def _save_parent_nodes(self) -> None:
-        """Save parent nodes to JSON file."""
         try:
             self._parent_nodes_path.parent.mkdir(parents=True, exist_ok=True)
             with open(self._parent_nodes_path, 'w', encoding='utf-8') as f:
                 json.dump(self._parent_nodes, f, ensure_ascii=False, indent=2)
-            logger.info(f"Saved {len(self._parent_nodes)} parent nodes to {self._parent_nodes_path}")
         except Exception as e:
-            logger.warning(f"Failed to save parent nodes: {e}")
     @property
     def collection(self):
         return getattr(self._vs, "_collection", None)
     @property
     def vectorstore(self):
         return self._vs
     def _flatten_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
         out: Dict[str, Any] = {}
         for k, v in (metadata or {}).items():
             if v is None:
@@ -80,33 +90,33 @@ class ChromaVectorDB:
             if isinstance(v, (str, int, float, bool)):
                 out[str(k)] = v
             elif isinstance(v, (list, tuple, set, dict)):
                 out[str(k)] = json.dumps(v, ensure_ascii=False)
             else:
                 out[str(k)] = str(v)
         return out
     def _normalize_doc(self, doc: Any) -> Dict[str, Any]:
-        # Nếu đã là dict
         if isinstance(doc, dict):
             return doc
-        # Nếu là TextNode/BaseNode từ llama_index
         if hasattr(doc, "get_content") and hasattr(doc, "metadata"):
             return {
                 "content": doc.get_content(),
                 "metadata": dict(doc.metadata) if doc.metadata else {},
             }
-        # Nếu là Document từ langchain
         if hasattr(doc, "page_content") and hasattr(doc, "metadata"):
             return {
                 "content": doc.page_content,
                 "metadata": dict(doc.metadata) if doc.metadata else {},
             }
-        raise TypeError(f"Unsupported document type: {type(doc)}")
     def _to_documents(self, docs: Sequence[Any], ids: Sequence[str]) -> List[Document]:
         out: List[Document] = []
         for d, doc_id in zip(docs, ids):
             normalized = self._normalize_doc(d)
@@ -116,6 +126,7 @@ class ChromaVectorDB:
         return out
     def _doc_id(self, doc: Any) -> str:
         normalized = self._normalize_doc(doc)
         md = normalized.get("metadata") or {}
         key = {
@@ -133,13 +144,14 @@ class ChromaVectorDB:
         ids: Optional[Sequence[str]] = None,
         batch_size: int = 128,
     ) -> int:
         if not docs:
             return 0
         if ids is not None and len(ids) != len(docs):
-            raise ValueError("ids length must match docs length")
-        # Separate parent nodes (not embedded) from regular nodes
         regular_docs = []
         regular_ids = []
         parent_count = 0
@@ -150,7 +162,7 @@ class ChromaVectorDB:
             doc_id = ids[i] if ids else self._doc_id(d)
             if md.get("is_parent"):
-                # Store parent node separately (for Small-to-Big retrieval)
                 parent_id = md.get("node_id", doc_id)
                 self._parent_nodes[parent_id] = {
                     "id": parent_id,
@@ -163,12 +175,13 @@ class ChromaVectorDB:
                 regular_ids.append(doc_id)
         if parent_count > 0:
-            logger.info(f"Stored {parent_count} parent nodes (not embedded)")
-            self._save_parent_nodes()  # Persist to disk
         if not regular_docs:
             return parent_count
         bs = max(1, batch_size)
         total = 0
@@ -180,12 +193,13 @@ class ChromaVectorDB:
             try:
                 self._vs.add_documents(lc_docs, ids=batch_ids)
             except TypeError:
                 texts = [d.page_content for d in lc_docs]
                 metas = [d.metadata for d in lc_docs]
                 self._vs.add_texts(texts=texts, metadatas=metas, ids=batch_ids)
             total += len(batch)
-        logger.info(f"Added {total} documents to vector store")
         return total + parent_count
     def upsert_documents(
@@ -195,13 +209,14 @@ class ChromaVectorDB:
         ids: Optional[Sequence[str]] = None,
         batch_size: int = 128,
     ) -> int:
         if not docs:
             return 0
         if ids is not None and len(ids) != len(docs):
-            raise ValueError("ids length must match docs length")
-        # Separate parent nodes (not embedded) from regular nodes
         regular_docs = []
         regular_ids = []
         parent_count = 0
@@ -212,7 +227,7 @@ class ChromaVectorDB:
             doc_id = ids[i] if ids else self._doc_id(d)
             if md.get("is_parent"):
-                # Store parent node separately (for Small-to-Big retrieval)
                 parent_id = md.get("node_id", doc_id)
                 self._parent_nodes[parent_id] = {
                     "id": parent_id,
@@ -225,8 +240,8 @@ class ChromaVectorDB:
                 regular_ids.append(doc_id)
         if parent_count > 0:
-            logger.info(f"Stored {parent_count} parent nodes (not embedded)")
-            self._save_parent_nodes()  # Persist to disk
         if not regular_docs:
             return parent_count
@@ -234,9 +249,11 @@ class ChromaVectorDB:
         bs = max(1, batch_size)
         col = self.collection
         if col is None:
             return self.add_documents(regular_docs, ids=regular_ids, batch_size=bs) + parent_count
         total = 0
         for start in range(0, len(regular_docs), bs):
             batch = regular_docs[start : start + bs]
@@ -248,14 +265,16 @@ class ChromaVectorDB:
             col.upsert(ids=batch_ids, documents=texts, metadatas=metas, embeddings=embs)
             total += len(batch)
-        logger.info(f"Upserted {total} documents to vector store")
         return total + parent_count
     def count(self) -> int:
         col = self.collection
         return int(col.count()) if col else 0
     def get_all_documents(self, limit: int = 5000) -> List[Dict[str, Any]]:
         col = self.collection
         if col is None:
             return []
@@ -272,6 +291,7 @@ class ChromaVectorDB:
         return docs
     def delete_documents(self, ids: Sequence[str]) -> int:
         if not ids:
             return 0
@@ -280,12 +300,14 @@ class ChromaVectorDB:
             return 0
         col.delete(ids=list(ids))
-        logger.info(f"Deleted {len(ids)} documents from vector store")
         return len(ids)
     def get_parent_node(self, parent_id: str) -> Optional[Dict[str, Any]]:
         return self._parent_nodes.get(parent_id)
     @property
     def parent_nodes(self) -> Dict[str, Dict[str, Any]]:
         return self._parent_nodes

 @dataclass
 class ChromaConfig:
     """Settings for the Chroma-backed vector store."""
     def _default_persist_dir() -> str:
         """Build the default persistence path: <repo root>/data/chroma.

         Called without an instance, as the ``default_factory`` of ``persist_dir``.
         The repo root is taken to be two levels above this file's directory.
         """
         here = Path(__file__).resolve()
         chroma_dir = here.parents[2] / "data" / "chroma"
         return str(chroma_dir.resolve())
     # Directory where the database is persisted.
     persist_dir: str = field(default_factory=_default_persist_dir)
     # Name of the Chroma collection.
     collection_name: str = "hust_rag_collection"
 class ChromaVectorDB:
+    """Wrapper cho ChromaDB với hỗ trợ Small-to-Big retrieval."""
     def __init__(
         self,
         embedder: Any,
         config: ChromaConfig | None = None,
     ):
         """Wire up the embedder, settings, parent-node store and Chroma client.

         Args:
             embedder: Embedding object handed to Chroma as ``embedding_function``.
             config: Store settings; a default ``ChromaConfig`` is used when omitted.
         """
         cfg = config or ChromaConfig()
         self.embedder = embedder
         self.config = cfg
         self._hasher = HashProcessor(verbose=False)
         # Parent nodes are not embedded; they live in a JSON file inside the
         # persist directory (used for Small-to-Big retrieval).
         self._parent_nodes_path = Path(cfg.persist_dir) / "parent_nodes.json"
         self._parent_nodes: Dict[str, Dict[str, Any]] = self._load_parent_nodes()
         # LangChain Chroma vector store backed by the persist directory.
         self._vs = Chroma(
             persist_directory=cfg.persist_dir,
             collection_name=cfg.collection_name,
             embedding_function=embedder,
         )
         logger.info(f"Đã khởi tạo ChromaVectorDB: {cfg.collection_name}")
     def _load_parent_nodes(self) -> Dict[str, Dict[str, Any]]:
+        """Tải parent nodes từ file JSON."""
         if self._parent_nodes_path.exists():
             try:
                 with open(self._parent_nodes_path, 'r', encoding='utf-8') as f:
                     data = json.load(f)
+                    logger.info(f"Đã tải {len(data)} parent nodes từ {self._parent_nodes_path}")
                     return data
             except Exception as e:
+                logger.warning(f"Không thể tải parent nodes: {e}")
         return {}
     def _save_parent_nodes(self) -> None:
+        """Lưu parent nodes vào file JSON."""
         try:
             self._parent_nodes_path.parent.mkdir(parents=True, exist_ok=True)
             with open(self._parent_nodes_path, 'w', encoding='utf-8') as f:
                 json.dump(self._parent_nodes, f, ensure_ascii=False, indent=2)
+            logger.info(f"Đã lưu {len(self._parent_nodes)} parent nodes vào {self._parent_nodes_path}")
         except Exception as e:
+            logger.warning(f"Không thể lưu parent nodes: {e}")
     @property
     def collection(self):
+        """Lấy collection gốc của ChromaDB."""
         return getattr(self._vs, "_collection", None)
     @property
     def vectorstore(self):
+        """Lấy LangChain Chroma vectorstore."""
         return self._vs
     def _flatten_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
+        """Chuyển metadata phức tạp thành format ChromaDB hỗ trợ."""
         out: Dict[str, Any] = {}
         for k, v in (metadata or {}).items():
             if v is None:
             if isinstance(v, (str, int, float, bool)):
                 out[str(k)] = v
             elif isinstance(v, (list, tuple, set, dict)):
+                # Chuyển list/dict thành JSON string
                 out[str(k)] = json.dumps(v, ensure_ascii=False)
             else:
                 out[str(k)] = str(v)
         return out
     def _normalize_doc(self, doc: Any) -> Dict[str, Any]:
+        """Chuẩn hóa document từ nhiều format khác nhau thành dict."""
+        # Đã là dict
         if isinstance(doc, dict):
             return doc
+        # TextNode/BaseNode từ llama_index
         if hasattr(doc, "get_content") and hasattr(doc, "metadata"):
             return {
                 "content": doc.get_content(),
                 "metadata": dict(doc.metadata) if doc.metadata else {},
             }
+        # Document từ LangChain
         if hasattr(doc, "page_content") and hasattr(doc, "metadata"):
             return {
                 "content": doc.page_content,
                 "metadata": dict(doc.metadata) if doc.metadata else {},
             }
+        raise TypeError(f"Không hỗ trợ loại document: {type(doc)}")
     def _to_documents(self, docs: Sequence[Any], ids: Sequence[str]) -> List[Document]:
+        """Chuyển danh sách docs thành LangChain Documents."""
         out: List[Document] = []
         for d, doc_id in zip(docs, ids):
             normalized = self._normalize_doc(d)
         return out
     def _doc_id(self, doc: Any) -> str:
+        """Tạo ID duy nhất cho document dựa trên nội dung."""
         normalized = self._normalize_doc(doc)
         md = normalized.get("metadata") or {}
         key = {
         ids: Optional[Sequence[str]] = None,
         batch_size: int = 128,
     ) -> int:
+        """Thêm documents vào vector store."""
         if not docs:
             return 0
         if ids is not None and len(ids) != len(docs):
+            raise ValueError("Số lượng ids phải bằng số lượng docs")
+        # Tách parent nodes (không embed) khỏi regular nodes
         regular_docs = []
         regular_ids = []
         parent_count = 0
             doc_id = ids[i] if ids else self._doc_id(d)
             if md.get("is_parent"):
+                # Lưu parent node riêng (cho Small-to-Big)
                 parent_id = md.get("node_id", doc_id)
                 self._parent_nodes[parent_id] = {
                     "id": parent_id,
                 regular_ids.append(doc_id)
         if parent_count > 0:
+            logger.info(f"Đã lưu {parent_count} parent nodes (không embed)")
+            self._save_parent_nodes()
         if not regular_docs:
             return parent_count
+        # Thêm theo batch
         bs = max(1, batch_size)
         total = 0
             try:
                 self._vs.add_documents(lc_docs, ids=batch_ids)
             except TypeError:
+                # Fallback nếu add_documents không nhận ids
                 texts = [d.page_content for d in lc_docs]
                 metas = [d.metadata for d in lc_docs]
                 self._vs.add_texts(texts=texts, metadatas=metas, ids=batch_ids)
             total += len(batch)
+        logger.info(f"Đã thêm {total} documents vào vector store")
         return total + parent_count
     def upsert_documents(
         ids: Optional[Sequence[str]] = None,
         batch_size: int = 128,
     ) -> int:
+        """Upsert documents (thêm mới hoặc cập nhật nếu đã tồn tại)."""
         if not docs:
             return 0
         if ids is not None and len(ids) != len(docs):
+            raise ValueError("Số lượng ids phải bằng số lượng docs")
+        # Tách parent nodes khỏi regular nodes
         regular_docs = []
         regular_ids = []
         parent_count = 0
             doc_id = ids[i] if ids else self._doc_id(d)
             if md.get("is_parent"):
+                # Lưu parent node riêng
                 parent_id = md.get("node_id", doc_id)
                 self._parent_nodes[parent_id] = {
                     "id": parent_id,
                 regular_ids.append(doc_id)
         if parent_count > 0:
+            logger.info(f"Đã lưu {parent_count} parent nodes (không embed)")
+            self._save_parent_nodes()
         if not regular_docs:
             return parent_count
         bs = max(1, batch_size)
         col = self.collection
+        # Fallback nếu không có collection
         if col is None:
             return self.add_documents(regular_docs, ids=regular_ids, batch_size=bs) + parent_count
+        # Upsert theo batch
         total = 0
         for start in range(0, len(regular_docs), bs):
             batch = regular_docs[start : start + bs]
             col.upsert(ids=batch_ids, documents=texts, metadatas=metas, embeddings=embs)
             total += len(batch)
+        logger.info(f"Đã upsert {total} documents vào vector store")
         return total + parent_count
     def count(self) -> int:
+        """Đếm số documents trong collection."""
         col = self.collection
         return int(col.count()) if col else 0
     def get_all_documents(self, limit: int = 5000) -> List[Dict[str, Any]]:
+        """Lấy tất cả documents từ collection."""
         col = self.collection
         if col is None:
             return []
         return docs
     def delete_documents(self, ids: Sequence[str]) -> int:
+        """Xóa documents theo danh sách IDs."""
         if not ids:
             return 0
             return 0
         col.delete(ids=list(ids))
+        logger.info(f"Đã xóa {len(ids)} documents khỏi vector store")
         return len(ids)
     def get_parent_node(self, parent_id: str) -> Optional[Dict[str, Any]]:
+        """Lấy parent node theo ID (cho Small-to-Big)."""
         return self._parent_nodes.get(parent_id)
     @property
     def parent_nodes(self) -> Dict[str, Dict[str, Any]]:
+        """Lấy tất cả parent nodes."""
         return self._parent_nodes

evaluation/eval_utils.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import os
 import sys
 import re
@@ -20,31 +22,38 @@ from core.rag.generator import RAGGenerator
 def strip_thinking(text: str) -> str:
     """Remove every ``<think>...</think>`` block from LLM output.

     Whitespace directly after each block is removed too, and the final string
     is trimmed on both ends.
     """
     pattern = re.compile(r'<think>.*?</think>\s*', re.DOTALL)
     cleaned = pattern.sub('', text)
     return cleaned.strip()
 def load_csv_data(csv_path: str, sample_size: int = 0) -> tuple[list, list]:
     """Load question / ground-truth pairs from a CSV file.

     Only rows with non-empty ``question`` and ``ground_truth`` columns are kept.

     Args:
         csv_path: Path to a CSV file with a header row.
         sample_size: Keep only the first ``sample_size`` valid rows; ``0`` or a
             negative value keeps all of them.

     Returns:
         ``(questions, ground_truths)`` as two index-aligned lists of strings.
     """
     questions, ground_truths = [], []
     # ``utf-8-sig`` still reads plain UTF-8 but drops a leading BOM (e.g. from
     # spreadsheet exports); otherwise the first header becomes "\ufeffquestion"
     # and every row is silently skipped. ``newline=''`` is what the csv module
     # requires so newlines inside quoted fields are parsed correctly.
     with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
         for row in csv.DictReader(f):
             question, truth = row.get('question'), row.get('ground_truth')
             if question and truth:
                 questions.append(question)
                 ground_truths.append(truth)
     if sample_size > 0:
         questions = questions[:sample_size]
         ground_truths = ground_truths[:sample_size]
     return questions, ground_truths
 def init_rag() -> tuple[RAGGenerator, QwenEmbeddings, OpenAI]:
     """Build the RAG components used by the evaluation scripts.

     Returns:
         ``(rag, embeddings, llm_client)``: the ``RAGGenerator`` wired to a
         ``Retriever`` over the Chroma vector DB, the embedding model it uses,
         and an OpenAI-compatible client pointed at the SiliconFlow API.

     Raises:
         ValueError: If ``SILICONFLOW_API_KEY`` is unset or blank.
     """
     # Check the key first so a missing key fails immediately, before the
     # embedder and the vector DB are constructed.
     api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
     if not api_key:
         raise ValueError("Missing SILICONFLOW_API_KEY")
     embeddings = QwenEmbeddings(EmbeddingConfig())
     db = ChromaVectorDB(embedder=embeddings, config=ChromaConfig())
     retriever = Retriever(vector_db=db)
     rag = RAGGenerator(retriever=retriever)
     llm_client = OpenAI(api_key=api_key, base_url="https://api.siliconflow.com/v1", timeout=60.0)
     return rag, embeddings, llm_client
@@ -58,14 +67,18 @@ def generate_answers(
     retrieval_mode: str = "hybrid_rerank",
     max_workers: int = 8,
 ) -> tuple[list, list]:
     def process(idx_q):
         idx, q = idx_q
         try:
             prepared = rag.retrieve_and_prepare(q, mode=retrieval_mode)
             if not prepared["results"]:
                 return idx, "Không tìm thấy thông tin.", []
             resp = llm_client.chat.completions.create(
                 model=llm_model,
                 messages=[{"role": "user", "content": prepared["prompt"]}],
@@ -75,18 +88,20 @@ def generate_answers(
             answer = strip_thinking(resp.choices[0].message.content or "")
             return idx, answer, prepared["contexts"]
         except Exception as e:
-            print(f"  Q{idx+1} Error: {e}")
             return idx, "Không thể trả lời.", []
     n = len(questions)
     answers, contexts = [""] * n, [[] for _ in range(n)]
-    print(f"  Generating {n} answers...")
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = {executor.submit(process, (i, q)): i for i, q in enumerate(questions)}
         for i, future in enumerate(as_completed(futures), 1):
             idx, ans, ctx = future.result(timeout=120)
             answers[idx], contexts[idx] = ans, ctx
-            print(f"  [{i}/{n}] Done")
     return answers, contexts

+"""Các utility functions cho evaluation."""
 import os
 import sys
 import re
 def strip_thinking(text: str) -> str:
     """Strip all ``<think>...</think>`` blocks out of an LLM response.

     Any whitespace that follows a removed block goes with it, and the result
     is trimmed at both ends.
     """
     think_block = re.compile(r'<think>.*?</think>\s*', re.DOTALL)
     without_blocks = think_block.sub('', text)
     return without_blocks.strip()
 def load_csv_data(csv_path: str, sample_size: int = 0) -> tuple[list, list]:
     """Read questions and their ground-truth answers from a CSV file.

     Only rows with non-empty ``question`` and ``ground_truth`` columns are kept.

     Args:
         csv_path: Path to a CSV file with a header row.
         sample_size: Keep only the first ``sample_size`` valid rows; ``0`` or a
             negative value keeps all of them.

     Returns:
         ``(questions, ground_truths)`` as two index-aligned lists of strings.
     """
     questions, ground_truths = [], []
     # ``utf-8-sig`` still reads plain UTF-8 but drops a leading BOM (e.g. from
     # spreadsheet exports); otherwise the first header becomes "\ufeffquestion"
     # and every row is silently skipped. ``newline=''`` is what the csv module
     # requires so newlines inside quoted fields are parsed correctly.
     with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
         for row in csv.DictReader(f):
             question, truth = row.get('question'), row.get('ground_truth')
             if question and truth:
                 questions.append(question)
                 ground_truths.append(truth)
     # Cap the number of samples when a positive limit was requested.
     if sample_size > 0:
         questions = questions[:sample_size]
         ground_truths = ground_truths[:sample_size]
     return questions, ground_truths
 def init_rag() -> tuple[RAGGenerator, QwenEmbeddings, OpenAI]:
     """Create the RAG components needed for evaluation.

     Returns:
         ``(rag, embeddings, llm_client)``: the ``RAGGenerator`` wired to a
         ``Retriever`` over the Chroma vector DB, the embedding model it uses,
         and an OpenAI-compatible client pointed at the SiliconFlow API.

     Raises:
         ValueError: If ``SILICONFLOW_API_KEY`` is unset or blank.
     """
     # Check the key first so a missing key fails immediately, before the
     # embedder and the vector DB are constructed.
     api_key = os.getenv("SILICONFLOW_API_KEY", "").strip()
     if not api_key:
         raise ValueError("Chưa đặt SILICONFLOW_API_KEY")
     embeddings = QwenEmbeddings(EmbeddingConfig())
     db = ChromaVectorDB(embedder=embeddings, config=ChromaConfig())
     retriever = Retriever(vector_db=db)
     rag = RAGGenerator(retriever=retriever)
     # LLM client (OpenAI-compatible endpoint).
     llm_client = OpenAI(api_key=api_key, base_url="https://api.siliconflow.com/v1", timeout=60.0)
     return rag, embeddings, llm_client
     retrieval_mode: str = "hybrid_rerank",
     max_workers: int = 8,
 ) -> tuple[list, list]:
+    """Generate câu trả lời cho danh sách câu hỏi với parallel processing."""
     def process(idx_q):
+        """Xử lý một câu hỏi: retrieve + generate."""
         idx, q = idx_q
         try:
+            # Retrieve và chuẩn bị context
             prepared = rag.retrieve_and_prepare(q, mode=retrieval_mode)
             if not prepared["results"]:
                 return idx, "Không tìm thấy thông tin.", []
+            # Gọi LLM để generate answer
             resp = llm_client.chat.completions.create(
                 model=llm_model,
                 messages=[{"role": "user", "content": prepared["prompt"]}],
             answer = strip_thinking(resp.choices[0].message.content or "")
             return idx, answer, prepared["contexts"]
         except Exception as e:
+            print(f"  Q{idx+1} Lỗi: {e}")
             return idx, "Không thể trả lời.", []
     n = len(questions)
     answers, contexts = [""] * n, [[] for _ in range(n)]
+    print(f"  Đang generate {n} câu trả lời...")
+    # Xử lý song song với ThreadPoolExecutor
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = {executor.submit(process, (i, q)): i for i, q in enumerate(questions)}
         for i, future in enumerate(as_completed(futures), 1):
             idx, ans, ctx = future.result(timeout=120)
             answers[idx], contexts[idx] = ans, ctx
+            print(f"  [{i}/{n}] Xong")
     return answers, contexts

evaluation/ragas_eval.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import os
 import sys
 import json
@@ -21,33 +23,34 @@ from ragas.run_config import RunConfig
 from evaluation.eval_utils import load_csv_data, init_rag, generate_answers
-# Config
-CSV_PATH = "data/data.csv"
-OUTPUT_DIR = "evaluation/results"
-LLM_MODEL = os.getenv("EVAL_LLM_MODEL", "nex-agi/DeepSeek-V3.1-Nex-N1")
 API_BASE = "https://api.siliconflow.com/v1"
 def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank") -> dict:
     print(f"\n{'='*60}")
     print(f"RAGAS EVALUATION - Mode: {retrieval_mode}")
     print(f"{'='*60}")
-    # Init RAG components
     rag, embeddings, llm_client = init_rag()
-    # Load data
     questions, ground_truths = load_csv_data(str(REPO_ROOT / CSV_PATH), sample_size)
-    print(f"  Loaded {len(questions)} samples")
-    # Generate answers
     answers, contexts = generate_answers(
         rag, questions, llm_client,
         llm_model=LLM_MODEL,
         retrieval_mode=retrieval_mode,
     )
-    # Setup RAGAS evaluator
     api_key = os.getenv("SILICONFLOW_API_KEY", "")
     evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
         model=LLM_MODEL,
@@ -59,7 +62,7 @@ def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank")
     ))
     evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)
-    # Create dataset
     dataset = Dataset.from_dict({
         "question": questions,
         "answer": answers,
@@ -67,18 +70,18 @@ def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank")
         "ground_truth": ground_truths,
     })
-    # Run RAGAS evaluation
-    print("\n  Running RAGAS metrics...")
     results = evaluate(
         dataset=dataset,
         metrics=[
-            faithfulness,
-            answer_relevancy,
-            context_precision,
-            context_recall,
-            RougeScore(rouge_type='rouge1', mode='fmeasure'),
-            RougeScore(rouge_type='rouge2', mode='fmeasure'),
-            RougeScore(rouge_type='rougeL', mode='fmeasure'),
         ],
         llm=evaluator_llm,
         embeddings=evaluator_embeddings,
@@ -86,65 +89,37 @@ def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank")
         run_config=RunConfig(max_workers=8, timeout=600, max_retries=3),
     )
-    # Extract scores
     df = results.to_pandas()
     metric_cols = [c for c in df.columns if c not in ("question", "answer", "contexts", "ground_truth", "user_input", "response", "reference", "retrieved_contexts")]
     avg_scores = {}
     for col in metric_cols:
         values = df[col].dropna().tolist()
         if values:
             avg_scores[col] = sum(values) / len(values)
-    # Save results
     out_path = REPO_ROOT / OUTPUT_DIR
     out_path.mkdir(parents=True, exist_ok=True)
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    # JSON
-    json_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.json"
-    with open(json_path, 'w', encoding='utf-8') as f:
-        json.dump({
-            "timestamp": timestamp,
-            "retrieval_mode": retrieval_mode,
-            "sample_size": len(questions),
-            "avg_scores": avg_scores,
-            "samples": [
-                {"question": q, "answer": a, "ground_truth": gt, "contexts": ctx}
-                for q, a, gt, ctx in zip(questions, answers, ground_truths, contexts)
-            ]
-        }, f, ensure_ascii=False, indent=2)
-    # CSV
     csv_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.csv"
     with open(csv_path, 'w', encoding='utf-8') as f:
         f.write("retrieval_mode,sample_size," + ",".join(avg_scores.keys()) + "\n")
         f.write(f"{retrieval_mode},{len(questions)}," + ",".join(f"{v:.4f}" for v in avg_scores.values()) + "\n")
-    # Print summary
     print(f"\n{'='*60}")
-    print(f"RESULTS - {retrieval_mode} ({len(questions)} samples)")
     print(f"{'='*60}")
     for metric, score in avg_scores.items():
         bar = "#" * int(score * 20) + "-" * (20 - int(score * 20))
         print(f"  {metric:25} [{bar}] {score:.4f}")
-    print(f"\nSaved: {json_path}")
-    print(f"Saved: {csv_path}")
-    return avg_scores
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description="RAGAS Evaluation")
-    parser.add_argument("--samples", type=int, default=10, help="Number of samples")
-    parser.add_argument("--mode", type=str, default="hybrid_rerank",
-                        choices=["vector_only", "bm25_only", "hybrid", "hybrid_rerank", "all"])
-    args = parser.parse_args()
-    if args.mode == "all":
-        for mode in ["vector_only", "bm25_only", "hybrid", "hybrid_rerank"]:
-            run_evaluation(args.samples, mode)
-    else:
-        run_evaluation(args.samples, args.mode)

+"""Script đánh giá RAG bằng RAGAS framework."""
 import os
 import sys
 import json
 from evaluation.eval_utils import load_csv_data, init_rag, generate_answers
+# Cấu hình
+CSV_PATH = "data/data.csv"                                        # File dữ liệu test
+OUTPUT_DIR = "evaluation/results"                                  # Thư mục output
+LLM_MODEL = os.getenv("EVAL_LLM_MODEL", "nex-agi/DeepSeek-V3.1-Nex-N1")  # Model đánh giá
 API_BASE = "https://api.siliconflow.com/v1"
 def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank") -> dict:
+    """Chạy đánh giá RAGAS trên dữ liệu test."""
     print(f"\n{'='*60}")
     print(f"RAGAS EVALUATION - Mode: {retrieval_mode}")
     print(f"{'='*60}")
+    # Khởi tạo RAG components
     rag, embeddings, llm_client = init_rag()
+    # Tải dữ liệu test
     questions, ground_truths = load_csv_data(str(REPO_ROOT / CSV_PATH), sample_size)
+    print(f"  Đã tải {len(questions)} samples")
+    # Generate câu trả lời
     answers, contexts = generate_answers(
         rag, questions, llm_client,
         llm_model=LLM_MODEL,
         retrieval_mode=retrieval_mode,
     )
+    # Thiết lập RAGAS evaluator
     api_key = os.getenv("SILICONFLOW_API_KEY", "")
     evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
         model=LLM_MODEL,
     ))
     evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)
+    # Chuyển dữ liệu thành format Dataset
     dataset = Dataset.from_dict({
         "question": questions,
         "answer": answers,
         "ground_truth": ground_truths,
     })
+    # Chạy đánh giá RAGAS
+    print("\n  Đang chạy RAGAS metrics...")
     results = evaluate(
         dataset=dataset,
         metrics=[
+            faithfulness,           # Độ trung thực với context
+            answer_relevancy,       # Độ liên quan của câu trả lời
+            context_precision,      # Độ chính xác của context
+            context_recall,         # Độ bao phủ của context
+            RougeScore(rouge_type='rouge1', mode='fmeasure'),  # ROUGE-1
+            RougeScore(rouge_type='rouge2', mode='fmeasure'),  # ROUGE-2
+            RougeScore(rouge_type='rougeL', mode='fmeasure'),  # ROUGE-L
         ],
         llm=evaluator_llm,
         embeddings=evaluator_embeddings,
         run_config=RunConfig(max_workers=8, timeout=600, max_retries=3),
     )
+    # Trích xuất điểm số
     df = results.to_pandas()
     metric_cols = [c for c in df.columns if c not in ("question", "answer", "contexts", "ground_truth", "user_input", "response", "reference", "retrieved_contexts")]
+    # Tính điểm trung bình cho mỗi metric
     avg_scores = {}
     for col in metric_cols:
         values = df[col].dropna().tolist()
         if values:
             avg_scores[col] = sum(values) / len(values)
+    # Lưu kết quả
     out_path = REPO_ROOT / OUTPUT_DIR
     out_path.mkdir(parents=True, exist_ok=True)
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # Lưu file CSV (tóm tắt)
     csv_path = out_path / f"ragas_{retrieval_mode}_{timestamp}.csv"
     with open(csv_path, 'w', encoding='utf-8') as f:
         f.write("retrieval_mode,sample_size," + ",".join(avg_scores.keys()) + "\n")
         f.write(f"{retrieval_mode},{len(questions)}," + ",".join(f"{v:.4f}" for v in avg_scores.values()) + "\n")
+    # In kết quả
     print(f"\n{'='*60}")
+    print(f"KẾT QUẢ - {retrieval_mode} ({len(questions)} samples)")
     print(f"{'='*60}")
     for metric, score in avg_scores.items():
         bar = "#" * int(score * 20) + "-" * (20 - int(score * 20))
         print(f"  {metric:25} [{bar}] {score:.4f}")
+    print(f"\nĐã lưu: {json_path}")
+    print(f"Đã lưu: {csv_path}")
+    return avg_scores

scripts/build_data.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import sys
 from pathlib import Path
 from dotenv import find_dotenv, load_dotenv
-# Load .env file
 load_dotenv(find_dotenv(usecwd=True))
 REPO_ROOT = Path(__file__).resolve().parents[1]
@@ -14,12 +16,11 @@ from core.rag.embedding_model import EmbeddingConfig, QwenEmbeddings
 from core.rag.vector_store import ChromaConfig, ChromaVectorDB
 from core.hash_file.hash_file import HashProcessor
-# Global hash processor instance
 _hasher = HashProcessor(verbose=False)
 def get_db_file_info(db: ChromaVectorDB) -> dict:
-    """Get mapping of source_file -> set of doc IDs in DB."""
     docs = db.get_all_documents()
     file_to_ids = {}
     file_to_hash = {}
@@ -35,7 +36,7 @@ def get_db_file_info(db: ChromaVectorDB) -> dict:
                 file_to_ids[source] = set()
             file_to_ids[source].add(doc_id)
-            # Store first hash we see for this file
             if source not in file_to_hash and content_hash:
                 file_to_hash[source] = content_hash
@@ -43,64 +44,65 @@ def get_db_file_info(db: ChromaVectorDB) -> dict:
 def main():
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--force", action="store_true", help="Force rebuild all files")
-    parser.add_argument("--no-delete", action="store_true", help="Don't delete orphaned docs")
     args = parser.parse_args()
     print("=" * 60)
     print("BUILD HUST RAG DATABASE")
     print("=" * 60)
-    print("\n[1/5] Initializing embedder...")
     emb_cfg = EmbeddingConfig()
     emb = QwenEmbeddings(emb_cfg)
     print(f"  Model: {emb_cfg.model}")
     print(f"  API: {emb_cfg.api_base_url}")
-    print("\n[2/5] Initializing ChromaDB...")
     db_cfg = ChromaConfig()
     db = ChromaVectorDB(embedder=emb, config=db_cfg)
     old_count = db.count()
     print(f"  Collection: {db_cfg.collection_name}")
-    print(f"  Current docs: {old_count}")
-    # Get current state of DB
     db_info = {"ids": {}, "hashes": {}}
     if not args.force and old_count > 0:
-        print("\n  Scanning existing documents...")
         db_info = get_db_file_info(db)
-        print(f"  Found {len(db_info['ids'])} source files in DB")
-    # Scan markdown files
-    print("\n[3/5] Scanning markdown files...")
     root = REPO_ROOT / "data" / "data_process"
     md_files = sorted(root.rglob("*.md"))
-    print(f"  Found {len(md_files)} markdown files on disk")
-    # Build set of current file names
     current_files = {f.name for f in md_files}
     db_files = set(db_info["ids"].keys())
-    # Find files to delete (in DB but not on disk)
     files_to_delete = db_files - current_files
-    # Delete orphaned documents
     deleted_count = 0
     if files_to_delete and not args.no_delete:
-        print(f"\n[4/5] Cleaning up {len(files_to_delete)} deleted files...")
         for filename in files_to_delete:
             doc_ids = list(db_info["ids"].get(filename, []))
             if doc_ids:
                 db.delete_documents(doc_ids)
                 deleted_count += len(doc_ids)
-                print(f"  Deleted: {filename} ({len(doc_ids)} chunks)")
     else:
-        print("\n[4/5] No files to delete")
-    # Process files (add new, update changed)
-    print("\n[5/5] Processing markdown files...")
     total_added = 0
     total_updated = 0
     skipped = 0
@@ -110,16 +112,16 @@ def main():
         db_hash = db_info["hashes"].get(f.name, "")
         existing_ids = db_info["ids"].get(f.name, set())
-        # Skip if hash matches (file unchanged)
         if not args.force and db_hash == file_hash:
-            print(f"  [{i}/{len(md_files)}] {f.name}: SKIP (unchanged)")
             skipped += 1
             continue
-        # If file changed, delete old chunks first
         if existing_ids and not args.force:
             db.delete_documents(list(existing_ids))
-            print(f"  [{i}/{len(md_files)}] {f.name}: UPDATE (deleted {len(existing_ids)} old chunks)")
             is_update = True
         else:
             is_update = False
@@ -127,7 +129,7 @@ def main():
         try:
             docs = chunk_markdown_file(f)
             if docs:
-                # Add content_hash to metadata for future change detection
                 for doc in docs:
                     if hasattr(doc, 'metadata'):
                         doc.metadata["content_hash"] = file_hash
@@ -135,29 +137,29 @@ def main():
                         doc["metadata"]["content_hash"] = file_hash
                 n = db.upsert_documents(docs)
                 if is_update:
                     total_updated += n
-                    print(f"  [{i}/{len(md_files)}] {f.name}: +{n} new chunks")
                 else:
                     total_added += n
                     print(f"  [{i}/{len(md_files)}] {f.name}: {n} chunks")
             else:
-                print(f"  [{i}/{len(md_files)}] {f.name}: SKIP (no chunks)")
         except Exception as e:
-            print(f"  [{i}/{len(md_files)}] {f.name}: ERROR - {e}")
     new_count = db.count()
     print(f"\n{'=' * 60}")
-    print("SUMMARY")
     print("=" * 60)
-    print(f"  Deleted (orphaned): {deleted_count} chunks")
-    print(f"  Updated: {total_updated} chunks")
-    print(f"  Added new: {total_added} chunks")
-    print(f"  Skipped (unchanged): {skipped} files")
-    print(f"  DB count: {old_count} -> {new_count} ({new_count - old_count:+d})")
-    print("\nDONE!")
 if __name__ == "__main__":

+"""Script build ChromaDB từ markdown files với incremental update."""
 import sys
+import argparse
 from pathlib import Path
 from dotenv import find_dotenv, load_dotenv
 load_dotenv(find_dotenv(usecwd=True))
 REPO_ROOT = Path(__file__).resolve().parents[1]
 from core.rag.vector_store import ChromaConfig, ChromaVectorDB
 from core.hash_file.hash_file import HashProcessor
 _hasher = HashProcessor(verbose=False)
 def get_db_file_info(db: ChromaVectorDB) -> dict:
+    """Get info about the files already stored in the DB (IDs and hash)."""
     docs = db.get_all_documents()
     file_to_ids = {}  # source -> set of document IDs
     file_to_hash = {}  # source -> first non-empty content_hash seen for that source
                 file_to_ids[source] = set()
             file_to_ids[source].add(doc_id)
+            # Keep the first hash found for the file
             if source not in file_to_hash and content_hash:
                 file_to_hash[source] = content_hash
 def main():  # CLI entry point: sync markdown files under data/data_process into ChromaDB, skipping files whose hash is unchanged
+    parser = argparse.ArgumentParser(description="Build ChromaDB từ markdown files")
+    parser.add_argument("--force", action="store_true", help="Build lại tất cả files")
+    parser.add_argument("--no-delete", action="store_true", help="Không xóa docs orphaned")
     args = parser.parse_args()
     print("=" * 60)
     print("BUILD HUST RAG DATABASE")
     print("=" * 60)
+    # Step 1: Initialize the embedder
+    print("\n[1/5] Khởi tạo embedder...")
     emb_cfg = EmbeddingConfig()
     emb = QwenEmbeddings(emb_cfg)
     print(f"  Model: {emb_cfg.model}")
     print(f"  API: {emb_cfg.api_base_url}")
+    # Step 2: Initialize ChromaDB
+    print("\n[2/5] Khởi tạo ChromaDB...")
     db_cfg = ChromaConfig()
     db = ChromaVectorDB(embedder=emb, config=db_cfg)
     old_count = db.count()  # remembered so the summary can report the net change
     print(f"  Collection: {db_cfg.collection_name}")
+    print(f"  Số docs hiện tại: {old_count}")
+    # Fetch the current state of the DB
     db_info = {"ids": {}, "hashes": {}}  # stays empty under --force or when the collection is empty
     if not args.force and old_count > 0:
+        print("\n  Đang quét documents trong DB...")
         db_info = get_db_file_info(db)
+        print(f"  Tìm thấy {len(db_info['ids'])} source files trong DB")
+    # Step 3: Scan markdown files
+    print("\n[3/5] Quét markdown files...")
     root = REPO_ROOT / "data" / "data_process"
     md_files = sorted(root.rglob("*.md"))
+    print(f"  Tìm thấy {len(md_files)} markdown files trên disk")
+    # Compare files on disk vs. in the DB
     current_files = {f.name for f in md_files}  # NOTE(review): keyed by bare file name while rglob recurses; same-named files in different subfolders would collide - confirm names are unique
     db_files = set(db_info["ids"].keys())
+    # Find files to delete (in the DB but not on disk)
     files_to_delete = db_files - current_files
+    # Step 4: Delete orphaned docs
     deleted_count = 0
     if files_to_delete and not args.no_delete:
+        print(f"\n[4/5] Dọn dẹp {len(files_to_delete)} files đã xóa...")
         for filename in files_to_delete:
             doc_ids = list(db_info["ids"].get(filename, []))
             if doc_ids:
                 db.delete_documents(doc_ids)
                 deleted_count += len(doc_ids)
+                print(f"  Đã xóa: {filename} ({len(doc_ids)} chunks)")
     else:
+        print("\n[4/5] Không có files cần xóa")
+    # Step 5: Process markdown files (add new, update changed)
+    print("\n[5/5] Xử lý markdown files...")
     total_added = 0
     total_updated = 0
     skipped = 0
         db_hash = db_info["hashes"].get(f.name, "")  # NOTE(review): the loop header and the file_hash computation are elided in this diff view
         existing_ids = db_info["ids"].get(f.name, set())
+        # Skip if the hash matches (file unchanged)
         if not args.force and db_hash == file_hash:
+            print(f"  [{i}/{len(md_files)}] {f.name}: BỎ QUA (không đổi)")
             skipped += 1
             continue
+        # If the file changed, delete its old chunks first
         if existing_ids and not args.force:  # NOTE(review): under --force db_info is left empty, so old chunks are never deleted here; presumably relies on upsert_documents overwriting them - confirm
             db.delete_documents(list(existing_ids))
+            print(f"  [{i}/{len(md_files)}] {f.name}: CẬP NHẬT (xóa {len(existing_ids)} chunks cũ)")
             is_update = True
         else:
             is_update = False
         try:
             docs = chunk_markdown_file(f)
             if docs:
+                # Add the hash to the metadata so changes can be detected on the next run
                 for doc in docs:
                     if hasattr(doc, 'metadata'):
                         doc.metadata["content_hash"] = file_hash
                         doc["metadata"]["content_hash"] = file_hash
                 n = db.upsert_documents(docs)
                 if is_update:
                     total_updated += n
+                    print(f"  [{i}/{len(md_files)}] {f.name}: +{n} chunks mới")
                 else:
                     total_added += n
                     print(f"  [{i}/{len(md_files)}] {f.name}: {n} chunks")
             else:
+                print(f"  [{i}/{len(md_files)}] {f.name}: BỎ QUA (không có chunks)")
         except Exception as e:  # the error is only printed, not re-raised
+            print(f"  [{i}/{len(md_files)}] {f.name}: LỖI - {e}")
+    # Summary
     new_count = db.count()
     print(f"\n{'=' * 60}")
+    print("TỔNG KẾT")
     print("=" * 60)
+    print(f"  Đã xóa (orphaned): {deleted_count} chunks")
+    print(f"  Đã cập nhật: {total_updated} chunks")
+    print(f"  Đã thêm mới: {total_added} chunks")
+    print(f"  Đã bỏ qua: {skipped} files")
+    print(f"  Số docs trong DB: {old_count} -> {new_count} ({new_count - old_count:+d})")
+    print("\nHOÀN TẤT!")
 if __name__ == "__main__":

scripts/run_eval.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import sys
 from pathlib import Path
 REPO_ROOT = Path(__file__).resolve().parents[1]
@@ -7,18 +8,19 @@ if str(REPO_ROOT) not in sys.path:
 def main():
-    import argparse
-    parser = argparse.ArgumentParser(description="RAG Evaluation")
-    parser.add_argument("--samples", type=int, default=10, help="Number of samples (0 = all)")
     parser.add_argument("--mode", type=str, default="hybrid_rerank",
-                        choices=["vector_only", "bm25_only", "hybrid", "hybrid_rerank", "all"])
     args = parser.parse_args()
     from evaluation.ragas_eval import run_evaluation
     if args.mode == "all":
         print("\n" + "=" * 60)
-        print("RUNNING ALL RETRIEVAL MODES")
         print("=" * 60)
         for mode in ["vector_only", "bm25_only", "hybrid", "hybrid_rerank"]:
             run_evaluation(args.samples, mode)

 import sys
+import argparse
 from pathlib import Path
 REPO_ROOT = Path(__file__).resolve().parents[1]
 def main():  # CLI entry point: parse --samples / --mode and run the RAG evaluation
+    parser = argparse.ArgumentParser(description="Đánh giá RAG bằng RAGAS")
+    parser.add_argument("--samples", type=int, default=10, help="Số lượng samples (0 = tất cả)")
     parser.add_argument("--mode", type=str, default="hybrid_rerank",
+                        choices=["vector_only", "bm25_only", "hybrid", "hybrid_rerank", "all"],
+                        help="Chế độ retrieval")
     args = parser.parse_args()
     from evaluation.ragas_eval import run_evaluation  # deferred import: loaded only after the CLI args have been parsed
     if args.mode == "all":
+        # Run every retrieval mode
         print("\n" + "=" * 60)
+        print("CHẠY TẤT CẢ CÁC CHẾ ĐỘ RETRIEVAL")
         print("=" * 60)
         for mode in ["vector_only", "bm25_only", "hybrid", "hybrid_rerank"]:  # same list as the --mode choices, minus "all"
             run_evaluation(args.samples, mode)

test/parse_data_hash_test.py DELETED Viewed

@@ -1,102 +0,0 @@
-import os
-import sys
-import random
-import shutil
-from pathlib import Path
-# Ensure project root is on sys.path
-_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-if _PROJECT_ROOT not in sys.path:
-    sys.path.insert(0, _PROJECT_ROOT)
-from core.preprocessing.docling_processor import DoclingProcessor
-def get_random_local_pdf(source_dir: str):
-    if not os.path.exists(source_dir):
-        return None
-    files = [f for f in os.listdir(source_dir) if f.lower().endswith('.pdf')]
-    if not files:
-        return None
-    return os.path.join(source_dir, random.choice(files))
-def main(output_dir=None, use_ocr=False):
-    # Setup paths
-    source_dir = os.path.join(_PROJECT_ROOT, "data", "files")
-    if output_dir is None:
-        output_dir = os.path.join(_PROJECT_ROOT, "data", "test_output")
-    # Clean up old test output
-    if os.path.exists(output_dir):
-        shutil.rmtree(output_dir)
-    os.makedirs(output_dir, exist_ok=True)
-    print(f"Đang tìm file PDF để test...")
-    # 1. Thử lấy từ local data/files
-    file_path = get_random_local_pdf(source_dir)
-    if not file_path:
-        print(f"Không tìm thấy file PDF nào trong {source_dir}")
-        print("Hãy chạy 'python core/hash_file/hash_data_goc.py' để tải dữ liệu trước.")
-        return 1
-    filename = os.path.basename(file_path)
-    print(f"Đã chọn file test: {filename}")
-    print(f"Đường dẫn: {file_path}")
-    try:
-        # Khởi tạo processor
-        print("Khởi tạo DoclingProcessor...")
-        processor = DoclingProcessor(
-            output_dir=output_dir,
-            use_ocr=use_ocr,
-            timeout=None
-        )
-        # Parse file
-        print(f"Bắt đầu parse...")
-        result = processor.parse_document(file_path)
-        if result:
-            print(f"Test thành công!")
-            # Kiểm tra kết quả
-            output_files = os.listdir(output_dir)
-            md_files = [f for f in output_files if f.endswith('.md')]
-            if md_files:
-                print(f"File output: {md_files[0]}")
-                print(f"Thư mục output: {output_dir}")
-                # In thống kê sơ bộ cho Markdown
-                content_len = len(result)
-                preview = result[:200].replace('\n', ' ') + "..."
-                print(f" Kích thước: {content_len} ký tự")
-                print(f" Preview: {preview}")
-            else:
-                print("  Không tìm thấy file Markdown output dù hàm trả về kết quả.")
-        else:
-            print("Test thất bại: Hàm parse trả về None")
-            return 1
-        return 0
-    except Exception as e:
-        print(f"Lỗi ngoại lệ: {e}")
-        import traceback
-        traceback.print_exc()
-        return 1
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description="Test Docling với 1 file PDF ngẫu nhiên từ data/files")
-    parser.add_argument("--output", help="Thư mục output cho test (mặc định: data/test_output)")
-    parser.add_argument("--ocr", action="store_true", help="Bật OCR")
-    args = parser.parse_args()
-    sys.exit(main(
-        output_dir=args.output,
-        use_ocr=args.ocr
-    ))

test/test_chunk.py CHANGED Viewed

@@ -1,47 +1,57 @@
 import sys
 sys.path.insert(0, "/home/bahung/DoAn")
 from core.rag.chunk import chunk_markdown_file
 test_file = "data/data_process/chuong_trinh_dao_tao/1.1. Kỹ thuật Cơ điện tử.md"
 print("=" * 70)
 print(f" File: {test_file}")
 print("=" * 70)
-# Now returns List[BaseNode] instead of List[Dict]
 nodes = chunk_markdown_file(test_file)
-print(f"\n Total nodes: {len(nodes)}\n")
 for i, node in enumerate(nodes):
     content = node.get_content()
     metadata = node.metadata
     print(f"\n{'─' * 70}")
     print(f" NODE #{i}")
-    print(f"   Type: {type(node).__name__}")
-    print(f"   Length: {len(content)} chars")
     if metadata:
         print(f"   Metadata: {metadata}")
     print(f"{'─' * 70}")
     content_preview = content[:200]
     if len(content) > 200:
         content_preview += "..."
     print(content_preview)
 with open("test_chunk.md", "w", encoding="utf-8") as f:
     for i, node in enumerate(nodes):
         content = node.get_content()
         metadata = node.metadata
         f.write(f"# NODE {i}\n")
-        f.write(f"**Type:** {type(node).__name__}\n\n")
         f.write("**Metadata:**\n")
         for key, value in metadata.items():
             f.write(f"- {key}: {value}\n")
-        f.write("\n**Content:**\n")
         f.write(content)
         f.write("\n\n---\n\n")
-print("\n Done")

+"""Script test chunking markdown file."""
 import sys
 sys.path.insert(0, "/home/bahung/DoAn")  # NOTE(review): hard-coded absolute path to one checkout; must be edited to run on another machine
+from dotenv import load_dotenv
+load_dotenv()  # Load environment variables from .env
 from core.rag.chunk import chunk_markdown_file
+# Test file
 test_file = "data/data_process/chuong_trinh_dao_tao/1.1. Kỹ thuật Cơ điện tử.md"  # relative path; presumably resolved against the current working directory - confirm
 print("=" * 70)
 print(f" File: {test_file}")
 print("=" * 70)
+# Chunk the markdown file
 nodes = chunk_markdown_file(test_file)
+print(f"\n Tổng số nodes: {len(nodes)}\n")
+# Show details for each node
 for i, node in enumerate(nodes):
     content = node.get_content()
     metadata = node.metadata
     print(f"\n{'─' * 70}")
     print(f" NODE #{i}")
+    print(f"   Loại: {type(node).__name__}")
+    print(f"   Độ dài: {len(content)} ký tự")
     if metadata:
         print(f"   Metadata: {metadata}")
     print(f"{'─' * 70}")
+    # Content preview (at most 200 characters)
     content_preview = content[:200]
     if len(content) > 200:
         content_preview += "..."
     print(content_preview)
+# Save the results to a markdown file for easier viewing
 with open("test_chunk.md", "w", encoding="utf-8") as f:  # written relative to the current working directory, overwriting any previous dump
     for i, node in enumerate(nodes):
         content = node.get_content()
         metadata = node.metadata
         f.write(f"# NODE {i}\n")
+        f.write(f"**Loại:** {type(node).__name__}\n\n")
         f.write("**Metadata:**\n")
         for key, value in metadata.items():
             f.write(f"- {key}: {value}\n")
+        f.write("\n**Nội dung:**\n")
         f.write(content)
         f.write("\n\n---\n\n")
+print("\n Hoàn tất!")