Spaces:

Chatbot-TLU
/

M_chatbot

Sleeping

minh-4T committed on Apr 11

Commit

42ed92c

1 Parent(s): 2a96248

change model embedding + encoder

Files changed (2) hide show

core/chunking.py CHANGED Viewed

@@ -28,7 +28,7 @@ LIST_PATTERNS = [
     (r"(?m)^\s*•\s+", "<LIST_BULLET>"),
 ]
 def extract_and_protect_tables(text: str) -> Tuple[str, dict]:
     table_pattern = re.compile(r"(?:\|.*\|[\r\n]+)+")
     tables = {}
@@ -41,7 +41,7 @@ def extract_and_protect_tables(text: str) -> Tuple[str, dict]:
     protected_text = re.sub(table_pattern, replace_table, text)
     return protected_text, tables
 def protect_lists(text: str) -> Tuple[str, dict]:
     placeholders = {}
     protected = text
@@ -55,14 +55,14 @@ def protect_lists(text: str) -> Tuple[str, dict]:
     return protected, placeholders
 def restore_placeholders(text: str, placeholders: dict) -> str:
     restored = text
     for placeholder, original in placeholders.items():
         restored = restored.replace(placeholder, original)
     return restored
 def split_by_structure(text: str) -> List[str]:
     parts = [text]
@@ -91,7 +91,7 @@ def split_by_structure(text: str) -> List[str]:
     return [part for part in parts if part.strip()]
 def smart_chunking(docs: List) -> List:
     logger.info("Chunking theo cau truc + do dai...")
     length_splitter = RecursiveCharacterTextSplitter(

     (r"(?m)^\s*•\s+", "<LIST_BULLET>"),
 ]
+# Tách và thêm các thẻ <table> để bảo vệ cấu trúc bảng khỏi bị chia cắt trong quá trình chunking.
 def extract_and_protect_tables(text: str) -> Tuple[str, dict]:
     table_pattern = re.compile(r"(?:\|.*\|[\r\n]+)+")
     tables = {}
     protected_text = re.sub(table_pattern, replace_table, text)
     return protected_text, tables
+# Bảo vệ các phần tử của danh sách khỏi bị chia cắt trong quá trình chunking
 def protect_lists(text: str) -> Tuple[str, dict]:
     placeholders = {}
     protected = text
     return protected, placeholders
+# Khôi phục các phần tử được bảo vệ về nội dung gốc bằng cách thay thế các placeholder
 def restore_placeholders(text: str, placeholders: dict) -> str:
     restored = text
     for placeholder, original in placeholders.items():
         restored = restored.replace(placeholder, original)
     return restored
+# Tách văn bản dựa trên cấu trúc được xây dựng từ đầu
 def split_by_structure(text: str) -> List[str]:
     parts = [text]
     return [part for part in parts if part.strip()]
+# Hàm chính thực hiện chunking thông minh
 def smart_chunking(docs: List) -> List:
     logger.info("Chunking theo cau truc + do dai...")
     length_splitter = RecursiveCharacterTextSplitter(

core/config.py CHANGED Viewed

@@ -39,14 +39,14 @@ GEMINI_API_KEYS = os.getenv('GEMINI_API_KEYS', '').strip()
 # Name models
 LLM_MODEL = os.getenv('LLM_MODEL', 'llama-3.1-70b-versatile')
 FAST_LLM_MODEL = os.getenv('FAST_LLM_MODEL', 'llama-3.1-8b-instant')
-EMBED_MODEL = os.getenv('EMBED_MODEL', 'BAAI/bge-m3')
-CROSS_ENCODER_MODEL = os.getenv('CROSS_ENCODER_MODEL', 'BAAI/bge-reranker-v2-m3')
 # Chunking and retrieval settings
 CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', '800'))
 CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP', '150'))
-TOP_K_RESULTS = int(os.getenv('TOP_K_RESULTS', '10'))
-FINAL_TOP_K = int(os.getenv('FINAL_TOP_K', '5'))
 QDRANT_COLLECTION = os.getenv('QDRANT_COLLECTION', 'rag_docs')
 DOCUMENTS_DATABASE_URL = os.getenv('DOCUMENTS_DATABASE_URL', _default_documents_db_url())

 # Name models
 LLM_MODEL = os.getenv('LLM_MODEL', 'llama-3.1-70b-versatile')
 FAST_LLM_MODEL = os.getenv('FAST_LLM_MODEL', 'llama-3.1-8b-instant')
+EMBED_MODEL = os.getenv('EMBED_MODEL', 'bkai-foundation-models/vietnamese-bi-encoder')
+CROSS_ENCODER_MODEL = os.getenv('CROSS_ENCODER_MODEL', 'itdainb/PhoRanker')
 # Chunking and retrieval settings
 CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', '800'))
 CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP', '150'))
+TOP_K_RESULTS = int(os.getenv('TOP_K_RESULTS', '8'))
+FINAL_TOP_K = int(os.getenv('FINAL_TOP_K', '3'))
 QDRANT_COLLECTION = os.getenv('QDRANT_COLLECTION', 'rag_docs')
 DOCUMENTS_DATABASE_URL = os.getenv('DOCUMENTS_DATABASE_URL', _default_documents_db_url())