Spaces:

NguyenQuocVy2004
/

chatbot-lichsu

Running

App Files Files Community

Nguyễn Quốc Vỹ committed on Mar 20

Commit

47738d8

1 Parent(s): ab66a23

Tối ưu logic index: chỉ index chunk mới, bỏ qua tài liệu/chunk đã tồn tại

Browse files

Files changed (4) hide show

backend/admin_services.py +13 -2
backend/rag_chain_pg.py +3 -0
data_processing/dynamic_indexing.py +32 -5
data_processing/indexing.py +35 -7

backend/admin_services.py CHANGED Viewed

@@ -17,6 +17,8 @@ from backend import db
 from backend.auth import is_admin
 from backend.runtime_paths import PDF_DIR
 from backend.db_sync import schedule_pdf_upload, schedule_pdf_delete, schedule_vector_sync
 # ======================== UserService ========================
@@ -109,8 +111,10 @@ def create_system_doc(file_path: str, filename: str = None) -> tuple[bool, str]:
         else:
             shutil.copy2(file_path, dest)
         db.insert_tai_lieu_he_thong(ma_tai_lieu=ma, ten_file=filename, duong_dan=os.path.abspath(dest))
         schedule_pdf_upload(dest, filename)
-        return True, "Đã thêm tài liệu. Hãy chạy 'Tái lập chỉ mục' để cập nhật RAG."
     except Exception as e:
         return False, str(e)
@@ -133,8 +137,15 @@ def update_system_doc(ma_tai_lieu: str, file_path: str = None, ten_file: str = N
             return False, str(e)
     # Cập nhật tên trong DB nếu đổi tên (cần hàm update trong db - hiện chỉ có insert upsert)
     db.insert_tai_lieu_he_thong(ma_tai_lieu=ma_tai_lieu, ten_file=new_name, duong_dan=old_path)
     schedule_pdf_upload(old_path, new_name)
-    return True, "Đã cập nhật. Chạy 'Tái lập chỉ mục' nếu đã thay file."
 def delete_system_doc(ma_tai_lieu: str) -> tuple[bool, str]:

 from backend.auth import is_admin
 from backend.runtime_paths import PDF_DIR
 from backend.db_sync import schedule_pdf_upload, schedule_pdf_delete, schedule_vector_sync
+from data_processing.dynamic_indexing import add_pdf_file
+from data_processing.indexing import delete_chunks_by_source
 # ======================== UserService ========================
         else:
             shutil.copy2(file_path, dest)
         db.insert_tai_lieu_he_thong(ma_tai_lieu=ma, ten_file=filename, duong_dan=os.path.abspath(dest))
+        # Index ngay sau khi upload để hỏi đáp dùng được luôn.
+        chunks_added = add_pdf_file(dest)
         schedule_pdf_upload(dest, filename)
+        return True, f"Đã thêm tài liệu và index {chunks_added} chunks."
     except Exception as e:
         return False, str(e)
             return False, str(e)
     # Cập nhật tên trong DB nếu đổi tên (cần hàm update trong db - hiện chỉ có insert upsert)
     db.insert_tai_lieu_he_thong(ma_tai_lieu=ma_tai_lieu, ten_file=new_name, duong_dan=old_path)
+    # Khi thay file, cần cập nhật vector ngay để tránh dùng dữ liệu cũ.
+    if file_path:
+        delete_chunks_by_source(new_name)
+        chunks_added = add_pdf_file(old_path)
+        msg = f"Đã cập nhật và index lại {chunks_added} chunks."
+    else:
+        msg = "Đã cập nhật metadata tài liệu."
     schedule_pdf_upload(old_path, new_name)
+    return True, msg
 def delete_system_doc(ma_tai_lieu: str) -> tuple[bool, str]:

backend/rag_chain_pg.py CHANGED Viewed

@@ -1058,6 +1058,8 @@ def process_uploaded_pdf(uploaded_file, user_id=None) -> dict:
         documents = [{"content": text.strip(), "source": filename}]
         chunks_added = add_new_documents(documents)
         print(f"[PDF] ✅ Indexed {chunks_added} chunks from {filename}")
         if user_id:
             ma_tai_lieu = str(uuid.uuid4())
@@ -1074,6 +1076,7 @@ def process_uploaded_pdf(uploaded_file, user_id=None) -> dict:
             "filename": filename,
             "text": text,
             "chunks_count": chunks_added,
         }
     except Exception as e:

         documents = [{"content": text.strip(), "source": filename}]
         chunks_added = add_new_documents(documents)
         print(f"[PDF] ✅ Indexed {chunks_added} chunks from {filename}")
+        global _source_cache
+        _source_cache = None
         if user_id:
             ma_tai_lieu = str(uuid.uuid4())
             "filename": filename,
             "text": text,
             "chunks_count": chunks_added,
+            "already_indexed": False,
         }
     except Exception as e:

data_processing/dynamic_indexing.py CHANGED Viewed

@@ -70,19 +70,46 @@ def add_new_documents(documents: list) -> int:
     batch_size = 500
     total = len(documents_list)
     for start in range(0, total, batch_size):
         end = min(start + batch_size, total)
         collection.upsert(
-            documents=documents_list[start:end],
-            metadatas=metadatas_list[start:end],
-            ids=ids_list[start:end],
         )
     embedding_fn.set_mode("query")
-    print(f"✅ Đã thêm {total} chunks mới vào ChromaDB")
     print(f"📊 Tổng chunks hiện tại: {collection.count()}")
-    return total
 def add_pdf_file(filepath: str) -> int:

     batch_size = 500
     total = len(documents_list)
+    skipped_existing = 0
+    inserted_new = 0
     for start in range(0, total, batch_size):
         end = min(start + batch_size, total)
+        batch_ids = ids_list[start:end]
+        existing = collection.get(ids=batch_ids, include=[])
+        existing_ids = set(existing.get("ids", []) if existing else [])
+        filtered_docs = []
+        filtered_metas = []
+        filtered_ids = []
+        for doc, meta, chunk_id in zip(
+            documents_list[start:end],
+            metadatas_list[start:end],
+            batch_ids,
+        ):
+            if chunk_id in existing_ids:
+                skipped_existing += 1
+                continue
+            filtered_docs.append(doc)
+            filtered_metas.append(meta)
+            filtered_ids.append(chunk_id)
+        if not filtered_ids:
+            continue
         collection.upsert(
+            documents=filtered_docs,
+            metadatas=filtered_metas,
+            ids=filtered_ids,
         )
+        inserted_new += len(filtered_ids)
     embedding_fn.set_mode("query")
+    print(f"✅ Đã thêm {inserted_new} chunks mới vào ChromaDB")
+    if skipped_existing:
+        print(f"⏭️ Bỏ qua {skipped_existing} chunks đã tồn tại")
     print(f"📊 Tổng chunks hiện tại: {collection.count()}")
+    return inserted_new
 def add_pdf_file(filepath: str) -> int:

data_processing/indexing.py CHANGED Viewed

@@ -157,11 +157,10 @@ def create_vector_database(chunks: List[Dict]):
         print("❌ Không có chunks để index!")
         return
     embedding_fn = get_embedding_function()
     embedding_fn.set_mode("passage")
-    collection = get_collection()
     documents = []
     metadatas = []
     ids = []
@@ -191,19 +190,46 @@ def create_vector_database(chunks: List[Dict]):
     batch_size = 500
     total = len(documents)
     for start in range(0, total, batch_size):
         end = min(start + batch_size, total)
         collection.upsert(
-            documents=documents[start:end],
-            metadatas=metadatas[start:end],
-            ids=ids[start:end]
         )
-        print(f"  ✅ Đã index {end}/{total} chunks")
     embedding_fn.set_mode("query")
-    print(f"\n✅ Tổng cộng {total} chunks đã được index vào ChromaDB")
     print(f"📁 Dữ liệu lưu tại: {CHROMA_PERSIST_DIR}")
     print(f"🧠 Embedding model: {EMBEDDING_MODEL}")
@@ -215,6 +241,8 @@ def search(query: str, top_k: int = 5, max_distance: float = 0.8) -> List[Dict]:
     max_distance: ngưỡng tối đa, chỉ trả về kết quả có distance < max_distance.
     """
     collection = get_collection()
     if collection.count() == 0:
         print("[Search] ⚠️ Database rỗng! Chạy run_pipeline.py trước.")

         print("❌ Không có chunks để index!")
         return
+    collection = get_collection()
     embedding_fn = get_embedding_function()
     embedding_fn.set_mode("passage")
     documents = []
     metadatas = []
     ids = []
     batch_size = 500
     total = len(documents)
+    skipped_existing = 0
+    inserted_new = 0
     for start in range(0, total, batch_size):
         end = min(start + batch_size, total)
+        batch_ids = ids[start:end]
+        existing = collection.get(ids=batch_ids, include=[])
+        existing_ids = set(existing.get("ids", []) if existing else [])
+        filtered_docs = []
+        filtered_metas = []
+        filtered_ids = []
+        for doc, meta, chunk_id in zip(
+            documents[start:end],
+            metadatas[start:end],
+            batch_ids,
+        ):
+            if chunk_id in existing_ids:
+                skipped_existing += 1
+                continue
+            filtered_docs.append(doc)
+            filtered_metas.append(meta)
+            filtered_ids.append(chunk_id)
+        if not filtered_ids:
+            continue
         collection.upsert(
+            documents=filtered_docs,
+            metadatas=filtered_metas,
+            ids=filtered_ids
         )
+        inserted_new += len(filtered_ids)
+        print(f"  ✅ Đã index mới {inserted_new}/{total} chunks")
     embedding_fn.set_mode("query")
+    print(f"\n✅ Tổng cộng {inserted_new} chunks mới đã được index vào ChromaDB")
+    if skipped_existing:
+        print(f"⏭️ Bỏ qua {skipped_existing} chunks đã tồn tại")
     print(f"📁 Dữ liệu lưu tại: {CHROMA_PERSIST_DIR}")
     print(f"🧠 Embedding model: {EMBEDDING_MODEL}")
     max_distance: ngưỡng tối đa, chỉ trả về kết quả có distance < max_distance.
     """
     collection = get_collection()
+    # Đảm bảo query luôn dùng đúng prefix "query: "
+    get_embedding_function().set_mode("query")
     if collection.count() == 0:
         print("[Search] ⚠️ Database rỗng! Chạy run_pipeline.py trước.")