Spaces:

NguyenQuocVy2004
/

chatbot-lichsu

Running

Nguyễn Quốc Vỹ committed on Mar 20

Commit

0700453

1 Parent(s): 47738d8

Tối ưu đồng bộ chỉ mục: chỉ index tài liệu mới, bỏ reindex toàn bộ

Files changed (2) hide show

backend/admin_services.py CHANGED Viewed

@@ -173,22 +173,43 @@ def reindex_doc(ma_tai_lieu: str) -> tuple[bool, str]:
 def reindex_all() -> tuple[bool, str]:
-    """Tái lập chỉ mục Vector DB: chạy pipeline đọc PDF → chunk → ChromaDB."""
     try:
-        from data_processing.loader import load_all_documents
-        from data_processing.chunking import chunk_documents
-        from data_processing.indexing import create_vector_database, delete_collection
-        documents = load_all_documents(PDF_DIR)
-        if not documents:
             return False, f"Không có tài liệu PDF trong {PDF_DIR}"
-        chunks = chunk_documents(documents)
-        try:
-            delete_collection()
-        except Exception:
-            pass
-        create_vector_database(chunks)
         schedule_vector_sync()
-        return True, f"Đã tái lập chỉ mục: {len(documents)} tài liệu, {len(chunks)} chunks."
     except Exception as e:
         return False, f"Lỗi: {str(e)}"

 def reindex_all() -> tuple[bool, str]:
+    """
+    Đồng bộ chỉ mục theo kiểu tăng dần (incremental):
+    - Quét thư mục PDF runtime
+    - Chỉ index tài liệu CHƯA có trong ChromaDB
+    Tránh reindex toàn bộ gây chậm khi số tài liệu lớn.
+    """
     try:
+        from data_processing.dynamic_indexing import add_pdf_file
+        if not os.path.isdir(PDF_DIR):
             return False, f"Không có tài liệu PDF trong {PDF_DIR}"
+        pdf_files = [
+            os.path.join(PDF_DIR, f)
+            for f in sorted(os.listdir(PDF_DIR))
+            if f.lower().endswith(".pdf")
+        ]
+        if not pdf_files:
+            return False, f"Không có tài liệu PDF trong {PDF_DIR}"
+        indexed_docs = 0
+        indexed_chunks = 0
+        skipped_docs = 0
+        for pdf_path in pdf_files:
+            added = add_pdf_file(pdf_path)
+            if added > 0:
+                indexed_docs += 1
+                indexed_chunks += added
+            else:
+                skipped_docs += 1
         schedule_vector_sync()
+        return True, (
+            f"Đồng bộ xong: thêm mới {indexed_docs} tài liệu / {indexed_chunks} chunks, "
+            f"bỏ qua {skipped_docs} tài liệu đã có."
+        )
     except Exception as e:
         return False, f"Lỗi: {str(e)}"

frontend/app.py CHANGED Viewed

@@ -1011,15 +1011,16 @@ def show_admin_page():
                     st.error(msg)
     with tab5:
-        st.subheader("Thống kê RAG & Tái lập chỉ mục")
         stats = get_rag_stats()
         if stats.get("error"):
             st.warning(stats["error"])
         else:
             st.metric("Tổng số chunks", stats.get("total_chunks", 0))
             st.caption(f"Collection: {stats.get('collection_name')} | Thư mục: {stats.get('persist_dir', '')}")
-        if st.button("Tái lập chỉ mục Vector DB (ChromaDB)", type="primary"):
-            with st.spinner("Đang đọc PDF, chia đoạn và index..."):
                 ok, msg = reindex_all()
             if ok:
                 st.success(msg)

                     st.error(msg)
     with tab5:
+        st.subheader("Thống kê RAG & Đồng bộ chỉ mục")
         stats = get_rag_stats()
         if stats.get("error"):
             st.warning(stats["error"])
         else:
             st.metric("Tổng số chunks", stats.get("total_chunks", 0))
             st.caption(f"Collection: {stats.get('collection_name')} | Thư mục: {stats.get('persist_dir', '')}")
+        st.caption("Nút bên dưới chỉ index tài liệu mới, bỏ qua tài liệu/chunks đã có để chạy nhanh hơn.")
+        if st.button("Đồng bộ chỉ mục (chỉ tài liệu mới)", type="primary"):
+            with st.spinner("Đang đồng bộ tài liệu mới vào ChromaDB..."):
                 ok, msg = reindex_all()
             if ok:
                 st.success(msg)