Spaces:

Tahasaif3
/

chatbot

Runtime error

App Files Files Community

Tahasaif3 committed on Dec 8, 2025

Commit

e62ef42

verified ·

1 Parent(s): 456d4ee

Update app/services/document_ingestion.py

Browse files

Files changed (1) hide show

app/services/document_ingestion.py +38 -14

app/services/document_ingestion.py CHANGED Viewed

@@ -1,9 +1,10 @@
 import os
 from typing import List, Dict
 from app.utils.text_processing import extract_chapters_and_sections, split_text_into_chunks, clean_markdown
 from app.services.rag_service import rag_pipeline
-def ingest_book_content(file_path: str) -> List[str]:
     """
     Ingest the book content from a markdown file into the vector store
@@ -20,7 +21,8 @@ def ingest_book_content(file_path: str) -> List[str]:
     # Extract chapters and sections
     sections = extract_chapters_and_sections(content)
-    ingested_document_ids = []
     # Process each section
     for section in sections:
@@ -34,7 +36,7 @@ def ingest_book_content(file_path: str) -> List[str]:
         # Split into chunks if the content is too long
         chunks = split_text_into_chunks(clean_content, chunk_size=800, overlap=100)
-        # Ingest each chunk
         for i, chunk in enumerate(chunks):
             document = {
                 "title": section["title"] + (f" (part {i+1})" if len(chunks) > 1 else ""),
@@ -43,31 +45,53 @@ def ingest_book_content(file_path: str) -> List[str]:
                 "section": section["section"],
                 "subsection": section["subsection"]
             }
-            # Ingest the document
-            doc_id = rag_pipeline.ingest_document(document)
-            if doc_id:
-                ingested_document_ids.append(doc_id)
     return ingested_document_ids
-def initialize_knowledge_base():
     """
-    Initialize the knowledge base by ingesting the book content
     """
     # Define the path to the book knowledge base
-    book_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
-                            "book_knowledge_base.md")
     if os.path.exists(book_path):
         print("Ingesting book content into the knowledge base...")
-        document_ids = ingest_book_content(book_path)
         print(f"Successfully ingested {len(document_ids)} documents into the knowledge base.")
         return document_ids
     else:
         print(f"Book file not found at {book_path}")
         return []
 if __name__ == "__main__":
     # Initialize the knowledge base when the script is run directly
-    initialize_knowledge_base()

 import os
+import asyncio
 from typing import List, Dict
 from app.utils.text_processing import extract_chapters_and_sections, split_text_into_chunks, clean_markdown
 from app.services.rag_service import rag_pipeline
+async def ingest_book_content(file_path: str) -> List[str]:
     """
     Ingest the book content from a markdown file into the vector store
     # Extract chapters and sections
     sections = extract_chapters_and_sections(content)
+    # Prepare all documents first
+    documents_to_ingest = []
     # Process each section
     for section in sections:
         # Split into chunks if the content is too long
         chunks = split_text_into_chunks(clean_content, chunk_size=800, overlap=100)
+        # Prepare each chunk as a document
         for i, chunk in enumerate(chunks):
             document = {
                 "title": section["title"] + (f" (part {i+1})" if len(chunks) > 1 else ""),
                 "section": section["section"],
                 "subsection": section["subsection"]
             }
+            documents_to_ingest.append(document)
+    # Ingest all documents using batch processing for better performance
+    print(f"Prepared {len(documents_to_ingest)} document chunks for ingestion...")
+    ingested_document_ids = await rag_pipeline.ingest_documents_batch(documents_to_ingest)
     return ingested_document_ids
+async def initialize_knowledge_base_async():
     """
+    Initialize the knowledge base by ingesting the book content.
+
+    Looks for "book_knowledge_base.md" in the directory reached by applying
+    os.path.dirname() three times to this module's file path. Progress and
+    the missing-file case are reported with print().
+
+    Returns:
+        The value returned by ingest_book_content() when the file exists,
+        otherwise an empty list.
     """
     # Build the book path relative to this module's location on disk
+    book_path = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
+        "book_knowledge_base.md"
+    )
     if os.path.exists(book_path):
         print("Ingesting book content into the knowledge base...")
+        document_ids = await ingest_book_content(book_path)
         print(f"Successfully ingested {len(document_ids)} documents into the knowledge base.")
         return document_ids
     else:
         print(f"Book file not found at {book_path}")
         return []
+def initialize_knowledge_base():
+    """
+    Synchronous wrapper to initialize the knowledge base.
+    Can be called from non-async contexts.
+
+    Returns:
+        - If an event loop is already running in the current thread: the
+          asyncio.Task wrapping initialize_knowledge_base_async(). The caller
+          must keep a reference to it and await it to get the result;
+          exceptions raised inside the task are NOT caught here.
+        - If no event loop is running: the value returned by
+          initialize_knowledge_base_async() after running it to completion.
+        - An empty list if scheduling or running the coroutine raised.
+    """
+    # Probe for a running loop outside the main try block, so that a later
+    # failure is not reported as chained to the probe's RuntimeError.
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:
+        loop_is_running = False
+    else:
+        loop_is_running = True
+
+    try:
+        if loop_is_running:
+            # asyncio.run() cannot be used inside a running loop, so schedule
+            # the coroutine and hand the task back to the caller
+            return asyncio.create_task(initialize_knowledge_base_async())
+        # No event loop running, create one and run to completion
+        return asyncio.run(initialize_knowledge_base_async())
+    except Exception as e:
+        print(f"Error initializing knowledge base: {e}")
+        import traceback
+        traceback.print_exc()
+        return []
 if __name__ == "__main__":
     # When executed directly as a script, run the async initializer to completion
+    asyncio.run(initialize_knowledge_base_async())