# Hackaton1_BOOK_chatbot/scripts/index_chapters.py
# Abdullahcoder54's picture
# 'Upload'
# 36425a4
import os
import glob
import uuid
from dotenv import load_dotenv
from backend.app.utils.chunking import semantic_chunking
from backend.app.services.embedding_service import EmbeddingService
from backend.app.services.qdrant_service import QdrantService
# Load variables from a .env file into the process environment as soon as the
# module is imported. NOTE(review): presumably the services constructed in
# index_chapters() read their configuration from these variables - confirm.
load_dotenv()
def index_chapters(docs_glob: str = "frontend/docs/chapter-*.md") -> None:
    """Chunk, embed, and upsert every matching chapter file into Qdrant.

    Each file matched by ``docs_glob`` is read as UTF-8, split with
    ``semantic_chunking``, and every chunk is embedded and upserted as one
    point whose payload is the chunk's metadata plus ``source`` (the file
    path) and ``chunk_number`` (the chunk's index within the file).

    Point IDs are derived deterministically from the file path and chunk
    index, so running the script again reuses the same IDs instead of
    accumulating a fresh set of randomly-identified duplicates on every run.

    Args:
        docs_glob: Glob pattern selecting the chapter markdown files,
            resolved relative to the current working directory.
    """
    print("Starting chapter indexing...")

    # Built with default constructors; this assumes both services can be
    # created outside the FastAPI application context (script use).
    embedding_service = EmbeddingService()
    qdrant_service = QdrantService()

    # glob returns matches in arbitrary order; sort for reproducible runs.
    chapter_files = sorted(glob.glob(docs_glob))
    if not chapter_files:
        # Report the pattern actually searched (the old message named a
        # different directory than the one being globbed).
        print(f"No chapter files found matching {docs_glob!r}. Please ensure chapters exist.")
        return

    for file_path in chapter_files:
        print(f"Processing {file_path}...")
        with open(file_path, "r", encoding="utf-8") as f:
            markdown_content = f.read()

        # Normalise separators so the derived IDs are identical on every OS.
        source_key = file_path.replace(os.sep, "/")

        for i, chunk in enumerate(semantic_chunking(markdown_content)):
            chunk_content = chunk["content"]
            metadata = chunk["metadata"]
            metadata["source"] = file_path  # record the actual file path
            metadata["chunk_number"] = i

            # Stable name-based UUID: same file + same chunk index -> same ID,
            # so a repeated upsert targets the same point.
            point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source_key}#chunk-{i}"))

            # encode() takes a list of texts; take the single resulting vector.
            embedding = embedding_service.encode([chunk_content])[0]

            qdrant_service.upsert_chunks(
                ids=[point_id],
                vectors=[embedding],
                payloads=[metadata],
            )

    print("Chapter indexing completed.")
# Script entry point: run the indexer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    index_chapters()