Spaces:
Runtime error
Runtime error
File size: 1,733 Bytes
36425a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import os
import glob
import uuid
from dotenv import load_dotenv
from backend.app.utils.chunking import semantic_chunking
from backend.app.services.embedding_service import EmbeddingService
from backend.app.services.qdrant_service import QdrantService
# Load variables from a local .env file into the process environment at import
# time, before index_chapters() constructs the services.
# NOTE(review): presumably the embedding/Qdrant services read their settings
# from the environment -- confirm against their constructors.
load_dotenv()
def index_chapters() -> None:
    """Chunk, embed and store every chapter markdown file in Qdrant.

    Looks for files matching ``frontend/docs/chapter-*.md`` relative to the
    current working directory. Each file is split with ``semantic_chunking``;
    every chunk is embedded with ``EmbeddingService.encode`` and written with
    ``QdrantService.upsert_chunks`` together with its metadata payload.

    Point IDs are derived deterministically from the file path and the chunk
    position, so running the script again overwrites the points written by a
    previous run instead of adding duplicates.

    Returns:
        None. Progress is reported on stdout.
    """
    print("Starting chapter indexing...")

    # Single source of truth for the directory so the glob pattern and the
    # "nothing found" message cannot drift apart.
    chapter_dir = "frontend/docs"

    # glob gives no ordering guarantee; sort for a stable processing order.
    chapter_files = sorted(glob.glob(f"{chapter_dir}/chapter-*.md"))
    if not chapter_files:
        print(f"No chapter files found in {chapter_dir}/. Please ensure chapters exist.")
        return

    # Build the services only once there is something to index.
    # Assumes both can be constructed with no arguments and without a FastAPI
    # app context -- TODO confirm against the service classes.
    embedding_service = EmbeddingService()
    qdrant_service = QdrantService()

    for file_path in chapter_files:
        print(f"Processing {file_path}...")
        with open(file_path, "r", encoding="utf-8") as f:
            markdown_content = f.read()

        # Normalise separators so the derived IDs are the same on every OS.
        id_key = file_path.replace(os.sep, "/")

        for i, chunk in enumerate(semantic_chunking(markdown_content)):
            chunk_content = chunk["content"]

            # NOTE(review): only the metadata is sent as the payload; the chunk
            # text is not added here. Confirm semantic_chunking already puts
            # whatever retrieval needs into the metadata.
            metadata = chunk["metadata"]
            metadata["source"] = file_path  # Replace with the real file path
            metadata["chunk_number"] = i

            # Deterministic ID: the same file and chunk position always map to
            # the same point, so an upsert replaces instead of duplicating.
            # Chunks beyond a chapter's new, shorter length are not removed.
            point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{id_key}#{i}"))

            # encode() is called with a one-element list; take its only result.
            embedding = embedding_service.encode([chunk_content])[0]

            qdrant_service.upsert_chunks(
                ids=[point_id],
                vectors=[embedding],
                payloads=[metadata],
            )

    print("Chapter indexing completed.")
# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    index_chapters()
|