File size: 1,733 Bytes
36425a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import glob
import uuid
from dotenv import load_dotenv

from backend.app.utils.chunking import semantic_chunking
from backend.app.services.embedding_service import EmbeddingService
from backend.app.services.qdrant_service import QdrantService

load_dotenv()

def index_chapters():
    """Chunk, embed and store every chapter markdown file in Qdrant.

    Reads each file matching ``frontend/docs/chapter-*.md`` (relative to the
    current working directory), splits it with ``semantic_chunking``, embeds
    the chunk contents with ``EmbeddingService`` and upserts the vectors with
    their metadata through ``QdrantService``.

    Each chunk's metadata is updated in place with:
      * ``source``: the path of the file the chunk came from
      * ``chunk_number``: the chunk's zero-based position within that file

    Point IDs are derived deterministically from the source path and chunk
    number, so running the script again overwrites the existing points
    instead of inserting duplicates.
    NOTE(review): if a chapter shrinks, points for chunk numbers that no
    longer exist are not removed by this function.

    Returns:
        None. Progress is reported on stdout.
    """
    print("Starting chapter indexing...")

    # Initialize services (assuming they can be initialized without FastAPI
    # app context for script use).
    embedding_service = EmbeddingService()
    qdrant_service = QdrantService()

    # glob() returns matches in arbitrary order; sort for reproducible runs.
    chapter_files = sorted(glob.glob("frontend/docs/chapter-*.md"))
    if not chapter_files:
        # The message must name the directory that is actually searched.
        print("No chapter files found in frontend/docs/. Please ensure chapters exist.")
        return

    for file_path in chapter_files:
        print(f"Processing {file_path}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            markdown_content = f.read()

        # Use forward slashes in the ID key so the same chapter maps to the
        # same point IDs regardless of the platform's path separator.
        id_key = file_path.replace(os.sep, "/")

        point_ids = []
        contents = []
        payloads = []
        for i, chunk in enumerate(semantic_chunking(markdown_content)):
            metadata = chunk["metadata"]
            metadata["source"] = file_path  # Update source to actual file path
            metadata["chunk_number"] = i

            # Stable UUID per (file, chunk index): re-indexing replaces the
            # point rather than adding a new one under a random ID.
            point_ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, f"{id_key}#{i}")))
            contents.append(chunk["content"])
            payloads.append(metadata)

        if not point_ids:
            # Nothing to embed or store for this file.
            continue

        # One embedding call and one upsert per file instead of one per chunk.
        embeddings = embedding_service.encode(contents)
        qdrant_service.upsert_chunks(
            ids=point_ids,
            vectors=list(embeddings),
            payloads=payloads,
        )

    print("Chapter indexing completed.")

# Run the indexing job only when this file is executed as a script;
# importing the module does not trigger indexing.
if __name__ == "__main__":
    index_chapters()