File size: 3,825 Bytes
fe10c91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import argparse
from pathlib import Path
import markdown
from bs4 import BeautifulSoup
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from dotenv import load_dotenv

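# Put the parent of this script's directory on sys.path so the absolute
# `app.*` imports below resolve when the file is run directly as a script.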
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from app.services.embeddings_service import EmbeddingsService, GeminiEmbeddingsService
from app.qdrant_client import get_qdrant_client
from app.config import settings

# Load environment variables from the ".env" file located one directory above
# the directory that contains this script.
# NOTE(review): `app.config.settings` is imported above, before this call
# runs; if it reads environment variables at import time it will not see
# values coming from this file — confirm.
load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env")

# Name of the Qdrant collection used for ingestion: taken from the
# QDRANT_COLLECTION_NAME environment variable, defaulting to "docs_collection".
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "docs_collection")

def load_mdx_content(filepath: Path) -> str:
    """Return the plain-text content of the MDX file at *filepath*.

    The file is decoded as UTF-8 and rendered to HTML with ``markdown``
    (MDX is treated as ordinary Markdown here); the text is then extracted
    from that HTML with BeautifulSoup's ``html.parser``.
    """
    with open(filepath, mode='r', encoding='utf-8') as source:
        raw_markdown = source.read()

    # Markdown -> HTML -> text strips the Markdown markup from the content.
    rendered_html = markdown.markdown(raw_markdown)
    return BeautifulSoup(rendered_html, 'html.parser').get_text()

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
    """Split *text* into overlapping character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Splitting stops as soon as a chunk reaches the end of *text*, which
    avoids emitting a trailing chunk that is entirely contained in the
    previous one.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size would give a zero or negative stride (no
    # progress / silently no chunks); a negative overlap would skip text.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    stride = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, stride):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This window already covers the tail; any further window would
            # only repeat characters that are already part of this chunk.
            break
    return chunks

async def ingest_content(
    docs_path: Path,
    qdrant_client: QdrantClient,
    embeddings_service: EmbeddingsService,
    collection_name: str,
):
    """Embed every ``*.mdx`` file under *docs_path* and store it in Qdrant.

    The collection is recreated first (via ``recreate_collection``), then
    each file is converted to text, chunked, embedded chunk by chunk, and
    upserted in batches. Point ids are sequential integers starting at 0;
    each payload holds the chunk text (``"content"``) and the file path
    relative to *docs_path* (``"source"``).

    Args:
        docs_path: Directory that is searched recursively for ``*.mdx`` files.
        qdrant_client: Client used to recreate the collection and upsert points.
        embeddings_service: Service whose ``create_embedding`` coroutine
            returns the vector for a chunk.
        collection_name: Name of the Qdrant collection to (re)create and fill.

    Raises:
        FileNotFoundError: If *docs_path* is not an existing directory. This
            is checked before the collection is recreated so that a mistyped
            path cannot replace an existing collection with an empty one.
    """
    # Fail fast: everything below starts by recreating the collection, and
    # rglob() on a missing directory would simply yield no files.
    if not docs_path.is_dir():
        raise FileNotFoundError(f"Docs directory not found: {docs_path}")

    # Determine vector size based on the embedding service
    if isinstance(embeddings_service, GeminiEmbeddingsService):
        vector_size = 768  # Gemini embedding size
    else:
        vector_size = 1536  # OpenAI embedding size

    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    batch_size = 100  # number of points sent per upsert call
    points = []
    point_id = 0
    for mdx_file in docs_path.rglob("*.mdx"):
        print(f"Processing {mdx_file}")
        content = load_mdx_content(mdx_file)
        # Loop-invariant for all chunks of this file, so compute it once.
        source = str(mdx_file.relative_to(docs_path))

        for chunk in chunk_text(content):
            embedding = await embeddings_service.create_embedding(chunk)
            points.append(
                PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload={
                        "content": chunk,
                        "source": source,
                    },
                )
            )
            point_id += 1

            if len(points) >= batch_size:  # Batch upsert
                qdrant_client.upsert(
                    collection_name=collection_name,
                    points=points,
                    wait=True,
                )
                points = []

    if points:  # Upsert remaining points
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points,
            wait=True,
        )

    print(f"Ingestion complete. Total points: {point_id}")

if __name__ == "__main__":
    import asyncio

    # Command-line interface: one optional flag pointing at the docs folder.
    cli = argparse.ArgumentParser(description="Ingest MDX content into Qdrant.")
    cli.add_argument(
        "--docs_path",
        default="../physical-ai-humanoid-robotics/docs/",
        type=str,
        help="Path to the directory containing MDX documentation files.",
    )
    cli_args = cli.parse_args()

    client = get_qdrant_client()

    # AI_PROVIDER == "gemini" (case-insensitive) selects the Gemini
    # embeddings service; any other value uses the default EmbeddingsService.
    use_gemini = settings.AI_PROVIDER.lower() == "gemini"
    embedder = GeminiEmbeddingsService() if use_gemini else EmbeddingsService()

    # ingest_content is a coroutine function, so drive it with asyncio.run().
    ingestion = ingest_content(
        docs_path=Path(cli_args.docs_path),
        qdrant_client=client,
        embeddings_service=embedder,
        collection_name=QDRANT_COLLECTION_NAME,
    )
    asyncio.run(ingestion)