File size: 3,350 Bytes
0cee4dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import argparse
from pathlib import Path
import markdown
from bs4 import BeautifulSoup
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from dotenv import load_dotenv

# Put this script's parent directory on sys.path so the absolute `app.*` imports below resolve when the file is run directly as a script
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from app.services.embeddings_service import EmbeddingsService
from app.qdrant_client import get_qdrant_client

# Load environment variables from the ".env" file that sits one directory
# above the directory containing this script. This must run before the
# os.getenv() lookup below so values from that file are visible to it.
load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env")

# Name of the Qdrant collection to (re)build; taken from the
# QDRANT_COLLECTION_NAME environment variable, defaulting to "docs_collection".
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "docs_collection")

def load_mdx_content(filepath: Path) -> str:
    """Return the plain text of an MDX file.

    The file is read as UTF-8 and rendered to HTML with ``markdown.markdown``
    (the MDX source is treated as ordinary Markdown); the text content of
    that HTML is then extracted with BeautifulSoup's ``html.parser``.

    Args:
        filepath: Location of the ``.mdx`` file to read.

    Returns:
        The text of the rendered document with HTML tags removed.
    """
    raw_source = Path(filepath).read_text(encoding="utf-8")
    rendered_html = markdown.markdown(raw_source)
    return BeautifulSoup(rendered_html, "html.parser").get_text()

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
    """Split *text* into overlapping, fixed-size character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Chunking stops as soon as a chunk reaches the end of the text, so the
    result never ends with a chunk that is merely a suffix of its predecessor.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.

    >>> chunk_text("abcdefghij", chunk_size=4, overlap=2)
    ['abcd', 'cdef', 'efgh', 'ghij']
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would make the stride zero, and a larger
        # overlap would make it negative, which silently yields no chunks.
        # A negative overlap would leave gaps between chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    stride = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, stride):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= text_length:
            # This window already covers the tail; any further window would
            # only repeat characters contained in this chunk.
            break
    return chunks

async def ingest_content(
    docs_path: Path,
    qdrant_client: QdrantClient,
    embeddings_service: EmbeddingsService,
    collection_name: str,
    *,
    vector_size: int = 1536,
    batch_size: int = 100,
) -> None:
    """Rebuild a Qdrant collection from the ``*.mdx`` files under *docs_path*.

    The collection is dropped and recreated, then every MDX file found
    recursively under ``docs_path`` is converted to text, chunked, embedded
    one chunk at a time, and upserted in batches. Each point's payload holds
    the chunk text (``"content"``) and the file path relative to
    ``docs_path`` (``"source"``). Point ids are consecutive integers
    starting at 0.

    Args:
        docs_path: Root directory to scan for ``.mdx`` files.
        qdrant_client: Client used to recreate the collection and upsert points.
        embeddings_service: Object whose ``create_embedding(text)`` returns
            the vector for one chunk.
        collection_name: Name of the collection to recreate and fill.
        vector_size: Dimensionality the collection is created with; it has to
            match the length of the vectors ``create_embedding`` returns.
            NOTE(review): the default of 1536 was originally labelled as the
            Gemini text-embedding-004 size -- confirm it against the model
            actually used by ``EmbeddingsService``.
        batch_size: Number of points sent per upsert call; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        FileNotFoundError: If ``docs_path`` is not an existing directory.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    # Validate the input directory BEFORE touching the collection: recreating
    # it is destructive, and a mistyped path would otherwise wipe the existing
    # data and then ingest nothing.
    if not docs_path.is_dir():
        raise FileNotFoundError(f"docs_path is not an existing directory: {docs_path}")

    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    def flush(batch: list) -> None:
        """Send one batch of points and wait for the write to be applied."""
        qdrant_client.upsert(
            collection_name=collection_name,
            points=batch,
            wait=True,
        )

    points = []
    point_id = 0
    # Sorted so that file order (and therefore point ids) does not depend on
    # the order in which the filesystem happens to list entries.
    for mdx_file in sorted(docs_path.rglob("*.mdx")):
        print(f"Processing {mdx_file}")
        source = str(mdx_file.relative_to(docs_path))

        for chunk in chunk_text(load_mdx_content(mdx_file)):
            if not chunk.strip():
                # Whitespace-only chunks carry no retrievable text.
                continue
            points.append(
                PointStruct(
                    id=point_id,
                    vector=embeddings_service.create_embedding(chunk),
                    payload={"content": chunk, "source": source},
                )
            )
            point_id += 1

            if len(points) >= batch_size:
                flush(points)
                points = []

    if points:  # Upsert whatever is left from the last partial batch
        flush(points)

    print(f"Ingestion complete. Total points: {point_id}")

if __name__ == "__main__":
    import asyncio

    # Command line: a single optional flag pointing at the docs directory.
    cli = argparse.ArgumentParser(description="Ingest MDX content into Qdrant.")
    cli.add_argument(
        "--docs_path",
        default="../physical-ai-humanoid-robotics/docs/",
        type=str,
        help="Path to the directory containing MDX documentation files.",
    )
    docs_root = Path(cli.parse_args().docs_path)

    # Build the collaborators: Qdrant client first, then the embeddings service.
    client = get_qdrant_client()
    embedder = EmbeddingsService()

    # ingest_content is a coroutine function, so drive it with asyncio.run.
    ingestion = ingest_content(
        docs_path=docs_root,
        qdrant_client=client,
        embeddings_service=embedder,
        collection_name=QDRANT_COLLECTION_NAME,
    )
    asyncio.run(ingestion)