"""Ingest MDX documentation into a Qdrant collection.

Every ``*.mdx`` file under a docs directory is reduced to plain text, split
into overlapping character chunks, embedded, and upserted into Qdrant.
"""
import argparse
import asyncio
import os
import sys
from pathlib import Path

import markdown
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

# Add the parent directory to sys.path so the ``app`` package can be imported
# when this file is run directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from app.services.embeddings_service import EmbeddingsService
from app.qdrant_client import get_qdrant_client

load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env")

QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "docs_collection")


def load_mdx_content(filepath: Path) -> str:
    """Return the plain text of the MDX file at *filepath*.

    The file is read as UTF-8, rendered to HTML with ``markdown`` (MDX is
    treated as ordinary Markdown), and the HTML is then stripped back to text
    with BeautifulSoup.
    """
    content = filepath.read_text(encoding='utf-8')
    html = markdown.markdown(content)
    return BeautifulSoup(html, 'html.parser').get_text()


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
    """Split *text* into character chunks that overlap their neighbours.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order. The final chunk may be shorter than
        ``chunk_size``. Empty *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap == chunk_size would make the stride zero (range() rejects it),
    # a larger overlap makes the stride negative so nothing is produced, and a
    # negative overlap leaves gaps between chunks. Reject all three up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )
    stride = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), stride)]


async def ingest_content(
    docs_path: Path,
    qdrant_client: QdrantClient,
    embeddings_service: EmbeddingsService,
    collection_name: str,
    vector_size: int = 1536,
    batch_size: int = 100,
):
    """Rebuild *collection_name* from the ``*.mdx`` files under *docs_path*.

    Args:
        docs_path: Directory searched recursively for ``*.mdx`` files.
        qdrant_client: Client used to recreate the collection and upsert points.
        embeddings_service: Service whose ``create_embedding(chunk)`` result is
            stored as each point's vector (called synchronously).
        collection_name: Name of the collection to recreate and fill.
        vector_size: Dimension of the collection's vectors. The default, 1536,
            is the OpenAI embeddings size used by the original script.
        batch_size: Number of points sent per upsert call; must be positive.

    Raises:
        NotADirectoryError: If *docs_path* is not an existing directory.
        ValueError: If *batch_size* is not positive.
    """
    # Validate before touching Qdrant: recreating the collection first would
    # leave it empty when the docs path is simply mistyped.
    if not docs_path.is_dir():
        raise NotADirectoryError(f"Docs path is not a directory: {docs_path}")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    pending = []
    point_id = 0
    # Sorted so point ids are assigned in the same order on every run;
    # rglob's own ordering depends on the filesystem.
    for mdx_file in sorted(docs_path.rglob("*.mdx")):
        print(f"Processing {mdx_file}")
        source = str(mdx_file.relative_to(docs_path))
        for chunk in chunk_text(load_mdx_content(mdx_file)):
            embedding = embeddings_service.create_embedding(chunk)
            pending.append(
                PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload={
                        "content": chunk,
                        "source": source,
                    },
                )
            )
            point_id += 1

            if len(pending) >= batch_size:  # Batch upsert
                qdrant_client.upsert(
                    collection_name=collection_name,
                    points=pending,
                    wait=True,
                )
                pending = []

    if pending:  # Upsert remaining points
        qdrant_client.upsert(
            collection_name=collection_name,
            points=pending,
            wait=True,
        )
    print(f"Ingestion complete. Total points: {point_id}")


def main() -> None:
    """Parse command-line arguments and run the ingestion."""
    parser = argparse.ArgumentParser(description="Ingest MDX content into Qdrant.")
    parser.add_argument(
        "--docs_path",
        type=str,
        default="../physical-ai-humanoid-robotics/docs/",
        help="Path to the directory containing MDX documentation files."
    )
    args = parser.parse_args()

    qdrant_client = get_qdrant_client()
    embeddings_service = EmbeddingsService()

    # Run the async ingestion
    asyncio.run(ingest_content(
        docs_path=Path(args.docs_path),
        qdrant_client=qdrant_client,
        embeddings_service=embeddings_service,
        collection_name=QDRANT_COLLECTION_NAME,
    ))


if __name__ == "__main__":
    main()