Spaces:
Sleeping
Sleeping
| import os | |
| import argparse | |
| from pathlib import Path | |
| import markdown | |
| from bs4 import BeautifulSoup | |
| from qdrant_client import QdrantClient | |
| from qdrant_client.models import Distance, VectorParams, PointStruct | |
| from dotenv import load_dotenv | |
| # Add these to enable relative imports | |
| import sys | |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | |
| from app.services.embeddings_service import EmbeddingsService, GeminiEmbeddingsService | |
| from app.qdrant_client import get_qdrant_client | |
| from app.config import settings | |
| load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env") | |
| QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "docs_collection") | |
def load_mdx_content(filepath: Path) -> str:
    """Read an MDX file and return its visible plain-text content.

    MDX is essentially Markdown, so the file is rendered to HTML with the
    ``markdown`` package and the text is extracted from the markup via
    BeautifulSoup (tags stripped, text nodes kept).

    Args:
        filepath: Path to the ``.mdx`` file to read (decoded as UTF-8).

    Returns:
        The concatenated text content of the rendered document.
    """
    raw = filepath.read_text(encoding='utf-8')
    rendered = markdown.markdown(raw)
    return BeautifulSoup(rendered, 'html.parser').get_text()
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
    """Split *text* into overlapping chunks for embedding.

    Consecutive chunks share ``overlap`` characters so that sentences cut
    at a chunk boundary still appear intact in the next chunk.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum length of each chunk, in characters.
        overlap: Number of characters shared between consecutive chunks.
            Must be strictly smaller than ``chunk_size``.

    Returns:
        List of chunks covering all of *text*, each at most ``chunk_size``
        characters long.

    Raises:
        ValueError: If ``overlap >= chunk_size`` (the window would never
            advance, or ``range`` would reject a non-positive step).
    """
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    step = chunk_size - overlap
    chunks: list[str] = []
    for i in range(0, len(text), step):
        chunks.append(text[i:i + chunk_size])
        # Stop once a chunk reaches end-of-text; otherwise the next window
        # would be pure overlap, duplicating data already in this chunk.
        if i + chunk_size >= len(text):
            break
    return chunks
async def ingest_content(
    docs_path: Path,
    qdrant_client: QdrantClient,
    embeddings_service: EmbeddingsService,
    collection_name: str,
):
    """Embed every ``.mdx`` file under *docs_path* and load it into Qdrant.

    Recreates (i.e. wipes and rebuilds) the target collection, then walks
    the docs tree, chunks each file's text, embeds every chunk, and upserts
    the resulting points in batches of 100.

    Args:
        docs_path: Root directory searched recursively for ``*.mdx`` files.
        qdrant_client: Connected Qdrant client used for collection ops.
        embeddings_service: Service that turns a text chunk into a vector.
        collection_name: Name of the Qdrant collection to (re)create.
    """
    # Vector dimensionality must match the provider that produces the
    # embeddings: 768 for Gemini, 1536 for the OpenAI-style default.
    vector_size = 768 if isinstance(embeddings_service, GeminiEmbeddingsService) else 1536

    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    def _flush(batch: list[PointStruct]) -> None:
        # Synchronous upsert; wait=True blocks until the write is applied.
        qdrant_client.upsert(
            collection_name=collection_name,
            points=batch,
            wait=True,
        )

    pending: list[PointStruct] = []
    next_id = 0
    for mdx_file in docs_path.rglob("*.mdx"):
        print(f"Processing {mdx_file}")
        text = load_mdx_content(mdx_file)
        for piece in chunk_text(text):
            vector = await embeddings_service.create_embedding(piece)
            pending.append(
                PointStruct(
                    id=next_id,
                    vector=vector,
                    payload={
                        "content": piece,
                        # Store the path relative to the docs root so the
                        # payload is stable across machines.
                        "source": str(mdx_file.relative_to(docs_path)),
                    },
                )
            )
            next_id += 1
            if len(pending) >= 100:  # Batch upsert
                _flush(pending)
                pending = []

    if pending:  # Upsert remaining points
        _flush(pending)
    print(f"Ingestion complete. Total points: {next_id}")
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser(description="Ingest MDX content into Qdrant.") | |
| parser.add_argument( | |
| "--docs_path", | |
| type=str, | |
| default="../physical-ai-humanoid-robotics/docs/", | |
| help="Path to the directory containing MDX documentation files." | |
| ) | |
| args = parser.parse_args() | |
| qdrant_client = get_qdrant_client() | |
| # Choose the appropriate embedding service based on AI_PROVIDER setting | |
| if settings.AI_PROVIDER.lower() == "gemini": | |
| embeddings_service = GeminiEmbeddingsService() | |
| else: | |
| embeddings_service = EmbeddingsService() | |
| # Run the async ingestion | |
| import asyncio | |
| asyncio.run(ingest_content( | |
| docs_path=Path(args.docs_path), | |
| qdrant_client=qdrant_client, | |
| embeddings_service=embeddings_service, | |
| collection_name=QDRANT_COLLECTION_NAME | |
| )) |