Spaces:

codersHub12
/

rag

Sleeping

File size: 3,825 Bytes

fe10c91

import os
import argparse
from pathlib import Path
import markdown
from bs4 import BeautifulSoup
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from dotenv import load_dotenv

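# Put the parent of this script's directory on sys.path so the absolute
# `app.*` imports below resolve when this file is run directly as a script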
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from app.services.embeddings_service import EmbeddingsService, GeminiEmbeddingsService
from app.qdrant_client import get_qdrant_client
from app.config import settings

# Load environment variables from the ".env" file that sits one directory
# above the directory containing this script.
# NOTE(review): `app.config.settings` is imported above, before this call; if
# it reads environment variables at import time it will not see values that
# only exist in this .env file — confirm against app/config.
load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env")

# Target Qdrant collection name, read from the environment after the .env
# load above; falls back to "docs_collection" when the variable is unset.
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "docs_collection")

def load_mdx_content(filepath: Path) -> str:
    """Return the plain text of the MDX file at *filepath*.

    The file is read as UTF-8 and rendered to HTML with ``markdown.markdown``
    (the MDX source is treated as ordinary Markdown). The HTML is then parsed
    with BeautifulSoup's ``html.parser`` and only its text content is
    returned.
    """
    raw_mdx = Path(filepath).read_text(encoding="utf-8")
    rendered_html = markdown.markdown(raw_mdx)
    return BeautifulSoup(rendered_html, "html.parser").get_text()

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
    """Split *text* into overlapping, fixed-size character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final window may be shorter than ``chunk_size``. Splitting stops as
    soon as a window reaches the end of the text, so no trailing chunk is
    produced that merely duplicates the tail of the previous chunk.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size would make the stride zero or negative (no forward
    # progress); a negative overlap would leave gaps between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            "overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    stride = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, stride):
        chunks.append(text[start:start + chunk_size])
        # This window already covers the end of the text; any further window
        # would contain only characters that are already in this chunk.
        if start + chunk_size >= text_length:
            break
    return chunks

async def ingest_content(
    docs_path: Path,
    qdrant_client: QdrantClient,
    embeddings_service: EmbeddingsService,
    collection_name: str,
):
    """Embed every ``*.mdx`` file under *docs_path* and store it in Qdrant.

    The target collection is recreated on every run, then each file is
    converted to text, split into chunks, embedded one chunk at a time and
    upserted in batches. Each point's payload holds the chunk text under
    ``"content"`` and the file path relative to *docs_path* under ``"source"``.

    Args:
        docs_path: Directory that is searched recursively for ``*.mdx`` files.
        qdrant_client: Client used to recreate the collection and upsert points.
        embeddings_service: Service whose ``create_embedding`` coroutine turns
            a chunk into a vector.
        collection_name: Name of the collection to recreate and fill.

    Raises:
        NotADirectoryError: If *docs_path* is not an existing directory. This
            is checked before the collection is touched.
    """
    # Validate the input before the recreate call below: with a wrong path no
    # file would be found and the collection would be recreated and left empty.
    if not docs_path.is_dir():
        raise NotADirectoryError(f"docs_path is not a directory: {docs_path}")

    # Determine vector size based on the embedding service.
    # NOTE(review): both sizes are hard-coded and assumed to match the model
    # each service actually uses — confirm against the service implementations.
    if isinstance(embeddings_service, GeminiEmbeddingsService):
        vector_size = 768  # Gemini embedding size
    else:
        vector_size = 1536  # OpenAI embedding size

    # NOTE(review): recreate_collection is reported as deprecated in newer
    # qdrant-client releases — confirm against the pinned version.
    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    def upsert_batch(batch):
        # wait=True is passed so the call waits for the upsert to be applied.
        qdrant_client.upsert(
            collection_name=collection_name,
            points=batch,
            wait=True,
        )

    batch_size = 100
    points = []
    point_id = 0
    # Sorted so point ids are assigned in the same order on every run,
    # independent of the order in which the filesystem lists entries.
    for mdx_file in sorted(docs_path.rglob("*.mdx")):
        print(f"Processing {mdx_file}")
        source = str(mdx_file.relative_to(docs_path))
        content = load_mdx_content(mdx_file)

        for chunk in chunk_text(content):
            embedding = await embeddings_service.create_embedding(chunk)
            points.append(
                PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload={"content": chunk, "source": source},
                )
            )
            point_id += 1

            if len(points) >= batch_size:  # Batch upsert
                upsert_batch(points)
                points = []

    if points:  # Upsert remaining points
        upsert_batch(points)

    print(f"Ingestion complete. Total points: {point_id}")

if __name__ == "__main__":
    import asyncio

    # Command-line interface: a single optional flag pointing at the docs tree.
    cli = argparse.ArgumentParser(description="Ingest MDX content into Qdrant.")
    cli.add_argument(
        "--docs_path",
        type=str,
        default="../physical-ai-humanoid-robotics/docs/",
        help="Path to the directory containing MDX documentation files.",
    )
    docs_dir = Path(cli.parse_args().docs_path)

    client = get_qdrant_client()

    # The AI_PROVIDER setting selects the embedding backend: "gemini"
    # (case-insensitive) picks the Gemini service, anything else the default.
    use_gemini = settings.AI_PROVIDER.lower() == "gemini"
    service_cls = GeminiEmbeddingsService if use_gemini else EmbeddingsService
    embedder = service_cls()

    # Drive the async ingestion coroutine to completion.
    asyncio.run(
        ingest_content(
            docs_path=docs_dir,
            qdrant_client=client,
            embeddings_service=embedder,
            collection_name=QDRANT_COLLECTION_NAME,
        )
    )