Spaces:
No application file
No application file
File size: 3,350 Bytes
0cee4dc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | import os
import argparse
from pathlib import Path
import markdown
from bs4 import BeautifulSoup
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from dotenv import load_dotenv
# Put the parent of this script's directory on sys.path so the absolute `app.*` imports below resolve
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from app.services.embeddings_service import EmbeddingsService
from app.qdrant_client import get_qdrant_client
# Load environment variables from the ".env" file that sits one directory
# above the folder containing this script.
load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env")
# Name of the target Qdrant collection; falls back to "docs_collection" when
# the QDRANT_COLLECTION_NAME environment variable is not set.
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "docs_collection")
def load_mdx_content(filepath: Path) -> str:
    """Return the plain text of an MDX file.

    The file is read as UTF-8, rendered to HTML with the Markdown converter,
    and the HTML is then flattened to its text content. MDX-specific syntax
    gets no special handling here; the file is treated as ordinary Markdown.

    Args:
        filepath: Location of the MDX file to read.

    Returns:
        The text extracted from the rendered HTML.
    """
    with open(filepath, mode='r', encoding='utf-8') as mdx_handle:
        raw_mdx = mdx_handle.read()

    # Markdown -> HTML -> text: going through HTML strips the markup tokens.
    rendered_html = markdown.markdown(raw_mdx)
    return BeautifulSoup(rendered_html, 'html.parser').get_text()
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
    """Split *text* into overlapping fixed-size character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Chunking stops as soon as a window reaches the end of the text, so no
    trailing chunk made up solely of already-covered overlap is produced.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        The chunks in order of appearance; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # This window already covers the tail of the text; any further
        # window would only repeat characters that are part of this chunk.
        if end >= text_length:
            break
    return chunks
async def ingest_content(
    docs_path: Path,
    qdrant_client: QdrantClient,
    embeddings_service: EmbeddingsService,
    collection_name: str,
    *,
    vector_size: int = 1536,
    batch_size: int = 100,
):
    """Rebuild a Qdrant collection from the MDX files under *docs_path*.

    The collection is recreated, then every ``*.mdx`` file found recursively
    under ``docs_path`` is converted to text, chunked, embedded, and upserted
    in batches. Each point stores the chunk text under ``"content"`` and the
    file's path relative to ``docs_path`` under ``"source"``. Point ids are
    sequential integers starting at 0.

    Args:
        docs_path: Root directory to search for MDX files.
        qdrant_client: Client used to recreate the collection and upsert.
        embeddings_service: Provides ``create_embedding(chunk)``; the result
            is passed straight through as the point vector.
        collection_name: Name of the collection to recreate and fill.
        vector_size: Dimensionality the collection is created with. It must
            match what ``embeddings_service`` produces.
        batch_size: Number of points accumulated before each upsert.

    Raises:
        NotADirectoryError: If ``docs_path`` is not an existing directory.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    docs_path = Path(docs_path)
    # Validate inputs BEFORE touching the collection: recreating it discards
    # the existing data, so a mistyped path must not get that far.
    if not docs_path.is_dir():
        raise NotADirectoryError(f"docs_path is not a directory: {docs_path}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # NOTE(review): the original comment attributed the 1536 default to
    # "Gemini text-embedding-004"; that model is documented with 768-dim
    # output -- confirm which model EmbeddingsService actually uses.
    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    def _upsert(batch):
        # wait=True: return only once the points have been applied.
        qdrant_client.upsert(
            collection_name=collection_name,
            points=batch,
            wait=True,
        )

    points = []
    point_id = 0
    # Sorted so that point ids are assigned in a reproducible order.
    for mdx_file in sorted(docs_path.rglob("*.mdx")):
        print(f"Processing {mdx_file}")
        source = str(mdx_file.relative_to(docs_path))
        for chunk in chunk_text(load_mdx_content(mdx_file)):
            # NOTE(review): called synchronously inside an async function;
            # presumably create_embedding is a blocking call -- confirm.
            embedding = embeddings_service.create_embedding(chunk)
            points.append(
                PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload={
                        "content": chunk,
                        "source": source,
                    },
                )
            )
            point_id += 1
            if len(points) >= batch_size:
                _upsert(points)
                points = []

    if points:  # Flush the final partial batch.
        _upsert(points)
    print(f"Ingestion complete. Total points: {point_id}")
if __name__ == "__main__":
    import asyncio

    # Command-line interface: one optional flag pointing at the docs tree.
    cli = argparse.ArgumentParser(description="Ingest MDX content into Qdrant.")
    cli.add_argument(
        "--docs_path",
        default="../physical-ai-humanoid-robotics/docs/",
        type=str,
        help="Path to the directory containing MDX documentation files.",
    )
    cli_args = cli.parse_args()

    # Build the collaborators the ingestion routine needs.
    client = get_qdrant_client()
    embedder = EmbeddingsService()

    # ingest_content is a coroutine function, so drive it with an event loop.
    ingestion = ingest_content(
        docs_path=Path(cli_args.docs_path),
        qdrant_client=client,
        embeddings_service=embedder,
        collection_name=QDRANT_COLLECTION_NAME,
    )
    asyncio.run(ingestion)
|