Spaces:
Sleeping
Sleeping
| import os | |
| import argparse | |
| from pathlib import Path | |
| import markdown | |
| from bs4 import BeautifulSoup | |
| from qdrant_client import QdrantClient | |
| from qdrant_client.models import Distance, VectorParams, PointStruct | |
| from dotenv import load_dotenv | |
| # Add these to enable relative imports | |
| import sys | |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | |
| from app.services.embeddings_service import EmbeddingsService, GeminiEmbeddingsService | |
| from app.qdrant_client import get_qdrant_client | |
| from app.config import settings | |
| load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env") | |
| QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "docs_collection") | |
def load_mdx_content(filepath: Path) -> str:
    """Read an MDX file and return its visible plain-text content.

    MDX is essentially Markdown, so the file is rendered to HTML with the
    ``markdown`` package and the text is extracted from the markup via
    BeautifulSoup (tags stripped, text nodes kept).

    Args:
        filepath: Path to the ``.mdx`` file to read (decoded as UTF-8).

    Returns:
        The concatenated text content of the rendered document.
    """
    raw = filepath.read_text(encoding='utf-8')
    rendered = markdown.markdown(raw)
    return BeautifulSoup(rendered, 'html.parser').get_text()
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
    """Split *text* into overlapping chunks for embedding.

    Consecutive chunks share ``overlap`` characters so that sentences cut
    at a chunk boundary still appear intact in the next chunk.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum length of each chunk, in characters.
        overlap: Number of characters shared between consecutive chunks.
            Must be strictly smaller than ``chunk_size``.

    Returns:
        List of chunks covering all of *text*, each at most ``chunk_size``
        characters long.

    Raises:
        ValueError: If ``overlap >= chunk_size`` (the window would never
            advance, or ``range`` would reject a non-positive step).
    """
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    step = chunk_size - overlap
    chunks: list[str] = []
    for i in range(0, len(text), step):
        chunks.append(text[i:i + chunk_size])
        # Stop once a chunk reaches end-of-text; otherwise the next window
        # would be pure overlap, duplicating data already in this chunk.
        if i + chunk_size >= len(text):
            break
    return chunks
async def ingest_content(
    docs_path: Path,
    qdrant_client: QdrantClient,
    embeddings_service: EmbeddingsService,
    collection_name: str,
):
    """Embed every ``.mdx`` file under *docs_path* and load it into Qdrant.

    Recreates (i.e. wipes and rebuilds) the target collection, then walks
    the docs tree, chunks each file's text, embeds every chunk, and upserts
    the resulting points in batches of 100.

    Args:
        docs_path: Root directory searched recursively for ``*.mdx`` files.
        qdrant_client: Connected Qdrant client used for collection ops.
        embeddings_service: Service that turns a text chunk into a vector.
        collection_name: Name of the Qdrant collection to (re)create.
    """
    # Vector dimensionality must match the provider that produces the
    # embeddings: 768 for Gemini, 1536 for the OpenAI-style default.
    vector_size = 768 if isinstance(embeddings_service, GeminiEmbeddingsService) else 1536

    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    def _flush(batch: list[PointStruct]) -> None:
        # Synchronous upsert; wait=True blocks until the write is applied.
        qdrant_client.upsert(
            collection_name=collection_name,
            points=batch,
            wait=True,
        )

    pending: list[PointStruct] = []
    next_id = 0
    for mdx_file in docs_path.rglob("*.mdx"):
        print(f"Processing {mdx_file}")
        text = load_mdx_content(mdx_file)
        for piece in chunk_text(text):
            vector = await embeddings_service.create_embedding(piece)
            pending.append(
                PointStruct(
                    id=next_id,
                    vector=vector,
                    payload={
                        "content": piece,
                        # Store the path relative to the docs root so the
                        # payload is stable across machines.
                        "source": str(mdx_file.relative_to(docs_path)),
                    },
                )
            )
            next_id += 1
            if len(pending) >= 100:  # Batch upsert
                _flush(pending)
                pending = []

    if pending:  # Upsert remaining points
        _flush(pending)
    print(f"Ingestion complete. Total points: {next_id}")
| if __name__ == "__main__": | |
| parser = argparse.ArgumentParser(description="Ingest MDX content into Qdrant.") | |
| parser.add_argument( | |
| "--docs_path", | |
| type=str, | |
| default="../physical-ai-humanoid-robotics/docs/", | |
| help="Path to the directory containing MDX documentation files." | |
| ) | |
| args = parser.parse_args() | |
| qdrant_client = get_qdrant_client() | |
| # Choose the appropriate embedding service based on AI_PROVIDER setting | |
| if settings.AI_PROVIDER.lower() == "gemini": | |
| embeddings_service = GeminiEmbeddingsService() | |
| else: | |
| embeddings_service = EmbeddingsService() | |
| # Run the async ingestion | |
| import asyncio | |
| asyncio.run(ingest_content( | |
| docs_path=Path(args.docs_path), | |
| qdrant_client=qdrant_client, | |
| embeddings_service=embeddings_service, | |
| collection_name=QDRANT_COLLECTION_NAME | |
| )) |