Spaces:

shivvamm
/

chatdocs

Sleeping

shivvamm

Deploy ChatDocs main server (single-container demo)

b496a3b 27 days ago

2.89 kB

	"""Synchronous, in-process document ingestion.

	Used for single-container deployments (e.g. Hugging Face Spaces) where the
	RabbitMQ + processing_server worker is not available. Mirrors the PDF path of
	processing_server/consumer.py but reads Qdrant connection details from env so
	it can talk to a managed Qdrant Cloud cluster.
	"""

	import os
	import logging
	from uuid import uuid4

	import pymupdf4llm
	from langchain_core.documents import Document
	from langchain_pinecone import PineconeEmbeddings
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_qdrant import QdrantVectorStore
	from qdrant_client import QdrantClient
	from qdrant_client.models import Distance, VectorParams

	logger = logging.getLogger(__name__)

	# Qdrant connection settings are read from the environment once, at import time.
	QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
	QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")  # stays None when unset (local Qdrant)
	# Vector size used when creating a collection; matches multilingual-e5-large.
	EMBEDDING_DIM = 1024


	def get_qdrant_client(timeout: int = 120) -> QdrantClient:
	    """Build a Qdrant client from the module-level connection settings.

	    Args:
	        timeout: Value forwarded as the client's ``timeout`` argument.

	    Returns:
	        A ``QdrantClient`` constructed with ``QDRANT_URL`` and
	        ``QDRANT_API_KEY``.
	    """
	    connection = {
	        "url": QDRANT_URL,
	        "api_key": QDRANT_API_KEY,
	        "timeout": timeout,
	    }
	    return QdrantClient(**connection)


	def _chunk(document: Document, chunk_size: int = 600, chunk_overlap: int = 60):
	    """Split a single document into overlapping chunks.

	    Args:
	        document: The document to split.
	        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
	        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

	    Returns:
	        Whatever ``split_documents`` yields for the one-element input list.
	    """
	    return RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    ).split_documents([document])


	def ingest_pdf_files(file_paths, collection_name: str) -> int:
	    """Extract, chunk, embed and upsert PDFs into a per-company Qdrant collection.

	    The collection is created (cosine distance, ``EMBEDDING_DIM``-sized
	    vectors) when it does not exist yet. Each file is converted to Markdown,
	    split with ``_chunk`` and upserted in batches under freshly generated
	    UUID ids.

	    Each source file is deleted after a successful ingest (the disk is
	    ephemeral on HF Spaces). A failed delete is logged as a warning and does
	    not abort the run. Extraction, embedding and upsert errors are not caught
	    here and propagate to the caller.

	    Args:
	        file_paths: Iterable of PDF file paths; each one is passed to
	            ``pymupdf4llm.to_markdown`` and then to ``os.remove``.
	        collection_name: Name of the target Qdrant collection.

	    Returns:
	        The total number of chunks stored across all files.
	    """
	    batch_size = 100  # chunks sent per add_documents call

	    embeddings = PineconeEmbeddings(model="multilingual-e5-large")
	    client = get_qdrant_client()

	    if not client.collection_exists(collection_name):
	        logger.info("Creating Qdrant collection: %s", collection_name)
	        client.create_collection(
	            collection_name=collection_name,
	            vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
	        )

	    vector_store = QdrantVectorStore(
	        client=client, collection_name=collection_name, embedding=embeddings
	    )

	    total_chunks = 0
	    for path in file_paths:
	        logger.info("Ingesting file: %s", path)
	        markdown = pymupdf4llm.to_markdown(path)
	        document = Document(page_content=markdown, metadata={"source": "Documents"})
	        chunks = _chunk(document)
	        logger.info("Generated %d chunks from %s", len(chunks), path)

	        for start in range(0, len(chunks), batch_size):
	            batch = chunks[start : start + batch_size]
	            ids = [str(uuid4()) for _ in batch]
	            vector_store.add_documents(documents=batch, ids=ids)

	        total_chunks += len(chunks)

	        # Cleanup is best-effort: the chunks are already stored, so a failed
	        # delete must not fail the ingest, but it should be visible in the logs.
	        try:
	            os.remove(path)
	        except OSError as exc:
	            logger.warning("Could not delete ingested file %s: %s", path, exc)

	    logger.info("Ingested %d total chunks into '%s'", total_chunks, collection_name)
	    return total_chunks