"""Synchronous, in-process document ingestion. Used for single-container deployments (e.g. Hugging Face Spaces) where the RabbitMQ + processing_server worker is not available. Mirrors the PDF path of processing_server/consumer.py but reads Qdrant connection details from env so it can talk to a managed Qdrant Cloud cluster. """ import os import logging from uuid import uuid4 import pymupdf4llm from langchain_core.documents import Document from langchain_pinecone import PineconeEmbeddings from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_qdrant import QdrantVectorStore from qdrant_client import QdrantClient from qdrant_client.models import Distance, VectorParams logger = logging.getLogger(__name__) QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333") QDRANT_API_KEY = os.getenv("QDRANT_API_KEY") # None for a local Qdrant EMBEDDING_DIM = 1024 # multilingual-e5-large def get_qdrant_client(timeout: int = 120) -> QdrantClient: return QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY, timeout=timeout) def _chunk(document: Document, chunk_size: int = 600, chunk_overlap: int = 60): splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap ) return splitter.split_documents([document]) def ingest_pdf_files(file_paths, collection_name: str) -> int: """Extract, chunk, embed and upsert PDFs into a per-company Qdrant collection. Returns the total number of chunks stored. Deletes each source file after a successful ingest (the disk is ephemeral on HF Spaces). """ embeddings = PineconeEmbeddings(model="multilingual-e5-large") client = get_qdrant_client() if not client.collection_exists(collection_name): logger.info("Creating Qdrant collection: %s", collection_name) client.create_collection( collection_name=collection_name, vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE), ) vector_store = QdrantVectorStore( client=client, collection_name=collection_name, embedding=embeddings ) total_chunks = 0 for path in file_paths: logger.info("Ingesting file: %s", path) markdown = pymupdf4llm.to_markdown(path) document = Document(page_content=markdown, metadata={"source": "Documents"}) chunks = _chunk(document) logger.info("Generated %d chunks from %s", len(chunks), path) for i in range(0, len(chunks), 100): batch = chunks[i : i + 100] ids = [str(uuid4()) for _ in batch] vector_store.add_documents(documents=batch, ids=ids) total_chunks += len(chunks) try: os.remove(path) except OSError: pass logger.info("Ingested %d total chunks into '%s'", total_chunks, collection_name) return total_chunks