# chatdocs/utils/ingest.py
# Last commit b496a3b by shivvamm:
#   Deploy ChatDocs main server (single-container demo)
# File size: 2.89 kB
"""Synchronous, in-process document ingestion.
Used for single-container deployments (e.g. Hugging Face Spaces) where the
RabbitMQ + processing_server worker is not available. Mirrors the PDF path of
processing_server/consumer.py but reads Qdrant connection details from env so
it can talk to a managed Qdrant Cloud cluster.
"""
import os
import logging
from uuid import uuid4
import pymupdf4llm
from langchain_core.documents import Document
from langchain_pinecone import PineconeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
# Module-scoped logger; handlers and levels are left to the application.
logger = logging.getLogger(__name__)

# Qdrant endpoint, read from the environment once at import time.
# Falls back to a Qdrant instance on localhost when QDRANT_URL is unset.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")

# API key for the Qdrant endpoint; stays None when unset (local Qdrant).
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

# Vector size used when creating collections (multilingual-e5-large).
EMBEDDING_DIM = 1024
def get_qdrant_client(timeout: int = 120) -> QdrantClient:
    """Build a Qdrant client from the module-level connection settings.

    Args:
        timeout: Value forwarded unchanged as the client's ``timeout``
            argument. Defaults to 120.

    Returns:
        A new ``QdrantClient`` pointed at ``QDRANT_URL`` and authenticated
        with ``QDRANT_API_KEY`` (which may be ``None``).
    """
    connection_kwargs = {
        "url": QDRANT_URL,
        "api_key": QDRANT_API_KEY,
        "timeout": timeout,
    }
    return QdrantClient(**connection_kwargs)
def _chunk(document: Document, chunk_size: int = 600, chunk_overlap: int = 60):
    """Split a single document into chunks with a recursive character splitter.

    Args:
        document: The document to split.
        chunk_size: ``chunk_size`` handed to the splitter. Defaults to 600.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to 60.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` produces
        for a one-element list containing ``document``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents([document])
def ingest_pdf_files(file_paths, collection_name: str, *, batch_size: int = 100) -> int:
    """Extract, chunk, embed and upsert PDFs into a per-company Qdrant collection.

    The target collection is created (cosine distance, ``EMBEDDING_DIM``
    dimensions) if it does not exist yet. Each file is converted to markdown,
    split into chunks and uploaded in batches, each chunk under a fresh UUID.
    After a file has been fully uploaded it is deleted from disk (the disk is
    ephemeral on HF Spaces); a failure to delete is logged and otherwise
    ignored. Errors raised during extraction, embedding or upload are not
    caught here, so they propagate and the file being processed is left in
    place.

    Args:
        file_paths: Iterable of paths to the PDF files to ingest.
        collection_name: Name of the Qdrant collection to write into.
        batch_size: Maximum number of chunks sent per ``add_documents`` call.
            Keyword-only; defaults to 100.

    Returns:
        The total number of chunks stored across all files.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate first: a non-positive step would make the batching loop upload
    # nothing (or fail late) while the source file would still be deleted.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = PineconeEmbeddings(model="multilingual-e5-large")
    client = get_qdrant_client()

    if not client.collection_exists(collection_name):
        logger.info("Creating Qdrant collection: %s", collection_name)
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
        )

    vector_store = QdrantVectorStore(
        client=client, collection_name=collection_name, embedding=embeddings
    )

    total_chunks = 0
    for path in file_paths:
        logger.info("Ingesting file: %s", path)
        markdown = pymupdf4llm.to_markdown(path)
        document = Document(page_content=markdown, metadata={"source": "Documents"})
        chunks = _chunk(document)
        logger.info("Generated %d chunks from %s", len(chunks), path)

        for start in range(0, len(chunks), batch_size):
            batch = chunks[start : start + batch_size]
            ids = [str(uuid4()) for _ in batch]
            vector_store.add_documents(documents=batch, ids=ids)
            # Count only what has actually been handed to the vector store.
            total_chunks += len(batch)

        # Best-effort cleanup: the chunks are already stored, so a failed
        # delete must not fail the ingest, but it should not vanish silently.
        try:
            os.remove(path)
        except OSError as exc:
            logger.warning("Could not delete ingested file %s: %s", path, exc)

    logger.info("Ingested %d total chunks into '%s'", total_chunks, collection_name)
    return total_chunks