from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
from langchain_core.documents import Document


def ingestion_and_chunking(file_path: str) -> list[Document]:
    """Convert a file with docling and return its chunks as LangChain documents.

    The file is converted with ``DocumentConverter``, split with a
    ``HybridChunker`` driven by the ``sentence-transformers/all-MiniLM-L6-v2``
    tokenizer, and every chunk is passed through ``chunker.contextualize``
    before being wrapped in a ``Document``.

    Args:
        file_path: Location of the document, handed to
            ``DocumentConverter.convert`` unchanged.

    Returns:
        One ``Document`` per chunk. ``page_content`` is the contextualized
        chunk text; ``metadata`` holds:

        * ``"source"``: ``chunk.meta.origin.filename``, or ``None`` when the
          chunk carries no origin.
        * ``"pages"``: sorted, de-duplicated ``page_no`` values collected from
          the provenance entries of the chunk's doc items.
        * ``"section"``: the first entry of ``chunk.meta.headings``, or
          ``None`` when there are no headings.
    """
    converter = DocumentConverter()
    result = converter.convert(file_path)

    tokenizer = AutoTokenizer.from_pretrained(
        "sentence-transformers/all-MiniLM-L6-v2"
    )
    # NOTE(review): `chunk_size` and `overlap` do not appear among
    # HybridChunker's documented options (tokenizer, max_tokens, merge_peers).
    # They may be silently ignored by the installed docling version -- confirm,
    # and switch to `max_tokens` if a token limit is what is intended.
    chunker = HybridChunker(
        merge_peers=True,
        chunk_size=800,
        overlap=200,
        tokenizer=tokenizer,
    )

    docs = []
    for chunk in chunker.chunk(result.document):
        meta = chunk.meta

        # A set removes pages referenced by several items of the same chunk.
        pages = sorted(
            {prov.page_no for item in meta.doc_items for prov in item.prov}
        )

        # Guard against a missing origin instead of raising AttributeError,
        # mirroring how a missing section is reported as None.
        origin = meta.origin
        source = origin.filename if origin is not None else None

        headings = meta.headings
        section = headings[0] if headings else None

        docs.append(
            Document(
                # Use the contextualized text directly; the chunk object
                # itself does not need to be mutated.
                page_content=chunker.contextualize(chunk),
                metadata={
                    "source": source,
                    "pages": pages,
                    "section": section,
                },
            )
        )

    return docs