# File size: 1,256 Bytes | 9cc7f8d
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
from langchain_core.documents import Document
def ingestion_and_chunking(
    file_path: str,
    *,
    embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
    max_tokens: "int | None" = None,
) -> list[Document]:
    """Convert a document with Docling and return its chunks as LangChain documents.

    The file is converted with ``DocumentConverter``, split with
    ``HybridChunker`` (peer merging enabled) using the tokenizer of
    ``embed_model_id``, and every chunk is wrapped in a ``Document`` whose
    text is the chunker's contextualized rendering of that chunk.

    Args:
        file_path: Location of the document handed to
            ``DocumentConverter.convert``.
        embed_model_id: Hugging Face model id whose tokenizer is used to
            measure chunk length.
        max_tokens: Optional token budget per chunk. When ``None`` the
            argument is not passed and the chunker's own default applies.

    Returns:
        One ``Document`` per chunk. Its metadata holds:
        ``"source"`` (the origin filename recorded on the chunk),
        ``"pages"`` (sorted, de-duplicated page numbers taken from the
        provenance of the chunk's items) and ``"section"`` (the first
        heading of the chunk, or ``None`` when it has no headings).
    """
    converter = DocumentConverter()
    conversion = converter.convert(file_path)

    tokenizer = AutoTokenizer.from_pretrained(embed_model_id)

    # NOTE(review): this function used to pass ``chunk_size=800`` and
    # ``overlap=200``. Per the docling-core API, HybridChunker has no such
    # options (its token budget is ``max_tokens`` and it has no overlap
    # setting), so those values never influenced chunking. They were removed
    # rather than mapped onto ``max_tokens`` so the effective chunk size does
    # not change silently; pass ``max_tokens`` explicitly to set a budget.
    chunker_options = {"tokenizer": tokenizer, "merge_peers": True}
    if max_tokens is not None:
        chunker_options["max_tokens"] = max_tokens
    chunker = HybridChunker(**chunker_options)

    docs = []
    # Single pass: the chunk iterator is consumed directly instead of being
    # materialised and walked twice.
    for chunk in chunker.chunk(conversion.document):
        meta = chunk.meta
        # A chunk can aggregate several items, each with several provenance
        # entries; the set comprehension collapses repeated page numbers.
        pages = sorted(
            {prov.page_no for item in meta.doc_items for prov in item.prov}
        )
        docs.append(
            Document(
                # Contextualized text rather than the raw chunk text.
                page_content=chunker.contextualize(chunk),
                metadata={
                    "source": meta.origin.filename,
                    "pages": pages,
                    "section": meta.headings[0] if meta.headings else None,
                },
            )
        )
    return docs