# File: pdf_rag/src/ingestion.py
# Last commit: "Final Changes" (77d7fca) by LightRT
from pathlib import Path

from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from langchain_core.documents import Document
from transformers import AutoTokenizer
def ingestion_and_chunking(file_path: str) -> "list[Document]":
    """Convert a document with Docling and split it into LangChain documents.

    The file is parsed by ``DocumentConverter``, chunked by ``HybridChunker``
    using the ``sentence-transformers/all-MiniLM-L6-v2`` tokenizer, and every
    chunk is wrapped in a ``Document`` whose text is the contextualized chunk
    (the output of ``chunker.contextualize``).

    Args:
        file_path: Location of the document to ingest; passed unchanged to
            ``DocumentConverter.convert``.

    Returns:
        One ``Document`` per chunk, in the order the chunker yields them.
        Metadata keys:

        * ``source``: the origin filename recorded on the chunk, or the name
          component of ``file_path`` when the chunk carries no origin.
        * ``pages``: sorted, de-duplicated page numbers collected from the
          provenance entries of the chunk's document items.
        * ``section``: the first heading of the chunk, or ``None`` when the
          chunk has no headings.
    """
    converter = DocumentConverter()
    result = converter.convert(file_path)

    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    # NOTE(review): `chunk_size` and `overlap` do not appear among
    # HybridChunker's documented options (tokenizer / max_tokens /
    # merge_peers), so they are presumably ignored and the token limit comes
    # from the tokenizer. Confirm against the installed docling version and
    # switch to `max_tokens` if an explicit cap is intended. Left unchanged
    # here so chunk boundaries stay exactly as they are today.
    chunker = HybridChunker(
        merge_peers=True,
        chunk_size=800,
        overlap=200,
        tokenizer=tokenizer,
    )

    # Used only when a chunk has no origin metadata attached.
    fallback_source = Path(file_path).name

    docs = []
    for chunk in chunker.chunk(result.document):
        meta = chunk.meta
        # A set removes pages repeated across items; sorting makes the
        # resulting list deterministic.
        pages = sorted(
            {prov.page_no for item in meta.doc_items for prov in item.prov}
        )
        origin = meta.origin
        docs.append(
            Document(
                # Build the contextualized text directly instead of writing
                # it back onto the chunk object.
                page_content=chunker.contextualize(chunk),
                metadata={
                    "source": origin.filename if origin is not None else fallback_source,
                    "pages": pages,
                    "section": meta.headings[0] if meta.headings else None,
                },
            )
        )
    return docs