File size: 1,256 Bytes
9cc7f8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
from langchain_core.documents import Document

def ingestion_and_chunking(file_path: str) -> list[Document]:
    """Convert a document with Docling and return its chunks as LangChain documents.

    The file is converted with ``DocumentConverter``, split with
    ``HybridChunker`` using the ``sentence-transformers/all-MiniLM-L6-v2``
    tokenizer, and every chunk is wrapped in a ``Document`` whose
    ``page_content`` is the text returned by ``chunker.contextualize``.

    Args:
        file_path: Location of the document, passed as-is to
            ``DocumentConverter.convert``.

    Returns:
        One ``Document`` per chunk, in chunker order. Each carries the
        metadata keys ``source`` (origin filename), ``pages`` (sorted unique
        page numbers of the chunk's provenance entries) and ``section``
        (first heading of the chunk, or ``None`` when it has no headings).
    """
    converter = DocumentConverter()
    result = converter.convert(file_path)

    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    # NOTE(review): this call used to pass `chunk_size=800` and `overlap=200`.
    # The documented HybridChunker options are `tokenizer`, `max_tokens` and
    # `merge_peers`; it has no overlap setting, and the two extra keywords
    # appear to have been silently ignored. They are dropped so the call
    # states what actually happens: the token limit is resolved from the
    # tokenizer. Pass `max_tokens=<n>` here if an explicit cap is wanted, and
    # keep it within what the embedding model accepts.
    chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True)

    docs = []
    for chunk in chunker.chunk(result.document):
        # Use the contextualized text (as produced by the chunker) for the
        # page content instead of the raw chunk text.
        text = chunker.contextualize(chunk)

        # A chunk may be backed by several document items, each with several
        # provenance entries; collapse them to a sorted list of unique pages.
        pages = sorted({
            prov.page_no
            for item in chunk.meta.doc_items
            for prov in item.prov
        })

        # `origin` is optional in the chunk metadata; fall back to the path
        # the caller supplied rather than failing with AttributeError.
        origin = chunk.meta.origin
        source = origin.filename if origin is not None else file_path

        headings = chunk.meta.headings
        docs.append(
            Document(
                page_content=text,
                metadata={
                    "source": source,
                    "pages": pages,
                    "section": headings[0] if headings else None,
                },
            )
        )

    return docs