# Image_generation / docling_chunker_fixed.py
# Author: manasdhir — rev 5d1cbd9 ("minor changes")
import gc
import tempfile
from pathlib import Path
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
def convert_markdown_string_to_docling(md_string: str):
    """Convert a Markdown string into a Docling conversion result.

    Docling's converter operates on files, so the string is first written
    to a temporary ``.md`` file, which is always removed afterwards.

    Args:
        md_string: Raw Markdown content to convert.

    Returns:
        The Docling conversion result; callers access ``.document`` for
        the converted document object.
    """
    converter = DocumentConverter()
    # delete=False so the file persists after the `with` block closes it;
    # the converter must reopen it by path. encoding pinned to UTF-8 so
    # non-ASCII Markdown does not fail on platforms whose default locale
    # encoding is not UTF-8 (e.g. Windows cp1252).
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.md', delete=False, encoding='utf-8'
    ) as temp_file:
        temp_file.write(md_string)
        temp_path = Path(temp_file.name)
    try:
        # Convert the temporary Markdown file
        return converter.convert(temp_path)
    finally:
        # Clean up the temp file even if conversion raises
        temp_path.unlink(missing_ok=True)
        gc.collect()  # Optional: force collection of large docling objects
# NOTE(review): third-party imports sit mid-file in the original; the
# tokenizer below is built once at import time and shared by chunk_docs().
import tiktoken
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
# Wrap tiktoken's gpt-4o encoding in Docling's OpenAI tokenizer adapter.
tokenizer = OpenAITokenizer(
    tokenizer=tiktoken.encoding_for_model("gpt-4o"),
    max_tokens=128 * 1024, # context window length required for OpenAI tokenizers
)
def chunk_docs(doc):
    """Split a Docling document into token-bounded chunks.

    Uses the module-level ``tokenizer`` with a ``HybridChunker`` capped at
    8000 tokens per chunk; ``merge_peers`` folds undersized neighbouring
    chunks together.

    Args:
        doc: A converted Docling document.

    Returns:
        A list of Docling chunk objects.
    """
    hybrid = HybridChunker(
        tokenizer=tokenizer,
        max_tokens=8000,
        merge_peers=True,  # merge undersized sibling chunks into one
    )
    # Materialize the chunk iterator into a list before returning.
    return list(hybrid.chunk(dl_doc=doc))
def split_to_docling_chunks(md_string: str):
    """End-to-end helper: Markdown string -> list of Docling chunks.

    Converts the string via a temporary file, then chunks the resulting
    document with the module-level hybrid chunker.
    """
    conversion = convert_markdown_string_to_docling(md_string=md_string)
    return chunk_docs(conversion.document)
if __name__ == "__main__":
    # Smoke test: chunk the Markdown stored in content.txt and pretty-print
    # the resulting chunks. Replace content.txt with your own file.
    from pprint import pprint

    # encoding pinned to UTF-8 so non-ASCII Markdown reads correctly on
    # platforms whose default locale encoding is not UTF-8.
    with open('content.txt', 'r', encoding='utf-8') as f:
        md_content = f.read()
    pprint(split_to_docling_chunks(md_content))