# Image_generation / docling_chunker_fixed.py
# Author: manasdhir — rev 5d1cbd9 ("minor changes")
import gc
import tempfile
from pathlib import Path
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
def convert_markdown_string_to_docling(md_string: str):
    """Convert a Markdown string into a Docling conversion result.

    Docling's converter operates on files, so the string is first written
    to a temporary ``.md`` file, which is always removed afterwards.

    Args:
        md_string: Raw Markdown content to convert.

    Returns:
        The Docling conversion result; callers access ``.document`` for
        the converted document object.
    """
    converter = DocumentConverter()
    # delete=False so the file persists after the `with` block closes it;
    # the converter must reopen it by path. encoding pinned to UTF-8 so
    # non-ASCII Markdown does not fail on platforms whose default locale
    # encoding is not UTF-8 (e.g. Windows cp1252).
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.md', delete=False, encoding='utf-8'
    ) as temp_file:
        temp_file.write(md_string)
        temp_path = Path(temp_file.name)
    try:
        # Convert the temporary Markdown file
        return converter.convert(temp_path)
    finally:
        # Clean up the temp file even if conversion raises
        temp_path.unlink(missing_ok=True)
        gc.collect()  # Optional: force collection of large docling objects
# NOTE(review): third-party imports sit mid-file in the original; the
# tokenizer below is built once at import time and shared by chunk_docs().
import tiktoken
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
# Wrap tiktoken's gpt-4o encoding in Docling's OpenAI tokenizer adapter.
tokenizer = OpenAITokenizer(
    tokenizer=tiktoken.encoding_for_model("gpt-4o"),
    max_tokens=128 * 1024, # context window length required for OpenAI tokenizers
)
def chunk_docs(doc):
    """Split a Docling document into token-bounded chunks.

    Uses the module-level ``tokenizer`` with a ``HybridChunker`` capped at
    8000 tokens per chunk; ``merge_peers`` folds undersized neighbouring
    chunks together.

    Args:
        doc: A converted Docling document.

    Returns:
        A list of Docling chunk objects.
    """
    hybrid = HybridChunker(
        tokenizer=tokenizer,
        max_tokens=8000,
        merge_peers=True,  # merge undersized sibling chunks into one
    )
    # Materialize the chunk iterator into a list before returning.
    return list(hybrid.chunk(dl_doc=doc))
def split_to_docling_chunks(md_string: str):
    """End-to-end helper: Markdown string -> list of Docling chunks.

    Converts the string via a temporary file, then chunks the resulting
    document with the module-level hybrid chunker.
    """
    conversion = convert_markdown_string_to_docling(md_string=md_string)
    return chunk_docs(conversion.document)
if __name__ == "__main__":
    # Smoke test: chunk the Markdown stored in content.txt and pretty-print
    # the resulting chunks. Replace content.txt with your own file.
    from pprint import pprint

    # encoding pinned to UTF-8 so non-ASCII Markdown reads correctly on
    # platforms whose default locale encoding is not UTF-8.
    with open('content.txt', 'r', encoding='utf-8') as f:
        md_content = f.read()
    pprint(split_to_docling_chunks(md_content))