"""Convert Markdown strings to Docling documents and split them into token-bounded chunks."""
| import gc | |
| import tempfile | |
| from pathlib import Path | |
| from docling.document_converter import DocumentConverter | |
| from docling.chunking import HybridChunker | |
def convert_markdown_string_to_docling(md_string: str):
    """Convert a Markdown string into a Docling conversion result.

    Docling's converter operates on files/paths, so the string is written
    to a temporary ``.md`` file first, converted, and the file is removed
    afterwards.

    Args:
        md_string: The Markdown content to convert.

    Returns:
        The conversion result produced by ``DocumentConverter.convert``;
        the Docling document itself is available as ``result.document``.
    """
    converter = DocumentConverter()

    # delete=False so the file survives the `with` block and can be read by
    # the converter.  Encoding is pinned to UTF-8 so behavior does not depend
    # on the platform's default locale encoding (fixes mojibake on Windows).
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.md', delete=False, encoding='utf-8'
    ) as temp_file:
        temp_file.write(md_string)
        temp_path = Path(temp_file.name)

    try:
        return converter.convert(temp_path)
    finally:
        # Always remove the temp file, even if conversion raises.
        temp_path.unlink(missing_ok=True)
        gc.collect()  # optional: eagerly release converter intermediates
import tiktoken
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer

# Wrap tiktoken's gpt-4o encoding for use with Docling's chunking API.
# OpenAI tokenizers require an explicit context-window length.
_GPT4O_CONTEXT_WINDOW = 128 * 1024

tokenizer = OpenAITokenizer(
    tokenizer=tiktoken.encoding_for_model("gpt-4o"),
    max_tokens=_GPT4O_CONTEXT_WINDOW,
)
def chunk_docs(doc):
    """Split a Docling document into token-bounded chunks.

    Uses a ``HybridChunker`` driven by the module-level ``tokenizer``;
    undersized neighbouring chunks are merged (``merge_peers=True``).

    Args:
        doc: A Docling document to chunk.

    Returns:
        A list of the chunks produced by the chunker.
    """
    chunker = HybridChunker(
        tokenizer=tokenizer,
        max_tokens=8000,
        merge_peers=True,  # merge undersized sibling chunks
    )
    return list(chunker.chunk(dl_doc=doc))
def split_to_docling_chunks(md_string: str):
    """Convert a Markdown string to a Docling document and chunk it.

    Args:
        md_string: The Markdown content to split.

    Returns:
        A list of Docling chunks covering the document.
    """
    conversion = convert_markdown_string_to_docling(md_string=md_string)
    return chunk_docs(conversion.document)
if __name__ == "__main__":
    from pprint import pprint

    # Example usage: read Markdown from a local file and print its chunks.
    # Encoding pinned to UTF-8 so the demo behaves the same on every platform.
    with open('content.txt', 'r', encoding='utf-8') as f:
        md_content = f.read()

    pprint(split_to_docling_chunks(md_content))