import os

from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
from rich.console import Console

console = Console()


def print_docling_chunks(file_path: str) -> None:
    """Chunk a document with Docling and print the first chunk's ``chunk_type``.

    The file is loaded through ``DoclingLoader`` in ``DOC_CHUNKS`` export mode
    using a ``HybridChunker`` built on the project's embedding-model tokenizer.

    Args:
        file_path: Path of the document to process.

    Returns:
        None. Results and diagnostics are written to the console. If the file
        does not exist, no chunks are produced, or the first chunk has no
        ``chunk_type`` metadata entry, a message is printed instead of raising.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    # 1. Setup Hybrid Chunker
    # Using the same embedding model as the rest of the project
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    console.print(f"[bold green]Loading tokenizer:[/] {EMBEDDING_MODEL}")
    tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
    # NOTE(review): 1000 tokens looks larger than the sequence length this
    # embedding model usually accepts -- confirm chunks are not truncated
    # when they are embedded downstream.
    chunker = HybridChunker(
        tokenizer=tokenizer,
        max_tokens=1000,  # Can be tuned
        # overlap=200,  # Can be tuned -- TODO confirm HybridChunker accepts it
    )

    # 2. Setup DoclingLoader as requested
    console.print(f"[bold green]Processing file:[/] {os.path.basename(file_path)}")
    loader = DoclingLoader(
        file_path=file_path,
        export_type=ExportType.DOC_CHUNKS,
        chunker=chunker,
    )

    # 3. Load and print chunks
    chunks = loader.load()

    # An empty or unparseable document yields no chunks; indexing [0] would
    # raise IndexError, so report it and stop.
    if not chunks:
        console.print("[bold yellow]No chunks were produced for this file.[/]")
        return

    first_metadata = chunks[0].metadata
    if "chunk_type" in first_metadata:
        console.print(first_metadata["chunk_type"])
    else:
        # The key is not guaranteed to be present; show what is available
        # rather than failing with KeyError. The key list is passed as a
        # separate argument so it is not parsed as console markup.
        console.print(
            "[bold yellow]No 'chunk_type' in first chunk metadata; available keys:[/]",
            list(first_metadata),
        )

    # Optional verbose dump of every chunk (kept disabled by the author):
    # console.print(f"\n[bold yellow]Total chunks found:[/] {len(chunks)}\n")
    # for i, chunk in enumerate(chunks):
    #     console.print(f"[bold cyan]--- Chunk {i+1} ---[/]")
    #     # Print a snippet of metadata and the content
    #     # console.print(f"[dim]Metadata:[/] {chunk.metadata}")
    #     console.print(chunk.page_content)
    #     console.print("-" * 40)


if __name__ == "__main__":
    # Test with one of the PDF documents
    sample_pdf = "data/SBC_SILVER_SilverShield.pdf"
    print_docling_chunks(sample_pdf)