# Nagendravarma
# Deployable release for Hugging Face Spaces with LFS database tracking
# ad8c65e
# Raw
# History Blame Contribute Delete
# 1.75 kB
import os
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
from rich.console import Console
# Module-level rich console used for the styled (markup) status output in this script.
console = Console()
def print_docling_chunks(file_path):
    """Chunk a document with Docling and print the first chunk's ``chunk_type``.

    The file is loaded through ``DoclingLoader`` in ``DOC_CHUNKS`` export mode,
    using a ``HybridChunker`` built on the tokenizer of the embedding model
    named in the body.

    Args:
        file_path: Path of the document to process.

    Returns:
        None. Everything is written to the console. When the file does not
        exist, when the loader yields no chunks, or when the first chunk has
        no ``"chunk_type"`` metadata entry, a message is printed and the
        function returns instead of raising.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    # 1. Set up the hybrid chunker.
    # Using the same embedding model as the rest of the project.
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    console.print(f"[bold green]Loading tokenizer:[/] {embedding_model}")
    tokenizer = AutoTokenizer.from_pretrained(embedding_model)
    # NOTE(review): max_tokens=1000 may exceed the input length this embedding
    # model accepts -- confirm against the model card before relying on it.
    chunker = HybridChunker(
        tokenizer=tokenizer,
        max_tokens=1000,  # Can be tuned
        # overlap=200,  # Can be tuned
    )

    # 2. Set up the DoclingLoader so it emits one document per chunk.
    console.print(f"[bold green]Processing file:[/] {os.path.basename(file_path)}")
    loader = DoclingLoader(
        file_path=file_path,
        export_type=ExportType.DOC_CHUNKS,
        chunker=chunker,
    )

    # 3. Load the chunks and report on the first one.
    chunks = loader.load()
    if not chunks:
        # Guard: indexing chunks[0] on an empty result would raise IndexError.
        console.print("[bold yellow]No chunks were produced for this file.[/]")
        return

    first_metadata = chunks[0].metadata
    if "chunk_type" not in first_metadata:
        # Guard: a missing key would raise KeyError; list what is available.
        # NOTE(review): presumably "chunk_type" is set by a custom metadata
        # extractor elsewhere in the project -- confirm.
        available_keys = ", ".join(str(key) for key in first_metadata)
        console.print(
            "[bold yellow]First chunk has no 'chunk_type' metadata.[/] "
            f"Available keys: {available_keys}"
        )
        return

    console.print(first_metadata["chunk_type"])
    # To inspect every chunk instead, iterate over `chunks` and print each
    # chunk's `page_content` (and `metadata`) here.
if __name__ == "__main__":
    # Script entry point: run the chunk printer on one of the PDF documents.
    print_docling_chunks("data/SBC_SILVER_SilverShield.pdf")