import os
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
from rich.console import Console

# Shared Rich console; used below for markup-styled terminal output.
console = Console()

def print_docling_chunks(file_path: str) -> None:
    """Chunk a document with Docling and print the first chunk's ``chunk_type``.

    The file is loaded through ``DoclingLoader`` in ``ExportType.DOC_CHUNKS``
    mode, using a ``HybridChunker`` configured with the tokenizer of the
    embedding model named in the body.

    Args:
        file_path: Path of the document to process.

    Returns:
        None. All output goes to the terminal. If the path does not exist,
        the loader yields no chunks, or the first chunk has no
        ``"chunk_type"`` metadata entry, a diagnostic message is printed
        instead of raising.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    # 1. Setup Hybrid Chunker
    # Using the same embedding model as the rest of the project
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

    console.print(f"[bold green]Loading tokenizer:[/] {embedding_model}")
    tokenizer = AutoTokenizer.from_pretrained(embedding_model)

    chunker = HybridChunker(
        tokenizer=tokenizer,
        max_tokens=1000,  # Can be tuned
        # NOTE(review): confirm HybridChunker accepts `overlap` before enabling.
        # overlap=200,    # Can be tuned
    )

    # 2. Setup DoclingLoader as requested
    console.print(f"[bold green]Processing file:[/] {os.path.basename(file_path)}")
    loader = DoclingLoader(
        file_path=file_path,
        export_type=ExportType.DOC_CHUNKS,
        chunker=chunker,
    )

    # 3. Load and print chunks
    chunks = loader.load()
    if not chunks:
        # Indexing chunks[0] on an empty result would raise IndexError.
        console.print("[bold red]No chunks were produced for this file.[/]")
        return

    first_metadata = chunks[0].metadata
    if "chunk_type" in first_metadata:
        console.print(first_metadata["chunk_type"])
    else:
        # NOTE(review): the loader's default metadata may not include
        # "chunk_type" -- confirm which keys the installed version emits.
        # The key list is passed as a separate object so Rich does not parse
        # it as markup.
        console.print(
            "[bold red]First chunk has no 'chunk_type' metadata; available keys:[/]",
            list(first_metadata),
        )

    # console.print(f"\n[bold yellow]Total chunks found:[/] {len(chunks)}\n")

    # for i, chunk in enumerate(chunks):
    #     console.print(f"[bold cyan]--- Chunk {i+1} ---[/]")
    #     # Print a snippet of metadata and the content
    #     # console.print(f"[dim]Metadata:[/] {chunk.metadata}")
    #     console.print(chunk.page_content)
    #     console.print("-" * 40)

if __name__ == "__main__":
    # Script entry point: run the chunk printer on one of the PDF documents.
    print_docling_chunks("data/SBC_SILVER_SilverShield.pdf")