"""
PDF → Markdown extraction using Docling.

Docling preserves complex financial table structures by converting
PDF content into clean Markdown, solving cell-merging and missing-row
issues that plague basic text extractors like PyPDF2.
"""

import os
from typing import Optional

from docling.document_converter import DocumentConverter


# Module-level cache for the shared converter. It stays None until
# _get_converter() builds the instance on first use, which avoids
# re-initializing the heavy converter on every call.
_converter: Optional[DocumentConverter] = None


def _get_converter() -> DocumentConverter:
    """
    Return the shared Docling DocumentConverter, building it on first use.

    The instance is kept in the module-level ``_converter`` variable, so the
    constructor runs only on the first call; every later call returns the
    stored object without printing anything.
    """
    global _converter

    # Fast path: the converter was already built by an earlier call.
    if _converter is not None:
        return _converter

    print("  [Docling] Initializing DocumentConverter...")
    _converter = DocumentConverter()
    print("  [Docling] Converter ready.")
    return _converter


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Convert a PDF file to a Markdown string using Docling.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The Markdown text exported from the converted document.

    Raises:
        FileNotFoundError: If pdf_path does not point to an existing file
            (the path is missing, or it is a directory).
        ValueError: If Docling produces no content (the exported Markdown
            is empty or whitespace-only).
    """
    # isfile rather than exists: a directory path "exists" but is not a PDF,
    # so reject it here with the documented error instead of letting it fail
    # later inside the converter.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    # Shared converter; only the first call pays the initialization cost.
    converter = _get_converter()

    print(f"  [Docling] Converting: {os.path.basename(pdf_path)} → Markdown")
    result = converter.convert(pdf_path)
    markdown_text = result.document.export_to_markdown()

    # Whitespace-only output is treated the same as no output at all.
    if not markdown_text.strip():
        raise ValueError(
            f"No content extracted from {pdf_path}. "
            "The PDF may be image-only or corrupted."
        )

    print(f"  [Docling] Extracted {len(markdown_text):,} characters of Markdown.")
    return markdown_text


def extract_text_from_directory(dir_path: str, max_files: Optional[int] = None) -> list:
    """
    Extract Markdown from all PDFs in a directory.

    Files are selected by a case-insensitive ``.pdf`` suffix and processed in
    sorted filename order, so the subset chosen by max_files is deterministic.
    A file whose extraction raises is reported with a ``[SKIP]`` line and
    left out of the result; the remaining files are still processed.

    Args:
        dir_path: Path to directory containing PDFs.
        max_files: Maximum number of files to process. None (the default)
            means no limit; 0 processes no files.

    Returns:
        List of dicts with 'filename', 'text' and 'char_count' keys, one per
        successfully extracted PDF.

    Raises:
        OSError: If dir_path cannot be listed; the os.listdir call is not
            guarded, so its errors propagate to the caller.
    """
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs (and the max_files cut-off) always see the same files.
    pdf_files = sorted(
        f for f in os.listdir(dir_path) if f.lower().endswith('.pdf')
    )

    # Compare against None so that max_files=0 means "process nothing"
    # rather than being mistaken for "no limit".
    if max_files is not None:
        pdf_files = pdf_files[:max_files]

    results = []
    for filename in pdf_files:
        filepath = os.path.join(dir_path, filename)
        try:
            text = extract_text_from_pdf(filepath)
        except Exception as e:
            # Best-effort batch: one unreadable PDF must not abort the rest.
            print(f"  [SKIP] {filename}: {e}")
            continue

        results.append({
            "filename": filename,
            "text": text,
            "char_count": len(text),
        })

    print(f"Successfully extracted text from {len(results)}/{len(pdf_files)} PDFs")
    return results