"""pipeline/pdf_processor.py — PDF text extraction + OCR fallback."""
import re
from pathlib import Path
from typing import Generator
from dataclasses import dataclass
import fitz, pdfplumber, pytesseract
from pdf2image import convert_from_path
from loguru import logger
import utils.config as cfg


@dataclass
class Chunk:
    """A contiguous run of page text taken from a single source file."""

    # Name of the file the text was taken from.
    source_file: str
    # First page number covered by this chunk.
    page_start: int
    # Last page number covered by this chunk.
    page_end: int
    # The chunk's text content.
    text: str
    # Whether any of the text came from OCR.
    was_ocr: bool = False

    @property
    def word_count(self) -> int:
        """Return the number of whitespace-separated tokens in ``text``."""
        tokens = self.text.split()
        return len(tokens)


class PDFProcessor:
    """Extract per-page text from a PDF and group it into word-bounded chunks.

    Every page is read with both PyMuPDF and pdfplumber and the longer result
    is kept. Pages whose best text is shorter than ``MIN_CHARS`` are also run
    through Tesseract OCR; the OCR text is used only when it is longer than
    what the extractors produced.
    """

    # Pages with fewer extracted characters than this are sent to OCR.
    MIN_CHARS   = 80
    # Word count at which a chunk is closed; half of cfg.MAX_TOKENS_PER_CHUNK.
    CHUNK_WORDS = cfg.MAX_TOKENS_PER_CHUNK // 2
    # Upper bound on pages rendered to images per pdf2image call, so a long
    # scanned document is never held in memory as images all at once.
    OCR_BATCH_PAGES = 8

    def process(self, pdf_path: Path) -> Generator[Chunk, None, None]:
        """Yield the text chunks of ``pdf_path`` in page order.

        Args:
            pdf_path: Location of the PDF. A plain string is accepted too and
                converted to ``Path``.

        Yields:
            ``Chunk`` objects whose ``source_file`` is the file's base name.
        """
        pdf_path = Path(pdf_path)
        pages = self._extract_pages(pdf_path)
        yield from self._chunk(pages, pdf_path.name)

    def _extract_pages(self, path) -> list:
        """Return ``[(page_number, text, was_ocr), ...]`` with 1-based pages.

        Extraction is best-effort: a failing extractor is logged and treated
        as having produced no text, it never raises to the caller.
        """
        mu = self._extract_pymupdf(path)
        pl = self._extract_pdfplumber(path)
        # Both dicts are keyed 1..n, so their sizes double as page counts.
        # The floor of 1 keeps page 1 as an OCR candidate when neither
        # extractor produced any page at all.
        total = max(len(mu), len(pl), 1)
        results = []
        ocr_needed = []
        for pnum in range(1, total + 1):
            # Longest text wins; pdfplumber is kept on a tie.
            best = max(pl.get(pnum, ""), mu.get(pnum, ""), key=len)
            results.append((pnum, best, False))
            if len(best) < self.MIN_CHARS:
                ocr_needed.append(pnum)
        if ocr_needed:
            ocr = self._ocr(path, ocr_needed)
            for pnum in ocr_needed:
                extracted = results[pnum - 1][1]
                ocr_text = ocr.get(pnum, "")
                # Only switch to OCR when it actually recovered more text;
                # an empty or shorter OCR result must not erase real text.
                if len(ocr_text) > len(extracted):
                    results[pnum - 1] = (pnum, ocr_text, True)
        return results

    def _extract_pymupdf(self, path) -> dict:
        """Return ``{page_number: cleaned_text}`` read with PyMuPDF.

        Pages read before a failure are kept; the failure itself is logged.
        """
        texts = {}
        try:
            doc = fitz.open(str(path))
            try:
                for pnum, pg in enumerate(doc, start=1):
                    texts[pnum] = self._clean(pg.get_text("text"))
            finally:
                # Close the document even when a page raises mid-way.
                doc.close()
        except Exception as e:
            logger.warning(f"PyMuPDF extraction failed for {path}: {e}")
        return texts

    def _extract_pdfplumber(self, path) -> dict:
        """Return ``{page_number: cleaned_text}`` read with pdfplumber.

        A page that fails to extract is recorded as an empty string so the
        page numbering stays contiguous.
        """
        texts = {}
        try:
            with pdfplumber.open(str(path)) as pdf:
                for pnum, pg in enumerate(pdf.pages, start=1):
                    try:
                        texts[pnum] = self._clean(pg.extract_text() or "")
                    except Exception as e:
                        logger.warning(f"pdfplumber failed on page {pnum} of {path}: {e}")
                        texts[pnum] = ""
        except Exception as e:
            logger.warning(f"pdfplumber extraction failed for {path}: {e}")
        return texts

    def _ocr(self, path, pages) -> dict:
        """OCR the given 1-based page numbers and return ``{page: text}``.

        Only the requested pages are rendered, in consecutive runs of at most
        ``OCR_BATCH_PAGES`` pages. On the first error the failure is logged
        and whatever was recognised so far is returned.
        """
        out = {}
        try:
            for first, last in self._page_runs(pages, self.OCR_BATCH_PAGES):
                imgs = convert_from_path(str(path), dpi=cfg.OCR_DPI,
                                         first_page=first, last_page=last)
                # zip stops early if fewer images than pages came back.
                for pnum, img in zip(range(first, last + 1), imgs):
                    raw = pytesseract.image_to_string(img, lang="eng", config="--psm 6")
                    out[pnum] = self._clean(raw)
        except Exception as e:
            logger.warning(f"OCR: {e}")
        return out

    @staticmethod
    def _page_runs(pages, max_len) -> list:
        """Group page numbers into ascending ``(first, last)`` runs.

        Each run covers consecutive page numbers and spans at most
        ``max_len`` pages. Duplicates are ignored; empty input gives ``[]``.
        """
        runs = []
        for pnum in sorted(set(pages)):
            if runs and pnum == runs[-1][1] + 1 and pnum - runs[-1][0] < max_len:
                runs[-1][1] = pnum
            else:
                runs.append([pnum, pnum])
        return [(first, last) for first, last in runs]

    def _chunk(self, pages, source):
        """Yield ``Chunk`` objects built from ``(page, text, was_ocr)`` tuples.

        Page texts are accumulated until the running word count reaches
        ``CHUNK_WORDS``. The page that closes a chunk also seeds the next
        one, so consecutive chunks overlap by that page. Pages without text
        are skipped, and ``page_start``/``page_end`` refer to the first and
        last page that contributed text to the chunk.
        """
        buf, words, any_ocr = [], 0, False
        p_start = p_end = None
        # True while buf holds text that no emitted chunk contains yet.
        pending = False
        for pnum, text, ocr in pages:
            if not text:
                continue
            if not buf:
                p_start = pnum
            buf.append(text)
            words += len(text.split())
            p_end = pnum
            any_ocr = any_ocr or bool(ocr)
            pending = True
            if words >= self.CHUNK_WORDS:
                yield Chunk(source, p_start, p_end, "\n\n".join(buf), any_ocr)
                # Carry the closing page over as the overlap for the next chunk.
                buf, words, p_start, any_ocr = [text], len(text.split()), pnum, bool(ocr)
                pending = False
        # Skip the tail when it is nothing but the already-emitted overlap page.
        if pending:
            yield Chunk(source, p_start, p_end, "\n\n".join(buf), any_ocr)

    @staticmethod
    def _clean(text) -> str:
        """Normalise whitespace in extracted text.

        Collapses runs of spaces/tabs, drops spaces next to line breaks,
        rejoins words hyphenated across a line break, limits consecutive
        line breaks to two and strips the ends. Falsy input gives ``""``.
        """
        if not text:
            return ""
        text = re.sub(r"[ \t]+", " ", text)
        # Remove spaces around newlines first so that "blank" lines holding
        # only spaces collapse and "word- \nnext" is still de-hyphenated.
        text = re.sub(r" ?\n ?", "\n", text)
        text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()