"""
LLM-based OCR using Groq vision model.
Supports: PDF (text + scanned), Images, DOCX, TXT
Arabic + English
"""

import io
import base64
from pathlib import Path


def extract_text_with_llm(file_bytes: bytes, filename: str, groq, vision_model: str) -> str:
    """Extract text from a file, choosing the strategy from its extension.

    Args:
        file_bytes: Raw content of the file.
        filename: Name of the file; only its suffix (compared
            case-insensitively) is used to pick the extractor.
        groq: Client object forwarded to the PDF and image extractors,
            which use it for vision-model OCR.
        vision_model: Model identifier forwarded together with ``groq``.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not one of the supported types
            (.txt, .docx, .pdf, .png, .jpg, .jpeg, .tiff, .bmp, .webp).
    """
    # The data URL sent to the vision model must declare the real format of
    # the bytes; labelling every upload as PNG misdescribes JPEG/TIFF/etc.
    image_media_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".tiff": "image/tiff",
        ".bmp": "image/bmp",
        ".webp": "image/webp",
    }

    ext = Path(filename).suffix.lower()

    if ext == ".txt":
        return _extract_txt(file_bytes)

    if ext == ".docx":
        return _extract_docx(file_bytes)

    if ext == ".pdf":
        return _extract_pdf(file_bytes, groq, vision_model)

    media_type = image_media_types.get(ext)
    if media_type is not None:
        return _llm_ocr_image(file_bytes, media_type, groq, vision_model)

    raise ValueError(f"Unsupported file type: {ext}")


# ─── TXT ─────────────────────────────────────────────────
def _extract_txt(file_bytes: bytes) -> str:
    """Decode plain-text bytes to a string.

    UTF-8 is tried first; a leading UTF-8 byte-order mark is dropped so it
    does not leak into the extracted text. Bytes that are not valid UTF-8
    are decoded as Latin-1 instead.

    Args:
        file_bytes: Raw content of the text file.

    Returns:
        The decoded text.
    """
    try:
        # "utf-8-sig" behaves like "utf-8" but strips a leading BOM if present.
        return file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # Latin-1 maps every byte value to a character, so this cannot fail
        # (and needs no error handler); non-Latin text will be garbled.
        return file_bytes.decode("latin-1")


# ─── DOCX ────────────────────────────────────────────────
def _extract_docx(file_bytes: bytes) -> str:
    """Return the non-blank paragraphs of a DOCX file, one per line.

    Args:
        file_bytes: Raw content of the .docx file.

    Returns:
        The text of every entry in the document's ``paragraphs`` whose text
        is not empty or whitespace-only, joined with newlines.
    """
    import docx

    document = docx.Document(io.BytesIO(file_bytes))

    kept = []
    for paragraph in document.paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)

    return "\n".join(kept)


# ─── PDF ─────────────────────────────────────────────────
def _extract_pdf(file_bytes: bytes, groq, vision_model: str) -> str:
    """Extract text from a PDF, falling back to LLM OCR for scanned files.

    The embedded text layer of every page is read first. Only when the
    whole document yields no non-whitespace text is it treated as scanned
    and every page sent through vision-model OCR.

    Args:
        file_bytes: Raw content of the PDF.
        groq: Client object forwarded to the OCR fallback.
        vision_model: Model identifier forwarded to the OCR fallback.

    Returns:
        The text of each page followed by a newline when a text layer
        exists, otherwise the OCR result.
    """
    import fitz  # PyMuPDF

    # The context manager closes the document even if reading a page raises.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        text = "".join(page.get_text("text") + "\n" for page in doc)

    # If no text found → scanned PDF → use LLM OCR on each page image
    if not text.strip():
        text = _ocr_pdf_pages_with_llm(file_bytes, groq, vision_model)

    return text


def _ocr_pdf_pages_with_llm(file_bytes: bytes, groq, vision_model: str) -> str:
    """OCR every page of a PDF by rendering it to PNG and asking the LLM.

    Args:
        file_bytes: Raw content of the PDF.
        groq: Client object forwarded to the image OCR call.
        vision_model: Model identifier forwarded to the image OCR call.

    Returns:
        The OCR text of all pages, in page order, separated by blank lines.
    """
    import fitz  # PyMuPDF

    # Same render scale for every page, so build the matrix once.
    zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom = ~144 DPI
    texts = []

    # The context manager closes the document even if an OCR request raises
    # part-way through the pages.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        for page in doc:
            img_bytes = page.get_pixmap(matrix=zoom).tobytes("png")
            texts.append(_llm_ocr_image(img_bytes, "image/png", groq, vision_model))

    return "\n\n".join(texts)


# ─── LLM Vision OCR ──────────────────────────────────────
def _llm_ocr_image(image_bytes: bytes, media_type: str, groq, vision_model: str) -> str:
    """Ask the vision model to transcribe the text in a single image.

    The image is embedded in the request as a base64 ``data:`` URL and sent
    together with an extraction prompt in one user message, with
    ``temperature=0``.

    Args:
        image_bytes: Raw encoded image content.
        media_type: Media type written into the data URL, e.g. "image/png".
        groq: Client object exposing ``chat.completions.create(...)``.
        vision_model: Model identifier passed as ``model``.

    Returns:
        The content of the first choice with surrounding whitespace
        stripped, or an empty string if the model returned no content.
    """
    b64 = base64.standard_b64encode(image_bytes).decode("utf-8")

    response = groq.chat.completions.create(
        model=vision_model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{b64}"
                        },
                    },
                    {
                        "type": "text",
                        "text": (
                            "Extract ALL text from this image exactly as written. "
                            "Support both Arabic (right-to-left) and English text. "
                            "Preserve paragraphs and line breaks. "
                            "Return ONLY the extracted text, nothing else."
                        ),
                    },
                ],
            }
        ],
        temperature=0,
    )

    # The message content may be None (e.g. an empty completion); treat that
    # as "no text found" instead of failing on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()