# =============================================================================
# 📰 Newspaper Summarizer
# Scans all pages for headlines using font size, then ranks via LLM.
# No YOLO, no vision model, no OCR. Fast and standalone.
#
# Usage:
#   from summarizer import NewspaperSummarizer
#   summarizer = NewspaperSummarizer(api_key="...")
#   result = summarizer.summarize("newspaper.pdf")
# =============================================================================

import json
import time
import re
import fitz
from openai import OpenAI
import logging

from config import (
    LLM_BASE_URL, TEXT_MODEL,
    HEADLINE_MIN_FONT_SIZE, TOP_ARTICLES_COUNT,
    SUMMARY_PROMPT, SECTION_NAMES, SKIP_SECTIONS, SECTION_TIERS,
)

logger = logging.getLogger("newspaper_summarizer")


class NewspaperSummarizer:
    """Scans a newspaper PDF for headlines and generates importance-ranked summaries."""

    def __init__(self, api_key):
        """Create the LLM client used for headline ranking.

        Args:
            api_key: Credential handed to the OpenAI client; the client is
                pointed at the endpoint configured in LLM_BASE_URL.
        """
        client = OpenAI(api_key=api_key, base_url=LLM_BASE_URL)
        self.llm_client = client
        logger.info("✅ Summarizer initialized")

    def summarize(self, pdf_path):
        """
        Full pipeline: detect sections → scan headlines → rank via LLM.

        Args:
            pdf_path: Path of the newspaper PDF; passed unchanged to the
                section-detection and headline-scanning steps.

        Returns:
            dict that always contains
              - "important_articles": list of ranked articles ([] when no
                headlines were found or the LLM reply omitted the key),
              - "total_headlines_found": number of headlines that were scanned,
              - "sections": {page_num (1-indexed): section_name}.
            Any other keys present in the LLM reply are passed through.

        Raises:
            Nothing is caught here: exceptions from the three pipeline steps
            propagate to the caller.
        """
        sections = self._detect_page_sections(pdf_path)
        logger.info(f"Detected sections: {sections}")

        headlines = self._scan_headlines(pdf_path, sections)
        logger.info(f"Scanned {len(headlines)} headlines from PDF")

        # Nothing to rank: skip the LLM call entirely
        if not headlines:
            return {"important_articles": [], "total_headlines_found": 0, "sections": sections}

        ranked = self._rank_headlines(headlines)

        # The ranking payload is whatever JSON object the model produced, so
        # the documented key may be missing; guarantee it for callers.
        ranked.setdefault("important_articles", [])
        ranked["total_headlines_found"] = len(headlines)
        ranked["sections"] = sections
        return ranked

    def _detect_page_sections(self, pdf_path):
        """
        Detect section names from page headers.

        Only text in the top 12% of each page is examined. A span matches a
        section when one of SECTION_NAMES occurs in it as a case-insensitive
        substring; among matching spans the one with the largest font size
        decides the page's section (the first one seen wins a tie). Page 1
        falls back to "FRONT PAGE" when nothing matches; any other page
        without a match is left out of the result.

        Args:
            pdf_path: Path handed to fitz.open().

        Returns dict: {page_num (1-indexed): section_name}
        """
        # Lower-cased name -> name as configured, built once per call
        section_names_lower = {s.lower(): s for s in SECTION_NAMES}
        sections = {}

        # The context manager closes the document even when a page fails to
        # parse, so the file handle is not leaked on errors.
        with fitz.open(pdf_path) as doc:
            for page_num in range(doc.page_count):
                page = doc[page_num]

                # Scan top 12% of the page for section headers
                header_rect = fitz.Rect(0, 0, page.rect.width, page.rect.height * 0.12)
                blocks = page.get_text("dict", clip=header_rect)["blocks"]

                best_match = None
                best_size = 0

                for block in blocks:
                    if block.get("type") != 0:  # only text blocks carry spans
                        continue
                    for line in block.get("lines", []):
                        for span in line.get("spans", []):
                            size = span.get("size", 0)
                            # A span that cannot beat the current best is
                            # irrelevant; skip the string work for it.
                            if size <= best_size:
                                continue
                            text = span.get("text", "").lower()

                            # Check against known section names; the first
                            # configured name found in the span is taken.
                            for known_lower, known_original in section_names_lower.items():
                                if known_lower in text:
                                    best_match = known_original
                                    best_size = size
                                    break

                if best_match:
                    sections[page_num + 1] = best_match
                elif page_num == 0:
                    sections[1] = "FRONT PAGE"

        return sections

    def _get_tier(self, section_name):
        """Look up the tier of a section in SECTION_TIERS.

        The comparison ignores case. The first tier whose name list contains
        the section is returned; an empty/None section or a section that is
        listed nowhere yields the default tier 3.
        """
        if section_name:
            wanted = section_name.upper()
            for tier, tier_sections in SECTION_TIERS.items():
                if any(candidate.upper() == wanted for candidate in tier_sections):
                    return tier
        return 3

    def _scan_headlines(self, pdf_path, sections=None):
        """
        Extract headlines from all pages using font size detection.
        Includes section context from detected sections.
        Skips pages with low-value sections.

        A headline starts at any text span whose font size is at least
        HEADLINE_MIN_FONT_SIZE and absorbs the following spans while their
        size stays within 2 of the first span and each is vertically within 20
        of the span before it. Candidates shorter than 10 characters or
        matching a known non-article pattern are dropped. Up to 200 characters
        of smaller-font text from the next 15 spans become the snippet.

        Args:
            pdf_path: Path handed to fitz.open().
            sections: Optional {page_num (1-indexed): section_name}; pages
                missing from it are labelled "UNKNOWN".

        Returns:
            List of dicts with keys "page", "headline", "snippet",
            "font_size", "section" and "tier", de-duplicated per page
            (case-insensitively) and sorted by font size, largest first.
        """
        if sections is None:
            sections = {}

        # Lower-case both sides of the skip test so that the spelling used in
        # SKIP_SECTIONS cannot silently disable it.
        skip_sections = {name.lower() for name in SKIP_SECTIONS}

        # Common non-article text; compiled once instead of per headline
        skip_patterns = [
            re.compile(pattern, re.IGNORECASE)
            for pattern in (
                r"^the\s+hindu$",
                r"^page\s+\d+",
                r"^\d+$",
                r"^www\.",
                r"^https?://",
                r"continued\s+on",
                r"continued\s+from",
            )
        ]

        # Pass 1: read the spans of every relevant page. The context manager
        # closes the document even if a page fails to parse.
        scanned_pages = []  # (1-indexed page number, section, spans)
        with fitz.open(pdf_path) as doc:
            for page_num in range(doc.page_count):
                section = sections.get(page_num + 1, "UNKNOWN")

                # Skip low-value sections entirely
                if section.lower() in skip_sections:
                    continue

                page_texts = []
                for block in doc[page_num].get_text("dict")["blocks"]:
                    if block.get("type") != 0:  # type 0 = text block
                        continue
                    for line in block.get("lines", []):
                        for span in line.get("spans", []):
                            text = span.get("text", "").strip()
                            if not text:
                                continue
                            page_texts.append({
                                "text": text,
                                "size": round(span.get("size", 0), 1),
                                "y": round(span.get("origin", [0, 0])[1], 1),
                            })

                if page_texts:
                    scanned_pages.append((page_num + 1, section, page_texts))

        # Pass 2: group headline-sized spans and attach a body snippet
        headlines = []
        for page_number, section, page_texts in scanned_pages:
            tier = self._get_tier(section)
            i = 0
            while i < len(page_texts):
                span = page_texts[i]
                if span["size"] < HEADLINE_MIN_FONT_SIZE:
                    i += 1
                    continue

                # Collect consecutive headline-sized spans into one headline
                headline_parts = [span["text"]]
                headline_size = span["size"]
                j = i + 1
                while j < len(page_texts):
                    next_span = page_texts[j]
                    # Stop at a different font size or a large vertical jump
                    if (abs(next_span["size"] - headline_size) >= 2
                            or abs(next_span["y"] - page_texts[j - 1]["y"]) >= 20):
                        break
                    headline_parts.append(next_span["text"])
                    j += 1

                # Kept or rejected, scanning resumes after this group
                i = j

                headline_text = " ".join(headline_parts).strip()

                # Skip very short headlines (likely page numbers, labels)
                if len(headline_text) < 10:
                    continue
                if any(pattern.search(headline_text) for pattern in skip_patterns):
                    continue

                # Grab snippet: first ~200 chars of body text below this
                # headline. Headline-sized spans in the window are skipped,
                # not treated as the end of the article.
                snippet_parts = []
                snippet_len = 0
                for body_span in page_texts[j:j + 15]:
                    if body_span["size"] < HEADLINE_MIN_FONT_SIZE:
                        snippet_parts.append(body_span["text"])
                        snippet_len += len(body_span["text"]) + 1  # + joining space
                        if snippet_len >= 200:
                            break
                snippet = " ".join(snippet_parts)[:200].strip()

                headlines.append({
                    "page": page_number,
                    "headline": headline_text,
                    "snippet": snippet,
                    "font_size": headline_size,
                    "section": section,
                    "tier": tier,
                })

        # Deduplicate headlines (same text on same page)
        seen = set()
        unique = []
        for h in headlines:
            key = (h["page"], h["headline"].lower())
            if key not in seen:
                seen.add(key)
                unique.append(h)

        # Sort by font size descending (bigger headline = more important)
        unique.sort(key=lambda h: h["font_size"], reverse=True)

        return unique

    def _rank_headlines(self, headlines, max_retries=3):
        """Send headlines to text LLM for importance ranking and summary.

        Args:
            headlines: Dicts with "page" and "headline" keys; "tier",
                "section" and "snippet" are optional.
            max_retries: Maximum number of LLM calls when the service reports
                rate limiting.

        Returns:
            The JSON object parsed from the model's reply. A top-level JSON
            list is wrapped as {"important_articles": <list>}.

        Raises:
            RuntimeError: every attempt was rate limited (the last error is
                attached as the cause).
            ValueError: the reply is not a JSON object or list
                (json.JSONDecodeError is a ValueError).
            Exception: any other error from the LLM call is logged and
                re-raised unchanged.
        """
        # Build headline list with section and tier context
        lines = []
        for h in headlines:
            tier = h.get("tier", 3)
            section = h.get("section", "UNKNOWN")
            line = f"[Tier {tier}] Page {h['page']} — {section}: \"{h['headline']}\""
            snippet = h.get("snippet")
            if snippet:
                line += f" — {snippet}"
            lines.append(line)

        prompt = SUMMARY_PROMPT.format(
            headlines_list="\n".join(lines),
            count=TOP_ARTICLES_COUNT,
        )

        last_error = None
        for attempt in range(max_retries):
            # Only the network call sits in this try: rate-limit detection
            # must never see parse errors, whose text can contain "429"
            # (e.g. "... (char 429)").
            try:
                response = self.llm_client.chat.completions.create(
                    model=TEXT_MODEL,
                    messages=[
                        {
                            "role": "system",
                            "content": "You are a newspaper editor. Respond with valid JSON only, no markdown fences.",
                        },
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0.1,
                    max_tokens=4096,
                )
            except Exception as e:
                message = str(e)
                # Word-bounded patterns so that "generate"/"separate" are not
                # mistaken for "rate"
                rate_limited = getattr(e, "status_code", None) == 429 or bool(
                    re.search(
                        r"\b429\b|\brate[\s_-]*limit|\brate\b|too many requests",
                        message,
                        re.IGNORECASE,
                    )
                )
                if not rate_limited:
                    logger.error(f"LLM ranking failed: {e}")
                    raise

                last_error = e
                # Waiting is pointless when no further attempt will follow
                if attempt + 1 < max_retries:
                    wait = 60
                    # Use the delay hinted in the message ("... in 7.5s"), plus a margin
                    match = re.search(r"(\d+(?:\.\d+)?)\s*(?:seconds?|secs?|s)\b", message)
                    if match:
                        wait = float(match.group(1)) + 2
                    logger.warning(f"Rate limited, waiting {wait:.0f}s (attempt {attempt + 1})")
                    time.sleep(wait)
                continue

            try:
                # content can be None (e.g. an empty completion)
                raw = (response.choices[0].message.content or "").strip()
                if raw.startswith("```"):
                    # Drop the opening fence with its optional language tag,
                    # then everything from the closing fence on. Works for
                    # single-line fenced replies as well.
                    raw = re.sub(r"^```[\w-]*\s*", "", raw, count=1)
                    raw = raw.rsplit("```", 1)[0]
                result = json.loads(raw)
                if isinstance(result, list):
                    result = {"important_articles": result}
                elif not isinstance(result, dict):
                    raise ValueError("LLM response is not a JSON object")
            except Exception as e:
                logger.error(f"LLM ranking failed: {e}")
                raise

            logger.info(
                f"LLM ranked {len(result.get('important_articles', []))} important articles"
            )
            return result

        raise RuntimeError("Summary LLM failed after retries") from last_error