"""Scientific paper parser: extract full text + structured metadata from PDFs.

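Produces, per PDF:
- title, authors, year: taken from prefetched arXiv metadata when supplied;
  otherwise the title falls back to the file stem and authors/year stay empty
- abstract: prefetched when available, otherwise extracted from the PDF text
- section titles + bodies
- references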
"""

import re
import json
from pathlib import Path
from typing import Dict, List, Tuple, Optional


# ---------------------------------------------------------------------------
# Section patterns (covers most LaTeX-compiled arXiv PDFs)
# ---------------------------------------------------------------------------

# Header-detection heuristics; extract_sections runs every pattern over the
# whole text. All patterns are MULTILINE so ``^``/``$`` anchor to single lines;
# note that the surrounding ``\s*`` can also absorb adjacent blank lines.
SECTION_PATTERNS = [
    # Numbered: "1 Introduction", "2. Related Work", "1.1 Background".
    # Group 1 is the dotted section number, group 2 the title. The optional
    # ``\.?`` accepts the trailing dot of styles such as "2. Related Work";
    # the number group cannot consume it because every dot inside that group
    # must be followed by a digit.
    re.compile(
        r"^\s*(\d+(?:\.\d+)*)\.?\s{1,4}([A-Z][A-Za-z &,\-:]{2,60})\s*$",
        re.MULTILINE,
    ),
    # Unnumbered all-caps: "INTRODUCTION", "RELATED WORK".
    # Group 1 is a line of 4-41 uppercase ASCII letters and spaces.
    re.compile(r"^\s*([A-Z][A-Z ]{3,40})\s*$", re.MULTILINE),
    # Named: "Abstract", "Introduction", "Conclusion", "References".
    # Case-insensitive match of a fixed list of common headings standing
    # alone on a line.
    re.compile(
        r"^\s*(Abstract|Introduction|Related Work|Background|Methodology|Methods|"
        r"Experiments?|Results?|Discussion|Conclusion|Limitations?|"
        r"Future Work|Acknowledgements?|References?)\s*$",
        re.MULTILINE | re.IGNORECASE,
    ),
]

# Locates the abstract body. After the keyword "abstract" (any case, because of
# IGNORECASE) it skips optional dots, whitespace and dashes up to a newline,
# then lazily captures (group 1) the text until the first of:
#   - a blank line, or
#   - a line beginning with "1.", "Introduction" or "Keywords".
# The keyword is not anchored to a line start, so it can also match inside
# running text. An occurrence with none of these terminators after it does
# not produce a match.
ABSTRACT_RE = re.compile(
    r"(?:Abstract|ABSTRACT)[.\s—–-]*\n(.*?)(?=\n\s*\n|\n\s*(?:1\.|Introduction|Keywords))",
    re.DOTALL | re.IGNORECASE,
)

# Captures (group 1) everything that follows the first line consisting only of
# "Reference", "References" or "Bibliography" (case-insensitive). The heading
# must be preceded by a newline, so a heading on the very first line of the
# text is not matched. MULTILINE is not set, so ``$`` anchors to the end of
# the whole text and the lazy group therefore runs to the end.
REFERENCE_RE = re.compile(
    r"\n\s*(?:References?|Bibliography)\s*\n(.*?)$",
    re.DOTALL | re.IGNORECASE,
)

# Heuristic for a line of author names: starting at the beginning of the text
# or right after a newline, group 1 captures a first name made of 2-4
# capitalised words ("Xxxx Xxxx"); further names may follow, separated by
# whitespace with an optional leading comma. Only ASCII letters are accepted,
# so initials ("J. Smith"), hyphens and accented characters do not match.
# NOTE(review): nothing in the visible part of this file uses this pattern.
AUTHORS_RE = re.compile(
    r"(?:^|\n)((?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))(?:,?\s+(?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))*",
)


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by a page-break marker.

    Args:
        pdf_path: location of the PDF to read.

    Returns:
        The per-page text in page order, separated by
        ``"\\n\\n[PAGE_BREAK]\\n\\n"``. A page for which the reader yields no
        text contributes an empty string, so the marker count still reflects
        the page count.
    """
    # Imported lazily so the module can be imported without pypdf installed.
    from pypdf import PdfReader

    page_texts = [(page.extract_text() or "") for page in PdfReader(pdf_path).pages]
    return "\n\n[PAGE_BREAK]\n\n".join(page_texts)


def extract_abstract(text: str) -> str:
    """Pull the abstract out of raw paper text.

    Args:
        text: full text of the paper.

    Returns:
        When ``ABSTRACT_RE`` matches: the captured body with line-break
        hyphenation removed and whitespace collapsed, truncated to 2000
        characters. Otherwise the stripped, uncleaned slice of up to 792
        characters that follows the first occurrence of "abstract" (any
        case). An empty string when the word does not occur at all.
    """
    match = ABSTRACT_RE.search(text)
    if match is None:
        # Fallback: take the raw text right after the keyword (8 == len("abstract")).
        keyword_at = text.lower().find("abstract")
        if keyword_at < 0:
            return ""
        return text[keyword_at + 8:keyword_at + 800].strip()

    body = match.group(1).strip()
    # Re-join words split by a hyphen at a line end, then flatten whitespace.
    dehyphenated = re.sub(r"-\n", "", body)
    flattened = re.sub(r"\s+", " ", dehyphenated)
    return flattened[:2000]


def extract_sections(text: str) -> List[Dict]:
    """
    Cut the paper text into sections at every detected header.

    Returns a list of ``{"title": str, "body": str, "type": str}`` dicts in
    document order. ``title`` is the stripped header match, ``body`` the
    stripped text between that header and the next one (or the end of the
    text), and ``type`` the label returned by ``_classify_section``: abstract,
    introduction, methodology, results, conclusion, references, related_work
    or other.

    When no header is found, a single section titled "full_text" holding the
    whole text is returned.
    """
    # Collect (start, end, title) for every hit of every header pattern.
    candidates = [
        (hit.start(), hit.end(), hit.group(0).strip())
        for pattern in SECTION_PATTERNS
        for hit in pattern.finditer(text)
    ]
    # Stable sort: hits starting at the same offset keep pattern order.
    candidates.sort(key=lambda span: span[0])

    # Keep a hit only if it begins at or after the end of the last kept one.
    headers = []
    cursor = -1
    for span in candidates:
        if span[0] < cursor:
            continue
        headers.append(span)
        cursor = span[1]

    if not headers:
        # No sections found: return whole text as one section
        return [{"title": "full_text", "body": text, "type": "other"}]

    # Each body stops where the following header starts; the last one runs
    # to the end of the text.
    body_stops = [header[0] for header in headers[1:]] + [len(text)]
    return [
        {
            "title": title,
            "body": text[body_start:body_stop].strip(),
            "type": _classify_section(title),
        }
        for (_, body_start, title), body_stop in zip(headers, body_stops)
    ]


def _classify_section(title: str) -> str:
    t = title.lower()
    if any(k in t for k in ["abstract"]):
        return "abstract"
    if any(k in t for k in ["introduction", "background", "overview", "motivation"]):
        return "introduction"
    if any(k in t for k in ["method", "approach", "model", "architecture", "framework", "system"]):
        return "methodology"
    if any(k in t for k in ["experiment", "result", "evaluat", "benchmark", "performance", "ablat"]):
        return "results"
    if any(k in t for k in ["conclusion", "future", "discussion", "limitation", "summary"]):
        return "conclusion"
    if any(k in t for k in ["reference", "bibliograph"]):
        return "references"
    if any(k in t for k in ["related work", "prior work", "literature"]):
        return "related_work"
    return "other"


def extract_references(text: str) -> List[str]:
    """Extract the reference list as individual reference strings.

    Args:
        text: full text of the paper.

    Returns:
        Up to 100 stripped entries taken from the block captured by
        ``REFERENCE_RE``, split at ``[n]`` or ``n.`` markers that begin a
        line. Fragments of 20 characters or fewer are dropped. An empty list
        when no references heading is found.
    """
    m = REFERENCE_RE.search(text)
    if not m:
        return []
    ref_block = m.group(1)
    # Split on numbered references [1], [2] or numbered lines. The captured
    # block begins right after the heading's newline, so the first marker has
    # no "\n" in front of it; ``^`` (start of the block only, as MULTILINE is
    # not set) lets that first marker be stripped like all the others.
    refs = re.split(r"(?:^|\n)\s*(?:\[\d+\]|\d+\.)", ref_block)
    stripped = (r.strip() for r in refs)
    return [r for r in stripped if len(r) > 20][:100]


def extract_metadata(
    pdf_path: str,
    prefetched_meta: Optional[Dict] = None,
) -> Dict:
    """
    Run the complete extraction pipeline on a single PDF.

    Args:
        pdf_path: path to PDF file
        prefetched_meta: optional dict that may carry title/authors/year/abstract
            (e.g. from the arXiv API); missing keys fall back to defaults

    Returns a dict with:
        pdf_path, arxiv_id (the file stem), title, authors, year, abstract,
        sections (list of {title, body, type}),
        references (list of strings),
        full_text,
        num_sections, num_references, text_length

    A PDF that cannot be read is reported with a warning on stdout and is
    processed as if its text were empty.
    """
    pdf = Path(pdf_path)
    try:
        raw_text = extract_text_from_pdf(str(pdf))
    except Exception as exc:
        # Best effort: one unreadable file must not abort a batch run.
        print(f"  [warn] could not parse {pdf.name}: {exc}")
        raw_text = ""

    # A missing or empty metadata dict behaves like one without any keys.
    meta = prefetched_meta or {}
    title = meta.get("title", pdf.stem)
    authors = meta.get("authors", [])
    year = meta.get("year", "")
    # Prefer the prefetched abstract; otherwise recover it from the text.
    abstract = meta.get("abstract", "") or extract_abstract(raw_text)

    sections = extract_sections(raw_text)
    references = extract_references(raw_text)

    result = {
        "pdf_path": str(pdf),
        "arxiv_id": pdf.stem,
        "title": title,
        "authors": authors,
        "year": year,
        "abstract": abstract,
        "sections": sections,
        "references": references,
        "full_text": raw_text,
        "num_sections": len(sections),
        "num_references": len(references),
        "text_length": len(raw_text),
    }
    return result


def parse_pdf(pdf_path: str, prefetched_meta: Optional[Dict] = None) -> Dict:
    """Pipeline-facing alias: forwards both arguments to extract_metadata."""
    return extract_metadata(pdf_path, prefetched_meta=prefetched_meta)


# ---------------------------------------------------------------------------
# Batch parsing with metadata JSONL
# ---------------------------------------------------------------------------

def parse_corpus(
    papers_dir: str = "data/papers",
    metadata_file: str = "data/metadata.jsonl",
    output_file: str = "data/parsed_corpus.jsonl",
) -> List[Dict]:
    """Parse all PDFs in papers_dir and write one JSON line per paper.

    Args:
        papers_dir: directory searched (non-recursively) for ``*.pdf`` files.
        metadata_file: optional JSONL file of prefetched records keyed by
            their ``arxiv_id`` field; skipped when the file does not exist.
        output_file: JSONL file that is overwritten with the parsed documents.

    Returns:
        The parsed metadata dicts, in sorted file-name order.

    Raises:
        json.JSONDecodeError: if a non-blank line of metadata_file is not
            valid JSON.
    """
    papers_path = Path(papers_dir)
    output_path = Path(output_file)
    metadata_path = Path(metadata_file)

    # Load arXiv metadata if available
    meta_lookup = {}
    if metadata_path.exists():
        # Explicit UTF-8: the platform default may not decode non-ASCII text.
        with metadata_path.open(encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                rec = json.loads(line)
                meta_lookup[rec.get("arxiv_id", "")] = rec

    pdfs = sorted(papers_path.glob("*.pdf"))
    total = len(pdfs)
    print(f"Parsing {total} PDFs...")

    parsed = []
    # Records are dumped with ensure_ascii=False, so the file must be opened
    # with an encoding able to represent any character: UTF-8.
    with output_path.open("w", encoding="utf-8") as out_f:
        for position, pdf in enumerate(pdfs, start=1):
            # The file stem doubles as the lookup key into the metadata.
            pre = meta_lookup.get(pdf.stem)
            print(f"  [{position}/{total}] {pdf.name[:50]}")
            doc = extract_metadata(str(pdf), prefetched_meta=pre)
            out_f.write(json.dumps(doc, ensure_ascii=False) + "\n")
            # Flush per document so finished work survives a later crash.
            out_f.flush()
            parsed.append(doc)

    print(f"Parsed corpus saved to {output_path}")
    return parsed


# Script entry point: parse the corpus using parse_corpus's default paths.
if __name__ == "__main__":
    parse_corpus()