"""
Paper Extractor Agent — Parses PDF and extracts structured paper data.

Reference: system_design.md — Agent 3 (Lines 321-333)
Reference: engineering_guardrails.md — §3.3 Agent-Level Fallback (Line 134)

Step 1 in pipeline: Every other agent depends on this output.
Failure Mode: If PDF unreadable → return partial with extraction_confidence: "low"
"""

from dotenv import load_dotenv
load_dotenv()

from crewai import Agent, Task, LLM
from tools.pdf_parser import pdf_parser_tool

# Extraction has to comprehend the full paper, so this agent is backed by
# GPT-4o. A low temperature and a fixed seed are passed along with the model
# name (presumably to keep repeated extractions stable; confirm with owner).
_EXTRACTOR_LLM_OPTIONS = {
    "model": "gpt-4o",
    "temperature": 0.1,
    "seed": 42,
}

llm = LLM(**_EXTRACTOR_LLM_OPTIONS)

# Prompt fragments for the extractor agent, kept as named constants so the
# Agent(...) call below stays short. The text is unchanged.
_EXTRACTOR_GOAL = (
    "Parse the paper and extract all key structural elements "
    "into a clean, structured format. "
    "Identify title, authors, abstract, methodology, key findings, "
    "contributions, limitations, references, and paper type."
)

_EXTRACTOR_BACKSTORY = (
    "You are a meticulous research librarian who has cataloged "
    "thousands of papers. "
    "You never miss a section, always identify the methodology correctly, "
    "and structure information for easy downstream analysis. "
    "When information is unclear or missing, you note it honestly "
    "rather than fabricating details."
)

# The extractor agent: equipped with the PDF parser tool, bound to the
# module-level ``llm``, not allowed to delegate, and capped at 3 iterations.
paper_extractor = Agent(
    role="Research Paper Data Extractor",
    goal=_EXTRACTOR_GOAL,
    backstory=_EXTRACTOR_BACKSTORY,
    llm=llm,
    tools=[pdf_parser_tool],
    max_iter=3,
    allow_delegation=False,
    verbose=True,
)


def create_extraction_task(
    sanitized_text: str,
    *,
    max_chars: int = 50_000,
) -> Task:
    """Create the paper extraction task.

    Reference: system_design.md — Paper Extractor output schema (Lines 80-91)

    The paper text is embedded directly in the task description. Only the
    first ``max_chars`` characters are included; when that cuts the text
    short, a notice is added to the prompt so the model knows later sections
    (typically the reference list) may be missing instead of silently
    reporting values derived from incomplete text.

    Args:
        sanitized_text: Paper text to embed in the prompt.
        max_chars: Maximum number of characters of ``sanitized_text`` to
            embed. Defaults to 50,000, the limit that was previously
            hard-coded.

    Returns:
        A ``Task`` assigned to ``paper_extractor`` whose output is parsed
        into ``PaperExtraction``.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    # Kept as a function-local import, as in the original implementation.
    from schemas.models import PaperExtraction

    # A negative limit would be interpreted as "drop the last N characters"
    # by slicing, which is never what a character budget means.
    if max_chars < 0:
        raise ValueError(f"max_chars must be non-negative, got {max_chars}")

    total_chars = len(sanitized_text)
    paper_text = sanitized_text[:max_chars]

    # Only mention truncation when it actually happened, so prompts for
    # papers within the limit are identical to what was sent before.
    truncation_note = ""
    if total_chars > max_chars:
        truncation_note = (
            f"NOTE: The paper text below was truncated to its first "
            f"{max_chars} characters (original length: {total_chars}). "
            "Content near the end of the paper, such as the reference list, "
            "may be missing; count only what is visible and lower "
            "extraction_confidence accordingly.\n\n"
        )

    description = (
        "Extract structured metadata from the following research paper text.\n\n"
        "You MUST extract ALL of the following fields:\n"
        "1. title: The exact title of the paper\n"
        "2. authors: List of all author names\n"
        "3. abstract: The paper's abstract (full text)\n"
        "4. methodology: Description of the research methodology used\n"
        "5. key_findings: List of the paper's main findings/results\n"
        "6. contributions: List of the paper's claimed contributions\n"
        "7. limitations_stated: List of limitations the authors acknowledge\n"
        "8. references_count: Total number of references/citations\n"
        "9. paper_type: One of 'empirical', 'theoretical', 'survey', 'system', 'mixed'\n"
        "10. extraction_confidence: 'high' if all fields clearly found, 'medium' if some "
        "are inferred, 'low' if paper is poorly structured\n\n"
        "If a field cannot be found, provide your best inference and set "
        "extraction_confidence to 'medium' or 'low'.\n\n"
        f"{truncation_note}"
        f"PAPER TEXT:\n{paper_text}"
    )

    return Task(
        description=description,
        agent=paper_extractor,
        expected_output="A PaperExtraction with all structured fields from the paper.",
        output_pydantic=PaperExtraction,
    )