""" Paper Extractor Agent — Parses PDF and extracts structured paper data. Reference: system_design.md — Agent 3 (Lines 321-333) Reference: engineering_guardrails.md — §3.3 Agent-Level Fallback (Line 134) Step 1 in pipeline: Every other agent depends on this output. Failure Mode: If PDF unreadable → return partial with extraction_confidence: "low" """ from dotenv import load_dotenv load_dotenv() from crewai import Agent, Task, LLM from tools.pdf_parser import pdf_parser_tool # Use GPT-4o for extraction — needs deeper comprehension of full paper llm = LLM( model="gpt-4o", temperature=0.1, seed=42, ) paper_extractor = Agent( role="Research Paper Data Extractor", goal=( "Parse the paper and extract all key structural elements into a clean, " "structured format. Identify title, authors, abstract, methodology, " "key findings, contributions, limitations, references, and paper type." ), backstory=( "You are a meticulous research librarian who has cataloged thousands of " "papers. You never miss a section, always identify the methodology " "correctly, and structure information for easy downstream analysis. " "When information is unclear or missing, you note it honestly rather " "than fabricating details." ), tools=[pdf_parser_tool], llm=llm, verbose=True, allow_delegation=False, max_iter=3, ) def create_extraction_task(sanitized_text: str) -> Task: """Create the paper extraction task. Reference: system_design.md — Paper Extractor output schema (Lines 80-91) """ from schemas.models import PaperExtraction return Task( description=( f"Extract structured metadata from the following research paper text.\n\n" f"You MUST extract ALL of the following fields:\n" f"1. title: The exact title of the paper\n" f"2. authors: List of all author names\n" f"3. abstract: The paper's abstract (full text)\n" f"4. methodology: Description of the research methodology used\n" f"5. key_findings: List of the paper's main findings/results\n" f"6. contributions: List of the paper's claimed contributions\n" f"7. limitations_stated: List of limitations the authors acknowledge\n" f"8. references_count: Total number of references/citations\n" f"9. paper_type: One of 'empirical', 'theoretical', 'survey', 'system', 'mixed'\n" f"10. extraction_confidence: 'high' if all fields clearly found, 'medium' if some " f"are inferred, 'low' if paper is poorly structured\n\n" f"If a field cannot be found, provide your best inference and set " f"extraction_confidence to 'medium' or 'low'.\n\n" f"PAPER TEXT:\n{sanitized_text[:50000]}" ), agent=paper_extractor, expected_output="A PaperExtraction with all structured fields from the paper.", output_pydantic=PaperExtraction, )