"""
Paper Extractor Agent — Parses PDF and extracts structured paper data.

Reference: system_design.md — Agent 3 (Lines 321-333)
Reference: engineering_guardrails.md — §3.3 Agent-Level Fallback (Line 134)

Step 1 in pipeline: Every other agent depends on this output.
Failure Mode: If PDF unreadable → return partial with extraction_confidence: "low"
"""
| |
|
| | from dotenv import load_dotenv |
| | load_dotenv() |
| |
|
| | from crewai import Agent, Task, LLM |
| | from tools.pdf_parser import pdf_parser_tool |
| |
|
| | |
# Model handle used by the extractor agent in this module. A low temperature
# and a fixed seed are passed; presumably to keep extraction output as
# repeatable as the provider allows -- TODO confirm against the CrewAI LLM docs.
llm = LLM(model="gpt-4o", temperature=0.1, seed=42)
| |
|
# Prompt fragments for the extractor agent, kept as named constants so the
# Agent(...) call below reads as a flat list of settings.
_EXTRACTOR_GOAL = (
    "Parse the paper and extract all key structural elements into a clean, "
    "structured format. "
    "Identify title, authors, abstract, methodology, key findings, "
    "contributions, limitations, references, and paper type."
)
_EXTRACTOR_BACKSTORY = (
    "You are a meticulous research librarian who has cataloged thousands "
    "of papers. "
    "You never miss a section, always identify the methodology correctly, "
    "and structure information for easy downstream analysis. "
    "When information is unclear or missing, you note it honestly rather "
    "than fabricating details."
)

# Agent that turns a paper into structured fields. The module docstring
# describes it as step 1 of the pipeline, with every other agent depending on
# its output. It is given the PDF parser tool and the module-level ``llm``,
# with delegation disabled and ``max_iter`` set to 3.
paper_extractor = Agent(
    role="Research Paper Data Extractor",
    goal=_EXTRACTOR_GOAL,
    backstory=_EXTRACTOR_BACKSTORY,
    llm=llm,
    tools=[pdf_parser_tool],
    max_iter=3,
    allow_delegation=False,
    verbose=True,
)
| |
|
| |
|
def create_extraction_task(sanitized_text: str, max_chars: int = 50000) -> Task:
    """Create the paper extraction task.

    Reference: system_design.md — Paper Extractor output schema (Lines 80-91)

    The prompt lists the ten fields the agent must fill in and then embeds at
    most ``max_chars`` characters of the paper. When the paper is longer than
    that, a short notice is inserted ahead of the paper text so the model
    knows the tail (for example the reference list) may be missing. For text
    that fits within the limit the prompt is unchanged.

    Args:
        sanitized_text: Paper text to embed in the prompt (presumably already
            sanitized by the caller, as the name suggests).
        max_chars: Maximum number of characters of ``sanitized_text`` to embed.
            Defaults to 50000, the limit that used to be hard-coded.

    Returns:
        A ``Task`` assigned to ``paper_extractor`` with ``PaperExtraction`` as
        its ``output_pydantic`` model.

    Raises:
        ValueError: If ``max_chars`` is negative. A negative slice bound would
            silently drop the *end* of the text instead of limiting its length.
    """
    # Kept as a function-local import, as in the original; presumably this
    # avoids an import cycle with schemas.models -- TODO confirm.
    from schemas.models import PaperExtraction

    if max_chars < 0:
        raise ValueError(f"max_chars must be non-negative, got {max_chars}")

    # Plain literals: none of these fragments interpolate anything.
    instructions = (
        "Extract structured metadata from the following research paper text.\n\n"
        "You MUST extract ALL of the following fields:\n"
        "1. title: The exact title of the paper\n"
        "2. authors: List of all author names\n"
        "3. abstract: The paper's abstract (full text)\n"
        "4. methodology: Description of the research methodology used\n"
        "5. key_findings: List of the paper's main findings/results\n"
        "6. contributions: List of the paper's claimed contributions\n"
        "7. limitations_stated: List of limitations the authors acknowledge\n"
        "8. references_count: Total number of references/citations\n"
        "9. paper_type: One of 'empirical', 'theoretical', 'survey', 'system', "
        "'mixed'\n"
        "10. extraction_confidence: 'high' if all fields clearly found, 'medium' if some "
        "are inferred, 'low' if paper is poorly structured\n\n"
        "If a field cannot be found, provide your best inference and set "
        "extraction_confidence to 'medium' or 'low'.\n\n"
    )

    # Tell the model when text was dropped, so fields that depend on the end
    # of the paper are not reported with unwarranted confidence.
    truncation_note = ""
    if len(sanitized_text) > max_chars:
        truncation_note = (
            f"NOTE: The paper text below was truncated to its first {max_chars} "
            "characters, so later sections (for example the reference list) may "
            "be missing. Lower extraction_confidence for any field that depends "
            "on the missing text.\n\n"
        )

    paper_text = sanitized_text[:max_chars]
    description = f"{instructions}{truncation_note}PAPER TEXT:\n{paper_text}"

    return Task(
        description=description,
        agent=paper_extractor,
        expected_output="A PaperExtraction with all structured fields from the paper.",
        output_pydantic=PaperExtraction,
    )
| |
|