AI-Research-Paper-Analyst / agents /paper_extractor.py
Saleh
Clean deployment to HuggingFace Space
2447eba
"""
Paper Extractor Agent β€” Parses PDF and extracts structured paper data.
Reference: system_design.md β€” Agent 3 (Lines 321-333)
Reference: engineering_guardrails.md β€” Β§3.3 Agent-Level Fallback (Line 134)
Step 1 in pipeline: Every other agent depends on this output.
Failure Mode: If PDF unreadable β†’ return partial with extraction_confidence: "low"
"""
from dotenv import load_dotenv
load_dotenv()
from crewai import Agent, Task, LLM
from tools.pdf_parser import pdf_parser_tool
# Extraction runs on GPT-4o — it needs deeper comprehension of the full paper.
# The model is configured with a low temperature and a fixed seed.
llm = LLM(model="gpt-4o", temperature=0.1, seed=42)
# Agent for the extraction step. It is wired to the module-level `llm` and
# given the PDF parser as its only tool; delegation is switched off and the
# iteration cap is set to 3.
paper_extractor = Agent(
    role="Research Paper Data Extractor",
    # What the agent is asked to achieve.
    goal=(
        "Parse the paper and extract all key structural elements "
        "into a clean, structured format. "
        "Identify title, authors, abstract, methodology, key findings, "
        "contributions, limitations, references, and paper type."
    ),
    # Persona text; it explicitly asks the agent to flag unclear or missing
    # information rather than invent it.
    backstory=(
        "You are a meticulous research librarian who has cataloged "
        "thousands of papers. "
        "You never miss a section, always identify the methodology correctly, "
        "and structure information for easy downstream analysis. "
        "When information is unclear or missing, you note it honestly "
        "rather than fabricating details."
    ),
    llm=llm,
    tools=[pdf_parser_tool],
    max_iter=3,
    allow_delegation=False,
    verbose=True,
)
def create_extraction_task(sanitized_text: str, *, max_chars: int = 50000) -> Task:
    """Create the paper extraction task.

    Builds a ``Task`` assigned to ``paper_extractor``. The task description
    lists the ten fields to extract, followed by the leading ``max_chars``
    characters of the paper text. The task output is declared as a
    ``PaperExtraction`` model.

    Reference: system_design.md — Paper Extractor output schema (Lines 80-91)

    Args:
        sanitized_text: Paper text to embed in the prompt.
        max_chars: Maximum number of leading characters of ``sanitized_text``
            to include in the prompt; the remainder is dropped. Defaults to
            50000, the limit that was previously hard-coded.

    Returns:
        The configured ``Task``.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    # A negative bound would silently trim characters from the end of the
    # text instead of capping its length, so reject it outright.
    if max_chars < 0:
        raise ValueError(f"max_chars must be non-negative, got {max_chars}")

    # Imported at call time rather than at module import time.
    # NOTE(review): presumably to avoid an import cycle — confirm.
    from schemas.models import PaperExtraction

    # NOTE(review): truncation keeps only the head of the paper. If the
    # reference list sits at the end of a long paper, references_count may be
    # undercounted — confirm whether that is acceptable downstream.
    paper_text = sanitized_text[:max_chars]

    # Only the final literal interpolates a value; the rest are plain strings.
    description = (
        "Extract structured metadata from the following research paper text.\n\n"
        "You MUST extract ALL of the following fields:\n"
        "1. title: The exact title of the paper\n"
        "2. authors: List of all author names\n"
        "3. abstract: The paper's abstract (full text)\n"
        "4. methodology: Description of the research methodology used\n"
        "5. key_findings: List of the paper's main findings/results\n"
        "6. contributions: List of the paper's claimed contributions\n"
        "7. limitations_stated: List of limitations the authors acknowledge\n"
        "8. references_count: Total number of references/citations\n"
        "9. paper_type: One of 'empirical', 'theoretical', 'survey', 'system', 'mixed'\n"
        "10. extraction_confidence: 'high' if all fields clearly found, 'medium' if some "
        "are inferred, 'low' if paper is poorly structured\n\n"
        "If a field cannot be found, provide your best inference and set "
        "extraction_confidence to 'medium' or 'low'.\n\n"
        f"PAPER TEXT:\n{paper_text}"
    )

    return Task(
        description=description,
        agent=paper_extractor,
        expected_output="A PaperExtraction with all structured fields from the paper.",
        output_pydantic=PaperExtraction,
    )