Spaces:

AISA-Framework
/

AI-Research-Paper-Analyst

Sleeping

AI-Research-Paper-Analyst / agents /paper_extractor.py

Saleh

Clean deployment to HuggingFace Space

2447eba 20 days ago

3.07 kB

	"""
	Paper Extractor Agent — Parses PDF and extracts structured paper data.

	Reference: system_design.md — Agent 3 (Lines 321-333)
	Reference: engineering_guardrails.md — §3.3 Agent-Level Fallback (Line 134)

	Step 1 in pipeline: Every other agent depends on this output.
	Failure Mode: If PDF unreadable → return partial with extraction_confidence: "low"
	"""

	from dotenv import load_dotenv
	load_dotenv()

	from crewai import Agent, Task, LLM
	from tools.pdf_parser import pdf_parser_tool

	# Extraction is routed to GPT-4o because it needs deeper comprehension of the
	# full paper. A low temperature and a fixed seed are requested — presumably to
	# keep extractions repeatable between runs (confirm against provider support).
	llm = LLM(model="gpt-4o", temperature=0.1, seed=42)

	# The extraction agent. The role/goal/backstory strings are prompt text passed
	# to CrewAI, so their wording is kept exactly; only the line breaks differ.
	paper_extractor = Agent(
	    # Model and tooling.
	    llm=llm,
	    tools=[pdf_parser_tool],
	    # Persona.
	    role="Research Paper Data Extractor",
	    goal=(
	        "Parse the paper and extract all key structural elements "
	        "into a clean, structured format. "
	        "Identify title, authors, abstract, methodology, key findings, "
	        "contributions, limitations, references, and paper type."
	    ),
	    backstory=(
	        "You are a meticulous research librarian "
	        "who has cataloged thousands of papers. "
	        "You never miss a section, always identify the methodology correctly, "
	        "and structure information for easy downstream analysis. "
	        "When information is unclear or missing, "
	        "you note it honestly rather than fabricating details."
	    ),
	    # Execution limits: iteration cap of 3 and delegation switched off
	    # (see the CrewAI Agent attribute docs for the exact semantics).
	    max_iter=3,
	    allow_delegation=False,
	    verbose=True,
	)


	def create_extraction_task(sanitized_text: str, *, max_chars: int = 50000) -> Task:
	    """Create the paper extraction task.

	    Builds a ``Task`` assigned to ``paper_extractor`` whose description lists
	    the ten fields to extract, followed by the leading ``max_chars``
	    characters of the paper text, and which requests its output as a
	    ``PaperExtraction`` model.

	    Reference: system_design.md — Paper Extractor output schema (Lines 80-91)

	    Args:
	        sanitized_text: Paper text to embed in the prompt (presumably already
	            sanitized by the caller, as the name suggests — not checked here).
	        max_chars: Upper bound on how many leading characters of
	            ``sanitized_text`` are embedded. Defaults to 50000, the limit
	            that was previously hard-coded.

	    Returns:
	        The configured ``Task``.

	    Raises:
	        ValueError: If ``max_chars`` is negative.
	    """
	    # Imported here rather than at module level, as in the original code.
	    from schemas.models import PaperExtraction

	    # A negative bound would make the slice drop characters from the end
	    # instead of capping the length, which is never what a caller wants.
	    if max_chars < 0:
	        raise ValueError(f"max_chars must be non-negative, got {max_chars}")

	    # Static part of the prompt: plain literals, no interpolation needed.
	    instructions = (
	        "Extract structured metadata from the following research paper text.\n\n"
	        "You MUST extract ALL of the following fields:\n"
	        "1. title: The exact title of the paper\n"
	        "2. authors: List of all author names\n"
	        "3. abstract: The paper's abstract (full text)\n"
	        "4. methodology: Description of the research methodology used\n"
	        "5. key_findings: List of the paper's main findings/results\n"
	        "6. contributions: List of the paper's claimed contributions\n"
	        "7. limitations_stated: List of limitations the authors acknowledge\n"
	        "8. references_count: Total number of references/citations\n"
	        "9. paper_type: One of 'empirical', 'theoretical', 'survey', 'system', 'mixed'\n"
	        "10. extraction_confidence: 'high' if all fields clearly found, 'medium' if some "
	        "are inferred, 'low' if paper is poorly structured\n\n"
	        "If a field cannot be found, provide your best inference and set "
	        "extraction_confidence to 'medium' or 'low'.\n\n"
	    )

	    # Text beyond max_chars is silently dropped from the prompt.
	    paper_text = sanitized_text[:max_chars]

	    return Task(
	        description=f"{instructions}PAPER TEXT:\n{paper_text}",
	        agent=paper_extractor,
	        expected_output="A PaperExtraction with all structured fields from the paper.",
	        output_pydantic=PaperExtraction,
	    )