Spaces:

ChronoStellar
/

ResearchPaper-Agent

Sleeping

App Files Files Community

ResearchPaper-Agent / setup.py

ChronoStellar

Update setup.py

3e07c13 verified 4 months ago

raw

history blame contribute delete

5.92 kB

	import os
	import requests
	import json
	import csv
	import arxiv
	from bs4 import BeautifulSoup
	from huggingface_hub import HfApi
	from pypdf import PdfReader
	from smolagents import tool, CodeAgent, WebSearchTool, OpenAIModel

	# ------------------------
	# Secrets
	# ------------------------
	# Read the OpenAI credential once at import time; the module refuses to load
	# without it so the failure surfaces before any agent is built.
	OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
	if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
	    raise RuntimeError("Missing OPENAI_API_KEY environment variable")

	# ------------------------
	# Tools
	# ------------------------

	@tool
	def get_paper_recommendation() -> str:
	    """
	    This is a tool that returns the titles of the papers listed on Hugging Face daily papers.
	    It downloads https://huggingface.co/papers and reads the "dailyPapers" data embedded
	    in the page. It returns a list with the titles of up to the first ten papers, in the
	    order they appear in the page data. It returns an empty string when no paper data is
	    found, and None when the page could not be fetched.
	    """
	    # NOTE: the declared return type is left as `str` so the tool signature
	    # stays unchanged, although the success value is a list of titles.
	    url = "https://huggingface.co/papers"
	    try:
	        # A timeout keeps the agent from hanging forever on a stalled connection.
	        response = requests.get(url, timeout=30)
	        response.raise_for_status()
	    except requests.exceptions.RequestException as e:
	        print(f"Error occurred while fetching the HTML: {e}")
	        return None

	    soup = BeautifulSoup(response.content, "html.parser")

	    # The paper list is embedded as JSON in the "data-props" attribute of the
	    # hydration containers.
	    top_paper = ""
	    for container in soup.find_all("div", class_="SVELTE_HYDRATER contents"):
	        data_props = container.get("data-props", "")
	        if not data_props:
	            continue
	        try:
	            # BeautifulSoup already decodes HTML entities in attribute values,
	            # so the attribute text can be parsed as JSON directly.
	            json_data = json.loads(data_props)
	        except json.JSONDecodeError:
	            continue
	        # Other containers may hold unrelated payloads; skip anything that is
	        # not an object carrying a list of daily papers.
	        if not isinstance(json_data, dict):
	            continue
	        papers = json_data.get("dailyPapers")
	        if not isinstance(papers, list):
	            continue
	        # If several containers carry "dailyPapers", the last one wins.
	        top_paper = [
	            paper["title"]
	            for paper in papers[:10]
	            if isinstance(paper, dict) and "title" in paper
	        ]

	    return top_paper

	@tool
	def get_paper_id_by_title(title: str) -> str:
	    """
	    This is a tool that returns the arxiv paper id by its title.
	    It returns the id of the first paper found for the title, or None when nothing is found.

	    Args:
	        title: The paper title for which to get the id.
	    """
	    api = HfApi()
	    papers = api.list_papers(query=title)
	    # A lazy iterable is always truthy, so `if papers:` cannot detect an empty
	    # search and a bare next() would raise StopIteration. Asking for the first
	    # item with a default works for both lists and lazy iterables.
	    first_match = next(iter(papers), None)
	    if first_match is None:
	        return None
	    return first_match.id

	@tool
	def download_paper_by_id(paper_id: str) -> None:
	    """
	    This tool gets the id of a paper and downloads it from arxiv. It saves the paper locally
	    in the current directory as "paper.pdf". It raises a ValueError when no paper is found
	    for the given id.

	    Args:
	        paper_id: The id of the paper to download.
	    """
	    search = arxiv.Search(id_list=[paper_id])
	    # Use a default so an unknown id produces a readable error instead of a
	    # bare StopIteration leaking out of the tool.
	    paper = next(arxiv.Client().results(search), None)
	    if paper is None:
	        raise ValueError(f"No arxiv paper found for id {paper_id!r}")
	    paper.download_pdf(filename="paper.pdf")
	    return None

	@tool
	def read_pdf_file(file_path: str) -> str:
	    """
	    This function reads the first three pages of a PDF file and returns their text as a string.

	    Args:
	        file_path: The path to the PDF file. When it is empty, "paper.pdf" is used.

	    Returns:
	        A string containing the text of the first three pages of the PDF file.
	    """
	    # Honour the caller's path; the previous hard-coded 'paper.pdf' remains the
	    # fallback for an empty argument.
	    reader = PdfReader(file_path or "paper.pdf")
	    # Report the total page count on stdout.
	    print(len(reader.pages))
	    # Treat a page without extractable text as an empty string.
	    return "".join(page.extract_text() or "" for page in reader.pages[:3])

	@tool
	def save_flashcards_to_csv(flashcards_data: str, filename: str) -> str:
	    """
	    Saves a list of flashcards to a CSV file (importable into Anki).
	    The file starts with a "Front","Back" header row, followed by one row per flashcard.
	    Lines without a semicolon are skipped.

	    Args:
	        flashcards_data: A string containing flashcards separated by newlines,
	            where the Question and Answer are separated by a semicolon (;),
	            for example "What is X?;It is Y.\\nWhy is Z?;Because A."
	        filename: The name of the file to save (e.g., 'output.csv').

	    Returns:
	        A message saying how many flashcards were saved, or a message starting
	        with "Error saving file" when saving failed.
	    """
	    # Deliberately broad: this is a tool boundary, and the agent is expected to
	    # receive the failure as a message rather than an exception.
	    try:
	        rows = []
	        for line in flashcards_data.strip().split("\n"):
	            # Split only on the first semicolon so answers may contain ';'.
	            front, separator, back = line.partition(";")
	            if separator:
	                rows.append([front.strip(), back.strip()])

	        # newline='' lets the csv module control line endings itself.
	        with open(filename, "w", newline="", encoding="utf-8") as f:
	            writer = csv.writer(f)
	            writer.writerow(["Front", "Back"])  # Header
	            writer.writerows(rows)
	    except Exception as e:
	        return f"Error saving file: {e}"

	    return f"Successfully saved {len(rows)} flashcards to {filename}."


	# ------------------------
	# Agent
	# ------------------------
	# Extra instructions that get_paper_agent() appends to the agent's
	# "system_prompt" template. The text is sent verbatim, so any edit here
	# changes what the model is told.
	STRICT_PROMPT = """
	************************************************************
	CRITICAL INSTRUCTIONS - READ CAREFULLY
	************************************************************
	1. DOMAIN RESTRICTION (HARD):
	- You are a Research Paper Analysis Tool. You are NOT a general chatbot.
	- If the user asks about the weather, jokes, coding help unrelated to papers, or general chat, you MUST refuse.
	- Reply ONLY: "I can only analyze research papers. Please provide a topic or paper title."

	2. MANDATORY OUTPUT FORMAT:
	- Your goal is NEVER just to "answer". Your goal is ALWAYS to generate a Flashcard CSV.
	- Every successful execution MUST end with a call to `save_flashcards_to_csv`.

	3. YOUR WORKFLOW:
	- Step 1: Identify the paper (Search or Title).
	- Step 2: Download/Read the PDF.
	- Step 3: Extract 5-10 key concepts.
	- Step 4: Create flashcards.
	- Step 5: SAVE the CSV.
	************************************************************
	"""

	# Module-level cache holding the single agent instance once it has been built.
	_agent = None


	def get_paper_agent():
	    """
	    Return the shared flashcard agent, building it on the first call.

	    The first call prints "AGENT INITIALIZING", creates the OpenAI-backed model
	    and a CodeAgent with the paper tools, appends STRICT_PROMPT to the agent's
	    system prompt template, and caches the result. Later calls return the
	    cached agent.
	    """
	    global _agent

	    if not _agent:
	        print("AGENT INITIALIZING")

	        llm = OpenAIModel(
	            api_key=OPENAI_API_KEY,
	            model_id="gpt-5-nano",
	        )

	        toolbox = [
	            get_paper_recommendation,
	            get_paper_id_by_title,
	            download_paper_by_id,
	            read_pdf_file,
	            save_flashcards_to_csv,
	            WebSearchTool(),
	        ]

	        built = CodeAgent(
	            tools=toolbox,
	            model=llm,
	            name="flashcard_agent",
	            description="Creates flashcards from research papers",
	            max_steps=10,
	            add_base_tools=True,
	        )

	        # Only cache the agent after the prompt has been extended successfully.
	        built.prompt_templates["system_prompt"] += STRICT_PROMPT
	        _agent = built

	    return _agent