Spaces:

ChronoStellar
/

ResearchPaper-Agent

Sleeping

File size: 5,922 Bytes

import os
import requests
import json
import csv
import arxiv
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
from pypdf import PdfReader
from smolagents import tool, CodeAgent, WebSearchTool, OpenAIModel

# ------------------------
# Secrets
# ------------------------
# Fail fast at import time: the agent cannot be built without an API key,
# so a missing or empty variable is reported here rather than on first use.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("Missing OPENAI_API_KEY environment variable")

# ------------------------
# Tools
# ------------------------

@tool
def get_paper_recommendation() -> str:
    """
    This is a tool that returns the titles of the papers listed on the Hugging Face
    daily papers page (at most the first 10 entries).
    It returns the paper titles, an empty string if no paper data was found on the
    page, or None if the page could not be fetched.
    """
    # NOTE(review): the annotation says `str`, but on success the value is a list
    # of title strings (unchanged from the previous behaviour) - confirm which
    # shape the agent is meant to receive before tightening the annotation.
    url = "https://huggingface.co/papers"
    try:
        # Without a timeout a stalled connection would block the agent forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        html = response.content
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching the HTML: {e}")
        return None

    soup = BeautifulSoup(html, "html.parser")

    # The paper list is embedded as JSON in the "data-props" attribute of the
    # hydration containers; scan them for one carrying a "dailyPapers" key.
    top_paper = ""
    for container in soup.find_all('div', class_='SVELTE_HYDRATER contents'):
        data_props = container.get('data-props', '')
        if not data_props:
            continue

        try:
            json_data = json.loads(data_props.replace('&quot;', '"'))
        except json.JSONDecodeError:
            # Not every container holds valid JSON; skip those that do not.
            continue

        if not isinstance(json_data, dict):
            continue
        daily_papers = json_data.get('dailyPapers')
        if not isinstance(daily_papers, list):
            continue

        # Skip malformed entries instead of failing on a missing "title" key.
        top_paper = [
            paper["title"]
            for paper in daily_papers[:10]
            if isinstance(paper, dict) and "title" in paper
        ]

    return top_paper

@tool
def get_paper_id_by_title(title: str) -> str:
    """
    This is a tool that returns the arxiv paper id by its title.
    It returns the id of the first paper matching the title, or None if no paper was found.

    Args:
        title: The paper title for which to get the id.
    """
    api = HfApi()
    papers = api.list_papers(query=title)
    if not papers:
        return None

    # The result may be a lazy iterable, which is truthy even when it yields
    # nothing; giving next() a default avoids a StopIteration on an empty search.
    paper = next(iter(papers), None)
    if paper is None:
        return None
    return paper.id

@tool
def download_paper_by_id(paper_id: str) -> None:
    """
    This tool gets the id of a paper and downloads it from arxiv. It saves the paper locally
    in the current directory as "paper.pdf". It raises a ValueError if no paper is found
    for the given id.

    Args:
        paper_id: The id of the paper to download.
    """
    client = arxiv.Client()
    search = arxiv.Search(id_list=[paper_id])

    # An empty result set would otherwise surface as a message-less
    # StopIteration; raise an error that names the id instead.
    paper = next(client.results(search), None)
    if paper is None:
        raise ValueError(f"No arxiv paper found for id {paper_id!r}")

    paper.download_pdf(filename="paper.pdf")
    return None

@tool
def read_pdf_file(file_path: str) -> str:
    """
    This function reads the first three pages of a PDF file and returns its content as a string.
    Args:
        file_path: The path to the PDF file. If empty, "paper.pdf" in the current directory is read.
    Returns:
        A string containing the text of the first three pages of the PDF file.
    """
    # Honour the requested path; the previous code always opened 'paper.pdf'
    # regardless of the argument. Keep that name as the fallback for an empty path.
    reader = PdfReader(file_path or 'paper.pdf')
    print(len(reader.pages))

    # Guard against a page yielding no text so the join never sees a non-string.
    return "".join(page.extract_text() or "" for page in reader.pages[:3])

@tool
def save_flashcards_to_csv(flashcards_data: str, filename: str) -> str:
    """
    Writes flashcards to a CSV file with a "Front"/"Back" header row (importable into Anki).
    Lines that contain no semicolon are skipped. Returns a status message: the number of
    cards saved on success, or the error text on failure.

    Args:
        flashcards_data: A string containing flashcards separated by newlines,
                         where the Question and Answer are separated by a semicolon (;).
                         Example: "What is X?;It is Y.\nWhy is Z?;Because A."
        filename: The name of the file to save (e.g., 'output.csv').
    """
    try:
        cards = []
        for raw_line in flashcards_data.strip().split('\n'):
            # Only the first semicolon separates question from answer, so an
            # answer may itself contain semicolons.
            front, separator, back = raw_line.partition(';')
            if not separator:
                continue
            cards.append([front.strip(), back.strip()])

        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            csv.writer(csv_file).writerows([["Front", "Back"], *cards])

        return f"Successfully saved {len(cards)} flashcards to {filename}."
    except Exception as e:
        # Report the failure as text so the calling agent can read and react to it.
        return f"Error saving file: {e}"


# ------------------------
# Agent
# ------------------------
# Extra instructions appended to the agent's system prompt template in
# get_paper_agent(). They restrict the agent to research-paper requests and
# require each successful run to end with a call to `save_flashcards_to_csv`.
STRICT_PROMPT = """
************************************************************
CRITICAL INSTRUCTIONS - READ CAREFULLY
************************************************************
1. DOMAIN RESTRICTION (HARD):
   - You are a Research Paper Analysis Tool. You are NOT a general chatbot.
   - If the user asks about the weather, jokes, coding help unrelated to papers, or general chat, you MUST refuse.
   - Reply ONLY: "I can only analyze research papers. Please provide a topic or paper title."

2. MANDATORY OUTPUT FORMAT:
   - Your goal is NEVER just to "answer". Your goal is ALWAYS to generate a Flashcard CSV.
   - Every successful execution MUST end with a call to `save_flashcards_to_csv`.

3. YOUR WORKFLOW:
   - Step 1: Identify the paper (Search or Title).
   - Step 2: Download/Read the PDF.
   - Step 3: Extract 5-10 key concepts.
   - Step 4: Create flashcards.
   - Step 5: SAVE the CSV.
************************************************************
"""

# Module-level cache for the agent built by get_paper_agent().
_agent = None


def get_paper_agent():
    """
    Return the shared flashcard agent, building it on the first call.

    The agent is a CodeAgent backed by an OpenAIModel and equipped with the
    paper tools defined in this module plus a WebSearchTool. STRICT_PROMPT is
    appended to its system prompt template before it is cached in `_agent`.
    """
    global _agent

    if not _agent:
        print("AGENT INITIALIZING")

        llm = OpenAIModel(
            api_key=OPENAI_API_KEY,
            model_id="gpt-5-nano",
        )

        toolbox = [
            get_paper_recommendation,
            get_paper_id_by_title,
            download_paper_by_id,
            read_pdf_file,
            save_flashcards_to_csv,
            WebSearchTool(),
        ]

        flashcard_agent = CodeAgent(
            tools=toolbox,
            model=llm,
            name="flashcard_agent",
            description="Creates flashcards from research papers",
            max_steps=10,
            add_base_tools=True,
        )

        # Extend the default system prompt with the domain restrictions.
        flashcard_agent.prompt_templates["system_prompt"] += STRICT_PROMPT
        _agent = flashcard_agent

    return _agent