"""Tools and agent factory for turning research papers into Anki flashcards.

The module exposes a set of smolagents tools (find a paper on Hugging Face
daily papers, resolve its arXiv id, download it, read it, save flashcards)
and `get_paper_agent`, which builds a cached `CodeAgent` wired to them.
"""

import os
import requests
import json
import csv
import arxiv
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
from pypdf import PdfReader
from smolagents import tool, CodeAgent, WebSearchTool, OpenAIModel

# ------------------------
# Secrets
# ------------------------
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("Missing OPENAI_API_KEY environment variable")


# ------------------------
# Tools
# ------------------------
# NOTE: smolagents builds each tool's description from its docstring, so the
# docstrings below are model-facing text; keep them short and accurate.

@tool
def get_paper_recommendation() -> str:
    """
    This is a tool that returns the titles of the top papers (at most 10) listed
    on the Hugging Face daily papers page.
    """
    # NOTE(review): the annotation says `str`, but on success this returns a
    # list of titles, "" when no paper data is found and None on a network
    # error. The return values are kept as-is for backward compatibility.
    try:
        url = "https://huggingface.co/papers"
        # A timeout keeps the agent from hanging forever on a stalled connection.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching the HTML: {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # The paper list is embedded as JSON in the "data-props" attribute of the
    # hydration containers. BeautifulSoup already decodes HTML entities in
    # attribute values, so the string can be handed to json.loads directly.
    containers = soup.find_all('div', class_='SVELTE_HYDRATER contents')
    top_paper = ""

    for container in containers:
        data_props = container.get('data-props', '')
        if not data_props:
            continue
        try:
            json_data = json.loads(data_props)
        except json.JSONDecodeError:
            # Not every container carries valid JSON; skip those.
            continue
        if 'dailyPapers' in json_data:
            papers = json_data['dailyPapers'][:10]
            top_paper = [paper["title"] for paper in papers]

    return top_paper


@tool
def get_paper_id_by_title(title: str) -> str:
    """
    This is a tool that returns the arxiv paper id by its title.
    It returns the id of the first matching paper, or None if nothing matches.

    Args:
        title: The paper title for which to get the id.
    """
    api = HfApi()
    papers = api.list_papers(query=title)
    # The result may be a lazy iterable, which is always truthy; ask for the
    # first element with a default instead of testing the iterable itself.
    paper = next(iter(papers), None)
    if paper is None:
        return None
    return paper.id


@tool
def download_paper_by_id(paper_id: str) -> None:
    """
    This tool gets the id of a paper and downloads it from arxiv. It saves the paper locally
    in the current directory as "paper.pdf".

    Args:
        paper_id: The id of the paper to download.
    """
    results = arxiv.Client().results(arxiv.Search(id_list=[paper_id]))
    paper = next(results, None)
    if paper is None:
        # Surface a readable error instead of a bare StopIteration.
        raise ValueError(f"No arxiv paper found for id: {paper_id}")
    paper.download_pdf(filename="paper.pdf")
    return None


@tool
def read_pdf_file(file_path: str) -> str:
    """
    This function reads the first three pages of a PDF file and returns its content as a string.

    Args:
        file_path: The path to the PDF file.

    Returns:
        A string containing the content of the PDF file.
    """
    # Open the file the caller asked for (previously hard-coded to 'paper.pdf').
    reader = PdfReader(file_path)
    print(len(reader.pages))
    first_pages = reader.pages[:3]
    # `or ""` guards against pages for which no text could be extracted.
    return "".join(page.extract_text() or "" for page in first_pages)


@tool
def save_flashcards_to_csv(flashcards_data: str, filename: str) -> str:
    """
    Saves a list of flashcards to a CSV file (importable into Anki).

    Args:
        flashcards_data: A string containing flashcards separated by newlines, where the Question and Answer are separated by a semicolon (;). Example: "What is X?;It is Y.\\nWhy is Z?;Because A."
        filename: The name of the file to save (e.g., 'output.csv').
    """
    try:
        rows = []
        for line in flashcards_data.strip().split('\n'):
            # Split only on the first semicolon so answers may contain ';'.
            question, separator, answer = line.partition(';')
            if separator:
                rows.append([question.strip(), answer.strip()])

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["Front", "Back"])  # Header
            writer.writerows(rows)

        return f"Successfully saved {len(rows)} flashcards to {filename}."
    except Exception as e:
        # Tool boundary: report the failure to the agent as text rather than
        # aborting the whole run.
        return f"Error saving file: {e}"


# ------------------------
# Agent
# ------------------------
STRICT_PROMPT = """
************************************************************
CRITICAL INSTRUCTIONS - READ CAREFULLY
************************************************************
1. DOMAIN RESTRICTION (HARD):
   - You are a Research Paper Analysis Tool. You are NOT a general chatbot.
   - If the user asks about the weather, jokes, coding help unrelated to papers, or general chat, you MUST refuse.
   - Reply ONLY: "I can only analyze research papers. Please provide a topic or paper title."

2. MANDATORY OUTPUT FORMAT:
   - Your goal is NEVER just to "answer". Your goal is ALWAYS to generate a Flashcard CSV.
   - Every successful execution MUST end with a call to `save_flashcards_to_csv`.

3. YOUR WORKFLOW:
   - Step 1: Identify the paper (Search or Title).
   - Step 2: Download/Read the PDF.
   - Step 3: Extract 5-10 key concepts.
   - Step 4: Create flashcards.
   - Step 5: SAVE the CSV.
************************************************************
"""

# Module-level cache so the agent (and its model client) is built only once.
_agent = None


def get_paper_agent():
    """Return the shared flashcard `CodeAgent`, creating it on first use.

    The agent is configured with the paper tools defined in this module plus
    `WebSearchTool`, and `STRICT_PROMPT` is appended to its system prompt.

    Returns:
        The cached `CodeAgent` instance.
    """
    global _agent
    if _agent is not None:
        return _agent

    print("AGENT INITIALIZING")

    model = OpenAIModel(
        api_key=OPENAI_API_KEY,
        model_id="gpt-5-nano",
    )

    agent = CodeAgent(
        tools=[
            get_paper_recommendation,
            get_paper_id_by_title,
            download_paper_by_id,
            read_pdf_file,
            save_flashcards_to_csv,
            WebSearchTool(),
        ],
        model=model,
        name="flashcard_agent",
        description="Creates flashcards from research papers",
        max_steps=10,
        add_base_tools=True,
    )

    # Append the domain restrictions after the library's default system prompt.
    agent.prompt_templates["system_prompt"] += STRICT_PROMPT

    _agent = agent
    return agent