Spaces:
Sleeping
Sleeping
| import os | |
| import requests | |
| import json | |
| import csv | |
| import arxiv | |
| from bs4 import BeautifulSoup | |
| from huggingface_hub import HfApi | |
| from pypdf import PdfReader | |
| from smolagents import tool, CodeAgent, WebSearchTool, OpenAIModel | |
| # ------------------------ | |
| # Secrets | |
| # ------------------------ | |
# Read the OpenAI key from the environment once, at import time, and fail fast
# when it is absent or empty so the problem surfaces before any agent is built.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise RuntimeError("Missing OPENAI_API_KEY environment variable")
| # ------------------------ | |
| # Tools | |
| # ------------------------ | |
def get_paper_recommendation() -> str:
    """
    This is a tool that returns the titles of the top papers listed on Hugging Face daily papers.

    It downloads https://huggingface.co/papers, looks at every
    ``div.SVELTE_HYDRATER.contents`` element, parses the JSON stored in its
    ``data-props`` attribute and, when that JSON has a ``dailyPapers`` entry,
    collects the titles of its first ten items (in the order the page lists
    them; presumably by upvotes -- confirm against the site).

    Returns:
        A list with up to ten paper titles, an empty string when no
        ``dailyPapers`` payload was found, or None when the HTTP request
        failed. (The ``str`` annotation is kept unchanged for compatibility.)
    """
    url = "https://huggingface.co/papers"
    try:
        # Without a timeout a stalled connection would block the agent forever.
        # Timeout errors are RequestException subclasses, so they are handled below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching the HTML: {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    top_paper = ""
    for container in soup.find_all('div', class_='SVELTE_HYDRATER contents'):
        data_props = container.get('data-props', '')
        if not data_props:
            continue
        try:
            # The parser has already decoded HTML entities in attribute
            # values, so the attribute can be fed to json.loads directly.
            json_data = json.loads(data_props)
        except json.JSONDecodeError:
            continue
        # Only a JSON object can carry the 'dailyPapers' key we are after.
        if not isinstance(json_data, dict) or 'dailyPapers' not in json_data:
            continue
        papers = json_data['dailyPapers'][:10]
        # Skip malformed entries instead of crashing on a missing title.
        top_paper = [
            paper["title"]
            for paper in papers
            if isinstance(paper, dict) and "title" in paper
        ]
    return top_paper
def get_paper_id_by_title(title: str) -> str:
    """
    This is a tool that returns the arxiv paper id by its title.
    It returns the id of the first paper the Hugging Face paper search yields
    for the title, or None when the search yields nothing.

    Args:
        title: The paper title for which to get the id.
    """
    api = HfApi()
    papers = api.list_papers(query=title)
    if not papers:
        return None
    # A lazy iterable is truthy even when it is empty, so emptiness can only be
    # detected by asking for the first item; the default avoids StopIteration.
    paper = next(iter(papers), None)
    if paper is None:
        return None
    return paper.id
def download_paper_by_id(paper_id: str) -> None:
    """
    This tool gets the id of a paper and downloads it from arxiv. It saves the paper locally
    in the current directory as "paper.pdf".

    Args:
        paper_id: The id of the paper to download.

    Raises:
        ValueError: If the arxiv search yields no result for the given id.
    """
    search = arxiv.Search(id_list=[paper_id])
    # Use a default so an empty result set becomes a clear error instead of a
    # bare StopIteration escaping from next().
    paper = next(arxiv.Client().results(search), None)
    if paper is None:
        raise ValueError(f"No arxiv paper found for id {paper_id!r}")
    paper.download_pdf(filename="paper.pdf")
    return None
def read_pdf_file(file_path: str) -> str:
    """
    This function reads the first three pages of a PDF file and returns its content as a string.

    Args:
        file_path: The path to the PDF file.

    Returns:
        A string containing the text extracted from (at most) the first three
        pages of the PDF file, concatenated in page order.
    """
    # Open the file the caller asked for rather than a fixed file name.
    reader = PdfReader(file_path)
    # `or ""` guards against a page for which no text comes back.
    return "".join((page.extract_text() or "") for page in reader.pages[:3])
def save_flashcards_to_csv(flashcards_data: str, filename: str) -> str:
    """
    Writes flashcards to a CSV file with a "Front"/"Back" header (importable into Anki).

    Args:
        flashcards_data: A string containing flashcards separated by newlines,
            where the Question and Answer are separated by a semicolon (;).
            Only the first semicolon on a line separates the two sides; lines
            without a semicolon are skipped.
            Example: "What is X?;It is Y.\nWhy is Z?;Because A."
        filename: The name of the file to save (e.g., 'output.csv').

    Returns:
        A success message with the number of saved flashcards, or an
        "Error saving file: ..." message if anything went wrong.
    """
    try:
        # partition() yields an empty separator when the line has no ';',
        # which is how question-less lines are filtered out.
        split_lines = (
            entry.partition(';') for entry in flashcards_data.strip().split('\n')
        )
        rows = [
            [front.strip(), back.strip()]
            for front, separator, back in split_lines
            if separator
        ]
        with open(filename, 'w', newline='', encoding='utf-8') as handle:
            # Header row first, then one row per flashcard.
            csv.writer(handle).writerows([["Front", "Back"], *rows])
        return f"Successfully saved {len(rows)} flashcards to {filename}."
    except Exception as e:
        return f"Error saving file: {e}"
| # ------------------------ | |
| # Agent | |
| # ------------------------ | |
# Extra instructions that get_paper_agent() appends to the agent's system
# prompt: they restrict the agent to research-paper analysis, require every
# successful run to end with a call to `save_flashcards_to_csv`, and spell out
# the five-step workflow. The text is sent to the model verbatim.
STRICT_PROMPT = """
************************************************************
CRITICAL INSTRUCTIONS - READ CAREFULLY
************************************************************
1. DOMAIN RESTRICTION (HARD):
- You are a Research Paper Analysis Tool. You are NOT a general chatbot.
- If the user asks about the weather, jokes, coding help unrelated to papers, or general chat, you MUST refuse.
- Reply ONLY: "I can only analyze research papers. Please provide a topic or paper title."
2. MANDATORY OUTPUT FORMAT:
- Your goal is NEVER just to "answer". Your goal is ALWAYS to generate a Flashcard CSV.
- Every successful execution MUST end with a call to `save_flashcards_to_csv`.
3. YOUR WORKFLOW:
- Step 1: Identify the paper (Search or Title).
- Step 2: Download/Read the PDF.
- Step 3: Extract 5-10 key concepts.
- Step 4: Create flashcards.
- Step 5: SAVE the CSV.
************************************************************
"""
# Module-level cache holding the single agent instance once it has been built.
_agent = None


def get_paper_agent():
    """
    Return the shared flashcard agent, building it on the first call.

    The first call prints "AGENT INITIALIZING", creates an OpenAIModel for
    "gpt-5-nano" using OPENAI_API_KEY, wraps it in a CodeAgent equipped with
    the paper tools plus a WebSearchTool, appends STRICT_PROMPT to the agent's
    system prompt template, and caches the result in the module-level
    ``_agent``. Later calls return the cached agent.

    Returns:
        The cached CodeAgent instance.
    """
    global _agent
    if not _agent:
        print("AGENT INITIALIZING")
        llm = OpenAIModel(api_key=OPENAI_API_KEY, model_id="gpt-5-nano")
        # NOTE(review): the tool functions are passed as plain callables here --
        # confirm they are wrapped as smolagents tools where they are defined.
        toolbox = [
            get_paper_recommendation,
            get_paper_id_by_title,
            download_paper_by_id,
            read_pdf_file,
            save_flashcards_to_csv,
            WebSearchTool(),
        ]
        built = CodeAgent(
            tools=toolbox,
            model=llm,
            name="flashcard_agent",
            description="Creates flashcards from research papers",
            max_steps=10,
            add_base_tools=True,
        )
        # Extend the stock system prompt with the domain and output rules.
        built.prompt_templates["system_prompt"] += STRICT_PROMPT
        _agent = built
    return _agent