"""Suggest recently published research papers on a topic, each with a pitch.

Candidate papers are collected from Tavily, arXiv and Semantic Scholar, then
passed through two LLM steps: one that extracts the author and publication
date, and one that writes a short pitch for the papers that are recent enough.
"""

import json
import os
import traceback
from datetime import datetime, timedelta

import arxiv
from crewai import Agent, Task, Crew
from crewai.tasks.task_output import TaskOutput
from langchain_core.messages import SystemMessage, AIMessage, HumanMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from semanticscholar import SemanticScholar
from tavily import TavilyClient

from workflows.tools.scrape_website import scrape_tool, CustomScrapeWebsiteTool

# Maximum number of candidates requested from each search backend.
MAX_RESULTS = 2
# Papers published more than this many days ago are rejected.
AGE_OF_RESEARCH_PAPER = 60


class RecentArticleSuggester:
    """
    Suggests recent research papers based on a given topic.
    """

    def __init__(self):
        self.tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

    def kickoff(self, inputs=None):
        """Run the suggestion pipeline for ``inputs["topic"]``.

        Args:
            inputs: Mapping that must contain a ``"topic"`` key.

        Returns:
            The list produced by ``_suggest_research_papers``.

        Raises:
            KeyError: If ``"topic"`` is missing (including when ``inputs``
                is omitted).
        """
        # Avoid a shared mutable default; an omitted argument still behaves
        # like an empty dict and raises KeyError on the lookup below.
        inputs = {} if inputs is None else inputs
        self.topic = inputs["topic"]
        return self._suggest_research_papers()

    def _suggest_research_papers(self):
        """Collect candidates from every backend and pitch each of them.

        Returns:
            A list with one pitch (as returned by ``_article_pitch``) per
            accepted candidate. Candidates for which ``_article_pitch``
            returns ``None`` or raises an ``Exception`` are skipped.
        """
        query = f"research papers on {self.topic} published in the last week"

        print("\nSearching for papers on Tavily...")
        results = self.tavily_client.search(
            query, max_results=MAX_RESULTS)['results']

        print("\nSearching for papers on Arxiv...")
        arxiv_results = arxiv.Search(
            query=self.topic,
            max_results=MAX_RESULTS,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )
        for result in arxiv_results.results():
            results.append({
                "title": result.title,
                "authors": ", ".join(str(author) for author in result.authors),
                "content": result.summary,
                "url": result.entry_id,
                "pdf_url": result.pdf_url,
            })

        print("\nSearching for papers on Semanticscholar...")
        sch = SemanticScholar()
        semantic_results = sch.search_paper(
            self.topic,
            sort='publicationDate:desc',
            bulk=True,
            fields=['title', 'url', 'authors', 'publicationDate', 'abstract'])
        for result in semantic_results[:MAX_RESULTS]:
            results.append({
                "title": result.title,
                "authors": ", ".join(str(author.name)
                                     for author in result.authors),
                "content": result.abstract,
                "published_on": result.publicationDate,
                "url": result.url,
            })

        research_paper_suggestions = []
        for result in results:
            try:
                info = self._article_pitch(result)
            except Exception as e:
                # Best effort: one failing article must not abort the batch.
                # Catch Exception rather than BaseException so Ctrl-C and
                # SystemExit still stop the run, and print the formatted
                # traceback instead of the traceback object's repr.
                print(
                    f"Error processing article '{result['title']}': {e}\n\n"
                    f" {traceback.format_exc()}")
                continue
            if info is not None:
                research_paper_suggestions.append(info)

        return research_paper_suggestions

    def _gather_information(self, article):
        """Ask the LLM for the author and publication date of ``article``.

        Args:
            article: Dict describing the paper; ``article["url"]`` is read
                here and the whole dict is embedded in the prompt.

        Returns:
            The parsed LLM output with an extra ``"article_content"`` entry
            holding the value obtained from ``CustomScrapeWebsiteTool``.
        """
        print(f"\nScraping website: {article['url']}")
        # NOTE(review): CustomScrapeWebsiteTool is project-local; presumably
        # calling it returns the scraped page text - confirm it is not a tool
        # object whose repr would end up in the prompt.
        article_content = CustomScrapeWebsiteTool(article["url"])

        print(f"\nGathering information from website: {article['url']}")
        parser = JsonOutputParser(pydantic_object=ResearchPaper)
        # Message objects (not templates) are used, so the prompt has no
        # input variables and the chain is invoked with an empty dict.
        prompt_template = ChatPromptTemplate.from_messages([
            SystemMessage(
                "You are Research Paper Information Retriever. You are an expert in gathering required details about the given research paper."
                "Your personal goal is: Retrieve the author information and date the research paper was published in the format of dd/mm/yyyy."
                f"Formatting Instructions: {parser.get_format_instructions()}"
            ),
            HumanMessage(
                f"Here is the information about the research paper:\n {article}\n\n"
                f"Research Paper content:\n{article_content}"
            )
        ])

        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
        information_scrapper_chain = prompt_template | llm | parser

        article_info = information_scrapper_chain.invoke({})
        print("\nGathered Article Info: ", article_info)
        article_info['article_content'] = article_content

        return article_info

    def _article_pitch(self, article):
        """Create a pitch for ``article`` if it was published recently.

        Args:
            article: Dict describing the paper; ``"title"`` and ``"url"``
                are read.

        Returns:
            The parsed pitch from the LLM, or ``None`` when the publication
            date is missing, is not in dd/mm/yyyy format, or is more than
            ``AGE_OF_RESEARCH_PAPER`` days in the past.
        """
        article_info = self._gather_information(article)

        try:
            # .get() plus the TypeError clause routes a missing or non-string
            # date (the LLM output is not guaranteed to follow the schema)
            # through the same rejection path as a malformed one.
            date_obj = datetime.strptime(
                article_info.get('published_on'), "%d/%m/%Y")
        except (TypeError, ValueError):
            print("Invalid date format. Please use dd/mm/yyyy.")
            return None

        start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
        # Reject papers published before the start of the accepted window.
        if date_obj < start_date:
            print(
                f"\nRejecting research paper {article['title']} because it was published on {date_obj},"
                f" which is before the expected timeframe {start_date} & {datetime.now()}")
            return None

        print(f"\nCreating pitch for the research paper: {article['title']}")

        pitch_parser = JsonOutputParser(pydantic_object=ResearchPaperWithPitch)
        pitch_template = ChatPromptTemplate.from_messages([
            SystemMessage(
                "You are Curiosity Catalyst. As a Curiosity Catalyst, you know exactly how to pique the user's curiosity to read the research paper."
                "Your personal goal is: To pique the user's curiosity to read the research paper."
                "Read the Research Paper Content to create a pitch."
                f"Formatting Instructions: {pitch_parser.get_format_instructions()}"
            ),
            HumanMessage(
                f"Here is the information about the research paper:\n {article_info}\n\n"
                f"Research Paper content:\n{article_info['article_content']}"
            )
        ])

        pitch_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
        pitcher_chain = pitch_template | pitch_llm | pitch_parser

        article_pitch = pitcher_chain.invoke({})
        print("\nResearch Paper with the pitch: ", article_pitch)

        return article_pitch

    # Deprecated
    def _create_pitch_crew(self):
        """Build the crewai crew that gathers paper details and pitches them.

        Deprecated: ``_suggest_research_papers`` calls ``_article_pitch``
        instead of kicking off this crew.

        Returns:
            A ``Crew`` with an information-gathering task followed by a
            pitch-writing task.
        """
        information_gatherer = Agent(
            role="Research Paper Information Retriever",
            goal="Gather required information for the given research papers.",
            verbose=True,
            backstory=(
                "You are an expert in gathering required details "
                "about the given research paper."
            ),
            llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2),
            tools=[scrape_tool],
        )

        def evaluator(output: TaskOutput):
            # Task callback: raises when the gathered publication date is
            # older than the accepted window, returns False when the date
            # cannot be parsed.
            article_info = json.loads(output.exported_output)
            try:
                date_obj = datetime.strptime(
                    article_info['published_on'], "%d/%m/%Y")
                start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
                # Compare if the input date is older
                if date_obj < start_date:
                    raise BaseException(
                        f"{date_obj} Older than given timeframe {start_date}")
            except ValueError:
                print("Invalid date format. Please use dd/mm/yyyy.")
                return False

        information_gathering_task = Task(
            description=(
                "Here is the information of a research paper: title {title}, "
                "url: {url} and content: {content}.\n"
                "Gather following information about the research paper: "
                "1. When was the research paper published and present it in dd/mm/yyyy format. "
                "2. Who is the author of the research paper. "
            ),
            expected_output=(
                "Following details of the research paper: title, url, "
                "content/summary, date it was published and author."
            ),
            agent=information_gatherer,
            # Was misspelled "async_exection"; the crewai field is
            # async_execution.
            async_execution=False,
            output_json=ResearchPaper,
            callback=evaluator,
        )

        pitcher = Agent(
            role="Curiosity Catalyst",
            goal="To pique the user's curiosity to read the research paper.",
            verbose=True,
            backstory=(
                "As a Curiosity Catalyst, you know exactly how to pique the user's curiosity "
                "to read the research paper."
            ),
            llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2),
            tools=[scrape_tool],
        )

        create_pitch = Task(
            description=(
                "Craft the pitch so to that it teases the research paper's most intriguing aspects, "
                "by posing questions that the research paper might answer or "
                "highlighting surprising facts to pique the user's curiosity "
                " to read the research paper so that he is up-to-date with latest research."
            ),
            expected_output=(
                "All the details of the research paper along with the pitch."
            ),
            tools=[scrape_tool],
            agent=pitcher,
            context=[information_gathering_task],
            output_json=ResearchPaperWithPitch,
        )

        crew = Crew(
            agents=[information_gatherer, pitcher],
            tasks=[information_gathering_task, create_pitch],
            verbose=True,
            max_rpm=4,
        )

        return crew


class ResearchPaper(BaseModel):
    """Schema the LLM is asked to fill in for a research paper."""

    title: str
    url: str
    summary: str
    author: str = Field(description="author of the article")
    published_on: str = Field(
        description="Date the article was publised on in foramt dd/mm/yyyy")


class ResearchPaperWithPitch(ResearchPaper):
    """``ResearchPaper`` plus the pitch text written for it."""

    pitch: str