Spaces:

beautiful-code
/

ai_workflows

Runtime error

App Files Files Community

theRealNG committed on Jun 21, 2024

Commit

3307cbd

1 Parent(s): 24cc3f2

Refactor: Moved to chains instead of crew

Browse files

Files changed (4) hide show

crew/research_article_suggester.py +112 -16
requirements.txt +1 -0
test.py +1 -1
tools/scrape_website.py +11 -0

crew/research_article_suggester.py CHANGED Viewed

@@ -1,20 +1,25 @@
 from crewai import Agent, Task, Crew
 from langchain_openai import ChatOpenAI
 from tavily import TavilyClient
 import os
 import json
 from pydantic import BaseModel, Field
 from crewai.tasks.task_output import TaskOutput
 from datetime import datetime, timedelta
-from tools.scrape_website import scrape_tool
-MAX_RESULTS = 5
 AGE_OF_RESEARCH_PAPER = 60
 class RecentArticleSuggester:
     """
-    Suggests recent research articles based on a given topic.
     """
     def __init__(self):
@@ -27,24 +32,115 @@ class RecentArticleSuggester:
     def _suggest_research_papers(self):
         query = f"research papers on {self.topic} published in the last week"
-        results = self.tavily_client.search(query, max_results=MAX_RESULTS)['results']
-        print("Search Results: ", results)
-        pitch_crew = self._create_pitch_crew()
         research_paper_suggestions = []
         for result in results:
             try:
-                info = pitch_crew.kickoff(inputs={
-                    "title": result["title"],
-                    "url": result["url"],
-                    "content": result["content"]
-                })
-                research_paper_suggestions = research_paper_suggestions + \
-                    [info]
             except BaseException as e:
-                print(f"Error processing article '{result['title']}': {e}")
         return research_paper_suggestions
     def _create_pitch_crew(self):
         information_gatherer = Agent(
             role="Research Paper Information Retriever",
@@ -64,12 +160,12 @@ class RecentArticleSuggester:
                 date_obj = datetime.strptime(
                     article_info['published_on'], "%d/%m/%Y")
-                # Calculate the date that was 14 days ago from today
                 start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
                 # Compare if the input date is older
                 if date_obj < start_date:
-                    raise BaseException(f"{date_obj} Older than given timeframe {start_date}")
             except ValueError:
                 print("Invalid date format. Please use dd/mm/yyyy.")

 from crewai import Agent, Task, Crew
 from langchain_openai import ChatOpenAI
 from tavily import TavilyClient
+import arxiv
 import os
 import json
 from pydantic import BaseModel, Field
 from crewai.tasks.task_output import TaskOutput
 from datetime import datetime, timedelta
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.messages import SystemMessage, AIMessage, HumanMessage
+from langchain_core.output_parsers import JsonOutputParser
+from tools.scrape_website import scrape_tool, CustomScrapeWebsiteTool
+MAX_RESULTS = 2
 AGE_OF_RESEARCH_PAPER = 60
 class RecentArticleSuggester:
     """
+    Suggests recent research papers based on a given topic.
     """
     def __init__(self):
     def _suggest_research_papers(self):
         query = f"research papers on {self.topic} published in the last week"
+        results = []
+        print("\nSearching for papers on Tavily...")
+        results = self.tavily_client.search(
+            query, max_results=MAX_RESULTS)['results']
+        print("\nSearching for papers on Arxiv...")
+        arxiv_results = arxiv.Search(
+            query=self.topic,
+            max_results=MAX_RESULTS,
+            sort_by=arxiv.SortCriterion.SubmittedDate
+        )
+        for result in arxiv_results.results():
+            paper = {
+                "title": result.title,
+                "authors": ", ".join(str(author) for author in result.authors),
+                "content": result.summary,
+                # "published_on": result.submitted.date(),
+                "url": result.entry_id,
+                "pdf_url": result.pdf_url
+            }
+            results.append(paper)
+        # pitch_crew = self._create_pitch_crew()
         research_paper_suggestions = []
         for result in results:
             try:
+                info = self._article_pitch(result)
+                # info = pitch_crew.kickoff(inputs={
+                #     "title": result["title"],
+                #     "url": result["url"],
+                #     "content": result["content"]
+                # })
+                if info is not None:
+                    research_paper_suggestions = research_paper_suggestions + \
+                        [info]
             except BaseException as e:
+                print(
+                    f"Error processing article '{result['title']}': {e}\n\n {e.__traceback__}")
         return research_paper_suggestions
+    def _gather_information(self, article):
+        print(f"\nScraping website: {article['url']}")
+        article_content = CustomScrapeWebsiteTool(article["url"])
+        print(f"\nGathering information from website: {article['url']}")
+        parser = JsonOutputParser(pydantic_object=ResearchPaper)
+        prompt_template = ChatPromptTemplate.from_messages([
+            SystemMessage(
+                "You are Research Paper Information Retriever. You are an expert in gathering required details about the given research paper."
+                "Your personal goal is: Retrieve the author information and date the research paper was published in the format of dd/mm/yyyy."
+                f"Formatting Instructions: {parser.get_format_instructions()}"
+            ),
+            HumanMessage(
+                f"Here is the information about the research paper title: {article['title']}, url: {article['url']},"
+                f" summary: \n{article['content']}.\n\n Research Paper content:\n{article_content}"
+            )
+        ])
+        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
+        information_scrapper_chain = prompt_template | llm | parser
+        article_info = information_scrapper_chain.invoke({})
+        print("\nGathered Article Info: ", article_info)
+        article_info['article_content'] = article_content
+        return article_info
+    def _article_pitch(self, article):
+        article_info = self._gather_information(article)
+        try:
+            date_obj = datetime.strptime(
+                article_info['published_on'], "%d/%m/%Y")
+            start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
+            # Compare if the input date is older
+            if date_obj < start_date:
+                print(
+                    f"\nRejecting research paper {article['title']} because it was published on {date_obj},"
+                    f" which is before the expected timeframe {start_date} & {datetime.now()}")
+                return None
+        except ValueError:
+            print("Invalid date format. Please use dd/mm/yyyy.")
+            return None
+        print(f"\nCreating pitch for the research paper: {article['title']}")
+        pitch_parser = JsonOutputParser(pydantic_object=ResearchPaperWithPitch)
+        pitch_template = ChatPromptTemplate.from_messages([
+            SystemMessage(
+                "You are Curiosity Catalyst. As a Curiosity Catalyst, you know exactly how to pique the user's curiosity to read the research paper."
+                "Your personal goal is: To pique the user's curiosity to read the research paper."
+                "Read the Research Paper Content to create a pitch."
+                f"Formatting Instructions: {pitch_parser.get_format_instructions()}"
+            ),
+            HumanMessage(
+                f"Here is the information about the research paper title: {article_info['title']}, url: {article_info['url']}, "
+                f"published_on: {article_info['published_on']}, authors: {article_info['author']}, "
+                f"summary: \n{article_info['summary']}.\n\n Research Paper content:\n{article_info['article_content']}"
+            )
+        ])
+        pitch_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
+        pitcher_chain = pitch_template | pitch_llm | pitch_parser
+        article_pitch = pitcher_chain.invoke({})
+        print("\nResearch Paper with the pitch: ", article_pitch)
+        return article_pitch
+    # Deprecated
     def _create_pitch_crew(self):
         information_gatherer = Agent(
             role="Research Paper Information Retriever",
                 date_obj = datetime.strptime(
                     article_info['published_on'], "%d/%m/%Y")
                 start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
                 # Compare if the input date is older
                 if date_obj < start_date:
+                    raise BaseException(
+                        f"{date_obj} Older than given timeframe {start_date}")
             except ValueError:
                 print("Invalid date format. Please use dd/mm/yyyy.")

requirements.txt CHANGED Viewed

@@ -6,3 +6,4 @@ langchain_google_genai
 langchain_openai
 streamlit
 tavily-python

 langchain_openai
 streamlit
 tavily-python
+arxiv

test.py CHANGED Viewed

@@ -2,4 +2,4 @@ from crew.research_article_suggester import RecentArticleSuggester
 suggester = RecentArticleSuggester()
 results = suggester.kickoff(inputs={"topic": "GenAI"})
-print(results)

 suggester = RecentArticleSuggester()
 results = suggester.kickoff(inputs={"topic": "GenAI"})
+print("\nFinal Results: \n\n", results)

tools/scrape_website.py CHANGED Viewed

@@ -1,3 +1,14 @@
 from crewai_tools import ScrapeWebsiteTool
 scrape_tool = ScrapeWebsiteTool()

 from crewai_tools import ScrapeWebsiteTool
+import requests
+from bs4 import BeautifulSoup
 scrape_tool = ScrapeWebsiteTool()
+def CustomScrapeWebsiteTool(url):
+    response = requests.get(url)
+    parsed = BeautifulSoup(response.content, "html.parser")
+    text = parsed.get_text()
+    text = '\n'.join([i for i in text.split('\n') if i.strip() != ''])
+    text = ' '.join([i for i in text.split(' ') if i.strip() != ''])
+    return text