theRealNG committed on
Commit
4014b8f
·
unverified ·
2 Parent(s): 24cc3f21e01d68

Merge pull request #7 from beautiful-code/suggest_recent_articles

Browse files
crew/research_article_suggester.py CHANGED
@@ -1,20 +1,26 @@
1
  from crewai import Agent, Task, Crew
2
  from langchain_openai import ChatOpenAI
3
  from tavily import TavilyClient
 
 
4
  import os
5
  import json
6
  from pydantic import BaseModel, Field
7
  from crewai.tasks.task_output import TaskOutput
8
  from datetime import datetime, timedelta
 
 
 
9
 
10
- from tools.scrape_website import scrape_tool
11
 
12
- MAX_RESULTS = 5
13
  AGE_OF_RESEARCH_PAPER = 60
14
 
 
15
  class RecentArticleSuggester:
16
  """
17
- Suggests recent research articles based on a given topic.
18
  """
19
 
20
  def __init__(self):
@@ -27,24 +33,129 @@ class RecentArticleSuggester:
27
 
28
  def _suggest_research_papers(self):
29
  query = f"research papers on {self.topic} published in the last week"
30
- results = self.tavily_client.search(query, max_results=MAX_RESULTS)['results']
31
- print("Search Results: ", results)
32
- pitch_crew = self._create_pitch_crew()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  research_paper_suggestions = []
34
  for result in results:
35
  try:
36
- info = pitch_crew.kickoff(inputs={
37
- "title": result["title"],
38
- "url": result["url"],
39
- "content": result["content"]
40
- })
41
- research_paper_suggestions = research_paper_suggestions + \
42
- [info]
 
 
43
  except BaseException as e:
44
- print(f"Error processing article '{result['title']}': {e}")
 
45
 
46
  return research_paper_suggestions
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def _create_pitch_crew(self):
49
  information_gatherer = Agent(
50
  role="Research Paper Information Retriever",
@@ -64,12 +175,12 @@ class RecentArticleSuggester:
64
  date_obj = datetime.strptime(
65
  article_info['published_on'], "%d/%m/%Y")
66
 
67
- # Calculate the date that was 14 days ago from today
68
  start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
69
 
70
  # Compare if the input date is older
71
  if date_obj < start_date:
72
- raise BaseException(f"{date_obj} Older than given timeframe {start_date}")
 
73
 
74
  except ValueError:
75
  print("Invalid date format. Please use dd/mm/yyyy.")
 
1
  from crewai import Agent, Task, Crew
2
  from langchain_openai import ChatOpenAI
3
  from tavily import TavilyClient
4
+ from semanticscholar import SemanticScholar
5
+ import arxiv
6
  import os
7
  import json
8
  from pydantic import BaseModel, Field
9
  from crewai.tasks.task_output import TaskOutput
10
  from datetime import datetime, timedelta
11
+ from langchain_core.prompts import ChatPromptTemplate
12
+ from langchain_core.messages import SystemMessage, AIMessage, HumanMessage
13
+ from langchain_core.output_parsers import JsonOutputParser
14
 
15
+ from tools.scrape_website import scrape_tool, CustomScrapeWebsiteTool
16
 
17
+ MAX_RESULTS = 2
18
  AGE_OF_RESEARCH_PAPER = 60
19
 
20
+
21
  class RecentArticleSuggester:
22
  """
23
+ Suggests recent research papers based on a given topic.
24
  """
25
 
26
  def __init__(self):
 
33
 
34
  def _suggest_research_papers(self):
35
  query = f"research papers on {self.topic} published in the last week"
36
+ results = []
37
+ print("\nSearching for papers on Tavily...")
38
+ results = self.tavily_client.search(
39
+ query, max_results=MAX_RESULTS)['results']
40
+
41
+ print("\nSearching for papers on Arxiv...")
42
+ arxiv_results = arxiv.Search(
43
+ query=self.topic,
44
+ max_results=MAX_RESULTS,
45
+ sort_by=arxiv.SortCriterion.SubmittedDate
46
+ )
47
+ for result in arxiv_results.results():
48
+ paper = {
49
+ "title": result.title,
50
+ "authors": ", ".join(str(author) for author in result.authors),
51
+ "content": result.summary,
52
+ # "published_on": result.submitted.date(),
53
+ "url": result.entry_id,
54
+ "pdf_url": result.pdf_url
55
+ }
56
+ results.append(paper)
57
+
58
+ print("\nSearching for papers on Semanticscholar...")
59
+ sch = SemanticScholar()
60
+ semantic_results = sch.search_paper(
61
+ self.topic, sort='publicationDate:desc', bulk=True,
62
+ fields=['title', 'url', 'authors', 'publicationDate', 'abstract'])
63
+ for result in semantic_results[:MAX_RESULTS]:
64
+ paper = {
65
+ "title": result.title,
66
+ "authors": ", ".join(str(author.name) for author in result.authors),
67
+ "content": result.abstract,
68
+ "published_on": result.publicationDate,
69
+ "url": result.url,
70
+ }
71
+ results.append(paper)
72
+
73
+ # pitch_crew = self._create_pitch_crew()
74
  research_paper_suggestions = []
75
  for result in results:
76
  try:
77
+ info = self._article_pitch(result)
78
+ # info = pitch_crew.kickoff(inputs={
79
+ # "title": result["title"],
80
+ # "url": result["url"],
81
+ # "content": result["content"]
82
+ # })
83
+ if info is not None:
84
+ research_paper_suggestions = research_paper_suggestions + \
85
+ [info]
86
  except BaseException as e:
87
+ print(
88
+ f"Error processing article '{result['title']}': {e}\n\n {e.__traceback__}")
89
 
90
  return research_paper_suggestions
91
 
92
+ def _gather_information(self, article):
93
+ print(f"\nScraping website: {article['url']}")
94
+ article_content = CustomScrapeWebsiteTool(article["url"])
95
+
96
+ print(f"\nGathering information from website: {article['url']}")
97
+ parser = JsonOutputParser(pydantic_object=ResearchPaper)
98
+ prompt_template = ChatPromptTemplate.from_messages([
99
+ SystemMessage(
100
+ "You are Research Paper Information Retriever. You are an expert in gathering required details about the given research paper."
101
+ "Your personal goal is: Retrieve the author information and date the research paper was published in the format of dd/mm/yyyy."
102
+ f"Formatting Instructions: {parser.get_format_instructions()}"
103
+ ),
104
+ HumanMessage(
105
+ f"Here is the information about the research paper:\n {article}\n\n"
106
+ f"Research Paper content:\n{article_content}"
107
+ )
108
+ ])
109
+ llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
110
+ information_scrapper_chain = prompt_template | llm | parser
111
+
112
+ article_info = information_scrapper_chain.invoke({})
113
+ print("\nGathered Article Info: ", article_info)
114
+ article_info['article_content'] = article_content
115
+ return article_info
116
+
117
+ def _article_pitch(self, article):
118
+ article_info = self._gather_information(article)
119
+ try:
120
+ date_obj = datetime.strptime(
121
+ article_info['published_on'], "%d/%m/%Y")
122
+
123
+ start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
124
+
125
+ # Compare if the input date is older
126
+ if date_obj < start_date:
127
+ print(
128
+ f"\nRejecting research paper {article['title']} because it was published on {date_obj},"
129
+ f" which is before the expected timeframe {start_date} & {datetime.now()}")
130
+ return None
131
+
132
+ except ValueError:
133
+ print("Invalid date format. Please use dd/mm/yyyy.")
134
+ return None
135
+
136
+ print(f"\nCreating pitch for the research paper: {article['title']}")
137
+ pitch_parser = JsonOutputParser(pydantic_object=ResearchPaperWithPitch)
138
+ pitch_template = ChatPromptTemplate.from_messages([
139
+ SystemMessage(
140
+ "You are Curiosity Catalyst. As a Curiosity Catalyst, you know exactly how to pique the user's curiosity to read the research paper."
141
+ "Your personal goal is: To pique the user's curiosity to read the research paper."
142
+ "Read the Research Paper Content to create a pitch."
143
+ f"Formatting Instructions: {pitch_parser.get_format_instructions()}"
144
+ ),
145
+ HumanMessage(
146
+ f"Here is the information about the research paper:\n {article_info}\n\n"
147
+ f"Research Paper content:\n{article_info['article_content']}"
148
+ )
149
+ ])
150
+ pitch_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
151
+ pitcher_chain = pitch_template | pitch_llm | pitch_parser
152
+
153
+ article_pitch = pitcher_chain.invoke({})
154
+ print("\nResearch Paper with the pitch: ", article_pitch)
155
+
156
+ return article_pitch
157
+
158
+ # Deprecated
159
  def _create_pitch_crew(self):
160
  information_gatherer = Agent(
161
  role="Research Paper Information Retriever",
 
175
  date_obj = datetime.strptime(
176
  article_info['published_on'], "%d/%m/%Y")
177
 
 
178
  start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
179
 
180
  # Compare if the input date is older
181
  if date_obj < start_date:
182
+ raise BaseException(
183
+ f"{date_obj} Older than given timeframe {start_date}")
184
 
185
  except ValueError:
186
  print("Invalid date format. Please use dd/mm/yyyy.")
requirements.txt CHANGED
@@ -6,3 +6,5 @@ langchain_google_genai
6
  langchain_openai
7
  streamlit
8
  tavily-python
 
 
 
6
  langchain_openai
7
  streamlit
8
  tavily-python
9
+ arxiv
10
+ semanticscholar
test.py CHANGED
@@ -2,4 +2,4 @@ from crew.research_article_suggester import RecentArticleSuggester
2
 
3
  suggester = RecentArticleSuggester()
4
  results = suggester.kickoff(inputs={"topic": "GenAI"})
5
- print(results)
 
2
 
3
  suggester = RecentArticleSuggester()
4
  results = suggester.kickoff(inputs={"topic": "GenAI"})
5
+ print("\nFinal Results: \n\n", results)
tools/scrape_website.py CHANGED
@@ -1,3 +1,14 @@
1
  from crewai_tools import ScrapeWebsiteTool
 
 
2
 
3
  scrape_tool = ScrapeWebsiteTool()
 
 
 
 
 
 
 
 
 
 
1
  from crewai_tools import ScrapeWebsiteTool
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
 
5
  scrape_tool = ScrapeWebsiteTool()
6
+
7
def CustomScrapeWebsiteTool(url):
    """Fetch *url* and return its visible text, with blank lines dropped and
    runs of spaces collapsed.

    Raises:
        requests.Timeout: if the server does not respond within 30 seconds.
        requests.HTTPError: for 4xx/5xx responses.
    """
    # Timeout so a stalled server cannot hang the whole suggestion run.
    response = requests.get(url, timeout=30)
    # Fail fast on error responses instead of feeding an error page's
    # HTML to the downstream LLM; callers already catch and skip failures.
    response.raise_for_status()
    parsed = BeautifulSoup(response.content, "html.parser")
    text = parsed.get_text()
    # Drop empty lines, then collapse repeated spaces (newlines survive the
    # space-split because tokens containing '\n' are non-empty after strip).
    text = '\n'.join([i for i in text.split('\n') if i.strip() != ''])
    text = ' '.join([i for i in text.split(' ') if i.strip() != ''])

    return text