AnanthulaShravya committed on
Commit
e6603b1
·
verified ·
1 Parent(s): ead92ea

Upload 3 files

Browse files
Files changed (3) hide show
  1. __init__.py +0 -0
  2. research_article_suggester.py +261 -0
  3. til.py +161 -0
__init__.py ADDED
File without changes
research_article_suggester.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from crewai import Agent, Task, Crew
2
+ from langchain_openai import ChatOpenAI
3
+ from tavily import TavilyClient
4
+ from semanticscholar import SemanticScholar
5
+ import arxiv
6
+ import os
7
+ import json
8
+ from pydantic import BaseModel, Field
9
+ from crewai.tasks.task_output import TaskOutput
10
+ from datetime import datetime, timedelta
11
+ from langchain_core.prompts import ChatPromptTemplate
12
+ from langchain_core.messages import SystemMessage, AIMessage, HumanMessage
13
+ from langchain_core.output_parsers import JsonOutputParser
14
+
15
+ from tools.scrape_website import scrape_tool, CustomScrapeWebsiteTool
16
+
17
+ MAX_RESULTS = 2
18
+ AGE_OF_RESEARCH_PAPER = 60
19
+
20
+
21
class RecentArticleSuggester:
    """Suggests recent research papers based on a given topic.

    Candidates are aggregated from three sources (Tavily web search, arXiv,
    Semantic Scholar), then each one is date-filtered and turned into a
    curiosity "pitch" via LLM chains.
    """

    def __init__(self):
        # The Tavily client is the only long-lived dependency; the LLM
        # chains are built fresh on every call.
        self.tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

    def kickoff(self, inputs=None):
        """Entry point.

        Args:
            inputs: dict with a required ``"topic"`` key.

        Returns:
            List of pitched research-paper dicts (see ResearchPaperWithPitch).
        """
        # `inputs=None` instead of a mutable `{}` default (shared across calls).
        inputs = {} if inputs is None else inputs
        self.topic = inputs["topic"]
        suggested_research_papers = self._suggest_research_papers()
        return suggested_research_papers

    def _suggest_research_papers(self):
        """Collect candidate papers from all sources and pitch each one."""
        print("\nSearching for papers on Tavily...")
        query = f"research papers on {self.topic} published in the last week"
        results = self.tavily_client.search(
            query, max_results=MAX_RESULTS)['results']

        print("\nSearching for papers on Arxiv...")
        arxiv_search = arxiv.Search(
            query=self.topic,
            max_results=MAX_RESULTS,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        for result in arxiv_search.results():
            results.append({
                "title": result.title,
                "authors": ", ".join(str(author) for author in result.authors),
                "content": result.summary,
                "url": result.entry_id,
                "pdf_url": result.pdf_url,
            })

        print("\nSearching for papers on Semanticscholar...")
        sch = SemanticScholar()
        semantic_results = sch.search_paper(
            self.topic, sort='publicationDate:desc', bulk=True,
            fields=['title', 'url', 'authors', 'publicationDate', 'abstract'])
        for result in semantic_results[:MAX_RESULTS]:
            results.append({
                "title": result.title,
                "authors": ", ".join(str(author.name) for author in result.authors),
                "content": result.abstract,
                "published_on": result.publicationDate,
                "url": result.url,
            })

        research_paper_suggestions = []
        for result in results:
            try:
                info = self._article_pitch(result)
                if info is not None:
                    research_paper_suggestions.append(info)
            except Exception:
                # One bad article must not abort the whole run.  Previously
                # this caught BaseException (which also swallows
                # KeyboardInterrupt/SystemExit) and printed the raw
                # traceback object instead of the formatted traceback.
                import traceback
                print(
                    f"Error processing article '{result['title']}':\n"
                    f"{traceback.format_exc()}")

        return research_paper_suggestions

    def _gather_information(self, article):
        """Scrape the article URL and extract author/publication date.

        Returns a dict shaped like ``ResearchPaper`` with the scraped page
        added under ``'article_content'``.
        """
        print(f"\nScraping website: {article['url']}")
        # NOTE(review): this *constructs* the tool with the URL rather than
        # invoking a run method -- presumably CustomScrapeWebsiteTool returns
        # the page content on construction; confirm against tools/scrape_website.
        article_content = CustomScrapeWebsiteTool(article["url"])

        print(f"\nGathering information from website: {article['url']}")
        parser = JsonOutputParser(pydantic_object=ResearchPaper)
        prompt_template = ChatPromptTemplate.from_messages([
            SystemMessage(
                "You are Research Paper Information Retriever. You are an expert in gathering required details about the given research paper."
                "Your personal goal is: Retrieve the author information and date the research paper was published in the format of dd/mm/yyyy."
                f"Formatting Instructions: {parser.get_format_instructions()}"
            ),
            HumanMessage(
                f"Here is the information about the research paper:\n {article}\n\n"
                f"Research Paper content:\n{article_content}"
            )
        ])
        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
        information_scrapper_chain = prompt_template | llm | parser

        article_info = information_scrapper_chain.invoke({})
        print("\nGathered Article Info: ", article_info)
        article_info['article_content'] = article_content
        return article_info

    def _article_pitch(self, article):
        """Filter by publication age and create a curiosity pitch.

        Returns the pitched article dict, or None when the paper is older
        than AGE_OF_RESEARCH_PAPER days or its date cannot be parsed.
        """
        article_info = self._gather_information(article)
        try:
            date_obj = datetime.strptime(
                article_info['published_on'], "%d/%m/%Y")

            start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)

            # Reject anything published before the accepted window.
            if date_obj < start_date:
                print(
                    f"\nRejecting research paper {article['title']} because it was published on {date_obj},"
                    f" which is before the expected timeframe {start_date} & {datetime.now()}")
                return None

        except ValueError:
            # The LLM did not return the requested dd/mm/yyyy format.
            print("Invalid date format. Please use dd/mm/yyyy.")
            return None

        print(f"\nCreating pitch for the research paper: {article['title']}")
        pitch_parser = JsonOutputParser(pydantic_object=ResearchPaperWithPitch)
        pitch_template = ChatPromptTemplate.from_messages([
            SystemMessage(
                "You are Curiosity Catalyst. As a Curiosity Catalyst, you know exactly how to pique the user's curiosity to read the research paper."
                "Your personal goal is: To pique the user's curiosity to read the research paper."
                "Read the Research Paper Content to create a pitch."
                f"Formatting Instructions: {pitch_parser.get_format_instructions()}"
            ),
            HumanMessage(
                f"Here is the information about the research paper:\n {article_info}\n\n"
                f"Research Paper content:\n{article_info['article_content']}"
            )
        ])
        pitch_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
        pitcher_chain = pitch_template | pitch_llm | pitch_parser

        article_pitch = pitcher_chain.invoke({})
        print("\nResearch Paper with the pitch: ", article_pitch)

        return article_pitch

    # Deprecated: superseded by the plain-chain _gather_information /
    # _article_pitch path above.
    def _create_pitch_crew(self):
        """Build a two-agent CrewAI crew (information gatherer + pitcher)."""
        information_gatherer = Agent(
            role="Research Paper Information Retriever",
            goal="Gather required information for the given research papers.",
            verbose=True,
            backstory=(
                "You are an expert in gathering required details "
                "about the given research paper."
            ),
            llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2),
            tools=[scrape_tool],
        )

        def evaluator(output: TaskOutput):
            """Reject papers older than AGE_OF_RESEARCH_PAPER days."""
            article_info = json.loads(output.exported_output)
            try:
                date_obj = datetime.strptime(
                    article_info['published_on'], "%d/%m/%Y")

                start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)

                if date_obj < start_date:
                    # Was `raise BaseException`; Exception is the correct
                    # base for an application-level failure.
                    raise Exception(
                        f"{date_obj} Older than given timeframe {start_date}")

            except ValueError:
                print("Invalid date format. Please use dd/mm/yyyy.")
                return False
            # NOTE(review): no explicit True is returned on success (falls
            # through to None) -- confirm what CrewAI expects from callbacks.

        information_gathering_task = Task(
            description=(
                "Here is the information of a research paper: title {title}, "
                "url: {url} and content: {content}.\n"
                "Gather following information about the research paper: "
                "1. When was the research paper published and present it in dd/mm/yyyy format. "
                "2. Who is the author of the research paper. "
            ),
            expected_output=(
                "Following details of the research paper: title, url, "
                "content/summary, date it was published and author."
            ),
            agent=information_gatherer,
            # Fixed keyword typo: was `async_exection`, which CrewAI's Task
            # does not recognize.
            async_execution=False,
            output_json=ResearchPaper,
            callback=evaluator,
        )

        pitcher = Agent(
            role="Curiosity Catalyst",
            goal="To pique the user's curiosity to read the research paper.",
            verbose=True,
            backstory=(
                "As a Curiosity Catalyst, you know exactly how to pique the user's curiosity "
                "to read the research paper."
            ),
            llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2),
            tools=[scrape_tool],
        )

        create_pitch = Task(
            description=(
                "Craft the pitch so to that it teases the research paper's most intriguing aspects, "
                "by posing questions that the research paper might answer or "
                "highlighting surprising facts to pique the user's curiosity "
                " to read the research paper so that he is up-to-date with latest research."
            ),
            expected_output=(
                "All the details of the research paper along with the pitch."
            ),
            tools=[scrape_tool],
            agent=pitcher,
            context=[information_gathering_task],
            output_json=ResearchPaperWithPitch,
        )

        crew = Crew(
            agents=[information_gatherer, pitcher],
            tasks=[information_gathering_task, create_pitch],
            verbose=True,
            max_rpm=4,
        )

        return crew
243
+
244
+
245
class ResearchPaper(BaseModel):
    """Structured details extracted from a research paper page.

    Used as the JSON schema for ``JsonOutputParser``: the Field
    descriptions are injected verbatim into the LLM's format
    instructions, so the typos ("publised", "foramt") were fixed.
    """
    title: str
    url: str
    summary: str
    author: str = Field(description="author of the article")
    published_on: str = Field(
        description="Date the article was published on in format dd/mm/yyyy")
252
+
253
+
254
class ResearchPaperWithPitch(BaseModel):
    """ResearchPaper details extended with a curiosity pitch.

    Used as the JSON schema for the pitching chain's output parser; the
    Field-description typos ("publised", "foramt") were fixed because
    they feed the LLM's format instructions verbatim.
    """
    title: str
    url: str
    summary: str
    author: str = Field(description="author of the article")
    published_on: str = Field(
        description="Date the article was published on in format dd/mm/yyyy")
    # The teaser text meant to make the user want to read the paper.
    pitch: str
til.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain import callbacks
2
+ from langchain import hub
3
+ from langchain.agents import AgentExecutor, create_react_agent
4
+ from langchain_community.tools.tavily_search import TavilyAnswer
5
+ from langchain_core.messages import SystemMessage
6
+ from langchain_core.output_parsers import JsonOutputParser
7
+ from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate
8
+ from langchain_openai import ChatOpenAI
9
+ from pydantic import BaseModel, Field, UUID4
10
+ from typing import List, Optional
11
+ import os
12
+ import pprint
13
+
14
class TilCrew:
    """Reviews a user's TILs (Today-I-Learned notes).

    An LLM chain rates each TIL on four metrics (insightfulness,
    factuality, simplicity, grammar); the ratings are then collapsed
    into a single ok/not_ok verdict per TIL.
    """

    def kickoff(self, inputs=None):
        """Entry point.

        Args:
            inputs: dict with a required ``"content"`` key holding the TILs.

        Returns:
            dict with ``"feedback"`` (list of per-TIL verdicts) and
            ``"run_id"`` (LangSmith trace id).
        """
        # `inputs=None` instead of a mutable `{}` default (shared across calls).
        inputs = {} if inputs is None else inputs
        print("Human Message:")
        pprint.pp(inputs)
        self.content = inputs["content"]
        self._gather_feedback()
        return self._final_call_on_feedback()

    def _final_call_on_feedback(self):
        """Collapse the four metric ratings into one verdict per TIL.

        Priority order: factuality, insightfulness, simplicity, grammar.
        Factuality and insightfulness must be rated 'High'; simplicity and
        grammar only fail when rated 'Low' (and then carry a rewrite
        suggestion).
        """
        final_results = []
        for feedback in self.feedback_results:
            print("Final analysis of:")
            pprint.pp(feedback)
            result = {
                "til": feedback.get('til', ""),
                "feedback": "not_ok",
            }
            if feedback["factuality_categorization"] != 'High':
                result["feedback_criteria"] = "factuality_feedback"
                result["reason"] = feedback["factuality_reason"]
                final_results.append(result)
                continue

            if feedback["insightful_categorization"] != 'High':
                result["feedback_criteria"] = "insightful_feedback"
                result["reason"] = feedback["insightful_reason"]
                final_results.append(result)
                continue

            if feedback["simplicity_categorization"] == 'Low':
                result["feedback_criteria"] = "simplicity_feedback"
                result["reason"] = feedback["simplicity_reason"]
                result["suggestion"] = feedback["final_suggestion"]
                final_results.append(result)
                continue

            if feedback["grammatical_categorization"] == 'Low':
                result["feedback_criteria"] = "grammatical_feedback"
                result["reason"] = feedback["grammatical_reason"]
                result["suggestion"] = feedback["final_suggestion"]
                final_results.append(result)
                continue

            # All metrics passed.
            result["feedback"] = "ok"
            final_results.append(result)

        response = {"feedback": final_results, "run_id": self.run_id}
        print("Final Results:")
        pprint.pp(response)
        return response

    def _gather_feedback(self):
        """Run the feedback chain and capture the LangSmith run id."""
        feedback_chain = self._build_feedback_chain()
        pprint.pp("Analysing the TIL.....")
        with callbacks.collect_runs() as cb:
            self.feedback_results = feedback_chain.invoke(
                {"til_content": self.content})['tils']
            self.run_id = cb.traced_runs[0].id
            print("Run ID: ", self.run_id)

        print("Feedback: ")
        pprint.pp(self.feedback_results)

    # Deprecated: Not using this as we are getting similar results by using
    # or without using this
    def _gather_facts(self):
        """Use a ReAct agent with Tavily to gather facts about the TIL topics."""
        facts_prompt = PromptTemplate.from_template(
            "What are the facts on the topics mentioned the following user's TILs: {content}")
        tools = [TavilyAnswer()]
        llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)
        prompt = hub.pull("hwchase17/react")
        agent = create_react_agent(llm, tools, prompt)
        agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
        self.facts = agent_executor.invoke(
            {"input": facts_prompt.format(content=self.content)})['output']
        print("Gathered Facts: ")
        pprint.pp(self.facts)

    def _build_feedback_chain(self):
        """Build the prompt | llm | parser chain that reviews the TILs."""
        feedback_parser = JsonOutputParser(pydantic_object=TilFeedbackResults)
        feedback_prompt = ChatPromptTemplate.from_messages([
            SystemMessage(
                "You are a 'Personal TIL Reviewer' who works in a Product Engineering Services company. "
                "You are an expert in writing TILs which are Insightful, Factually correct, Easy to read and grammatically correct."
                "Your goal is to review user's TILs and categorize their correctness as High, Medium, or Low based on the following metrics:"
                "1. Is the TIL insightful?"
                "2. Is the TIL factually correct and accurate?"
                "3. Is the TIL written in simple english?"
                "4. Is the TIL grammatically correct?\n\n"

                "The criteria to use for assessing if they are insightful or not are:\n"
                "* They TIL shouldn't just be a outright statement, it should contain even the reason on why the statement is true."
                "* It should showcase the understanding of the user on the subject.\n\n"

                "The criteria to use for assessing if they are factual or not are:\n"
                "* They are related to facts."
                "* You are able to find a source which agrees to the fact from reputable websites.\n\n"

                "Give reason for your assessment in one or two sentences for each metric and And also rewrite the TIL if you were given the option to write it. "
                "Evaluate each TIL in the context of all the user's TILs."
                f"Formatting Instructions: {feedback_parser.get_format_instructions()}"
            ),
            HumanMessagePromptTemplate.from_template("{til_content}")
        ])
        print("Prompt: ")
        pprint.pp(feedback_prompt, width=80)
        llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)
        analysis_chain = (feedback_prompt | llm | feedback_parser).with_config({
            "tags": ["til"], "run_name": "Analysing TIL",
            "metadata": {
                # Fixed tracing-metadata key typo: was "versoin".
                "version": "v1.0.0",
                "growth_activity": "til",
                "env": os.environ["ENV"],
                "model": os.environ["OPENAI_MODEL"],
            }
        })

        return analysis_chain
129
+
130
+
131
class TilFeedbackResult(BaseModel):
    """Per-TIL review schema: a High/Medium/Low rating plus a one-to-two
    sentence reason for each of the four metrics, and a suggested rewrite.

    Used as the pydantic schema for ``JsonOutputParser`` in TilCrew, so
    the Field descriptions become the LLM's format instructions.
    """
    til: str = Field(description="TIL as exactly captured by the user without any modifications.")
    insightful_categorization: str = Field(
        description="TIL categorization as High/Medium/Low based on correctness on the insightful metric.")
    insightful_reason: str = Field(description="Reason for your assessment in one or two sentences on insightful metric for the user.")
    factuality_categorization: str = Field(
        description="TIL categorization as High/Medium/Low based on correctness on the factuality metric.")
    factuality_reason: str = Field(description="Reason for your assessment in one or two sentences on factuality metric for the user.")
    simplicity_categorization: str = Field(
        description="TIL categorization as High/Medium/Low based on correctness on the simplicity metric.")
    simplicity_reason: str = Field(description="Reason for your assessment in one or two sentences on simplicity metric for the user.")
    grammatical_categorization: str = Field(
        description="TIL categorization as High/Medium/Low based on correctness on the grammatical metric.")
    grammatical_reason: str = Field(description="Reason for your assessment in one or two sentences on grammatical metric for the user.")
    final_suggestion: str = Field(
        description="Rewrite the TIL if you were given the option to write it which should score High on all the metrics.")
147
+
148
+
149
class TilFeedbackResults(BaseModel):
    """Top-level LLM output: one TilFeedbackResult per reviewed TIL."""
    tils: List[TilFeedbackResult]
151
+
152
class TilFinalFeedback(BaseModel):
    """Collapsed verdict for a single TIL as produced by
    TilCrew._final_call_on_feedback.

    When feedback is "ok", the optional fields stay None; otherwise
    feedback_criteria names the failing metric, reason explains it, and
    suggestion (simplicity/grammar failures only) carries a rewrite.
    """
    til: str
    feedback: str  # "ok" or "not_ok"
    feedback_criteria: Optional[str] = None
    reason: Optional[str] = None
    suggestion: Optional[str] = None
158
+
159
class TilFeedbackResponse(BaseModel):
    """Overall response: the LangSmith trace id plus all per-TIL verdicts."""
    run_id: UUID4
    feedback: List[TilFinalFeedback]