AnanthulaShravya committed on
Commit
e6603b1
·
verified ·
1 Parent(s): ead92ea

Upload 3 files

Browse files
Files changed (3) hide show
  1. __init__.py +0 -0
  2. research_article_suggester.py +261 -0
  3. til.py +161 -0
__init__.py ADDED
File without changes
research_article_suggester.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from crewai import Agent, Task, Crew
2
+ from langchain_openai import ChatOpenAI
3
+ from tavily import TavilyClient
4
+ from semanticscholar import SemanticScholar
5
+ import arxiv
6
+ import os
7
+ import json
8
+ from pydantic import BaseModel, Field
9
+ from crewai.tasks.task_output import TaskOutput
10
+ from datetime import datetime, timedelta
11
+ from langchain_core.prompts import ChatPromptTemplate
12
+ from langchain_core.messages import SystemMessage, AIMessage, HumanMessage
13
+ from langchain_core.output_parsers import JsonOutputParser
14
+
15
+ from tools.scrape_website import scrape_tool, CustomScrapeWebsiteTool
16
+
17
+ MAX_RESULTS = 2
18
+ AGE_OF_RESEARCH_PAPER = 60
19
+
20
+
21
class RecentArticleSuggester:
    """Suggests recent research papers based on a given topic.

    Candidates are aggregated from three sources (Tavily web search, arXiv,
    Semantic Scholar), then each one is date-filtered and turned into a
    curiosity "pitch" via LLM chains.
    """

    def __init__(self):
        # The Tavily client is the only long-lived dependency; the LLM
        # chains are built fresh on every call.
        self.tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

    def kickoff(self, inputs=None):
        """Entry point.

        Args:
            inputs: dict with a required ``"topic"`` key.

        Returns:
            List of pitched research-paper dicts (see ResearchPaperWithPitch).
        """
        # `inputs=None` instead of a mutable `{}` default (shared across calls).
        inputs = {} if inputs is None else inputs
        self.topic = inputs["topic"]
        suggested_research_papers = self._suggest_research_papers()
        return suggested_research_papers

    def _suggest_research_papers(self):
        """Collect candidate papers from all sources and pitch each one."""
        print("\nSearching for papers on Tavily...")
        query = f"research papers on {self.topic} published in the last week"
        results = self.tavily_client.search(
            query, max_results=MAX_RESULTS)['results']

        print("\nSearching for papers on Arxiv...")
        arxiv_search = arxiv.Search(
            query=self.topic,
            max_results=MAX_RESULTS,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        for result in arxiv_search.results():
            results.append({
                "title": result.title,
                "authors": ", ".join(str(author) for author in result.authors),
                "content": result.summary,
                "url": result.entry_id,
                "pdf_url": result.pdf_url,
            })

        print("\nSearching for papers on Semanticscholar...")
        sch = SemanticScholar()
        semantic_results = sch.search_paper(
            self.topic, sort='publicationDate:desc', bulk=True,
            fields=['title', 'url', 'authors', 'publicationDate', 'abstract'])
        for result in semantic_results[:MAX_RESULTS]:
            results.append({
                "title": result.title,
                "authors": ", ".join(str(author.name) for author in result.authors),
                "content": result.abstract,
                "published_on": result.publicationDate,
                "url": result.url,
            })

        research_paper_suggestions = []
        for result in results:
            try:
                info = self._article_pitch(result)
                if info is not None:
                    research_paper_suggestions.append(info)
            except Exception:
                # One bad article must not abort the whole run.  Previously
                # this caught BaseException (which also swallows
                # KeyboardInterrupt/SystemExit) and printed the raw
                # traceback object instead of the formatted traceback.
                import traceback
                print(
                    f"Error processing article '{result['title']}':\n"
                    f"{traceback.format_exc()}")

        return research_paper_suggestions

    def _gather_information(self, article):
        """Scrape the article URL and extract author/publication date.

        Returns a dict shaped like ``ResearchPaper`` with the scraped page
        added under ``'article_content'``.
        """
        print(f"\nScraping website: {article['url']}")
        # NOTE(review): this *constructs* the tool with the URL rather than
        # invoking a run method -- presumably CustomScrapeWebsiteTool returns
        # the page content on construction; confirm against tools/scrape_website.
        article_content = CustomScrapeWebsiteTool(article["url"])

        print(f"\nGathering information from website: {article['url']}")
        parser = JsonOutputParser(pydantic_object=ResearchPaper)
        prompt_template = ChatPromptTemplate.from_messages([
            SystemMessage(
                "You are Research Paper Information Retriever. You are an expert in gathering required details about the given research paper."
                "Your personal goal is: Retrieve the author information and date the research paper was published in the format of dd/mm/yyyy."
                f"Formatting Instructions: {parser.get_format_instructions()}"
            ),
            HumanMessage(
                f"Here is the information about the research paper:\n {article}\n\n"
                f"Research Paper content:\n{article_content}"
            )
        ])
        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
        information_scrapper_chain = prompt_template | llm | parser

        article_info = information_scrapper_chain.invoke({})
        print("\nGathered Article Info: ", article_info)
        article_info['article_content'] = article_content
        return article_info

    def _article_pitch(self, article):
        """Filter by publication age and create a curiosity pitch.

        Returns the pitched article dict, or None when the paper is older
        than AGE_OF_RESEARCH_PAPER days or its date cannot be parsed.
        """
        article_info = self._gather_information(article)
        try:
            date_obj = datetime.strptime(
                article_info['published_on'], "%d/%m/%Y")

            start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)

            # Reject anything published before the accepted window.
            if date_obj < start_date:
                print(
                    f"\nRejecting research paper {article['title']} because it was published on {date_obj},"
                    f" which is before the expected timeframe {start_date} & {datetime.now()}")
                return None

        except ValueError:
            # The LLM did not return the requested dd/mm/yyyy format.
            print("Invalid date format. Please use dd/mm/yyyy.")
            return None

        print(f"\nCreating pitch for the research paper: {article['title']}")
        pitch_parser = JsonOutputParser(pydantic_object=ResearchPaperWithPitch)
        pitch_template = ChatPromptTemplate.from_messages([
            SystemMessage(
                "You are Curiosity Catalyst. As a Curiosity Catalyst, you know exactly how to pique the user's curiosity to read the research paper."
                "Your personal goal is: To pique the user's curiosity to read the research paper."
                "Read the Research Paper Content to create a pitch."
                f"Formatting Instructions: {pitch_parser.get_format_instructions()}"
            ),
            HumanMessage(
                f"Here is the information about the research paper:\n {article_info}\n\n"
                f"Research Paper content:\n{article_info['article_content']}"
            )
        ])
        pitch_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
        pitcher_chain = pitch_template | pitch_llm | pitch_parser

        article_pitch = pitcher_chain.invoke({})
        print("\nResearch Paper with the pitch: ", article_pitch)

        return article_pitch

    # Deprecated: superseded by the plain-chain _gather_information /
    # _article_pitch path above.
    def _create_pitch_crew(self):
        """Build a two-agent CrewAI crew (information gatherer + pitcher)."""
        information_gatherer = Agent(
            role="Research Paper Information Retriever",
            goal="Gather required information for the given research papers.",
            verbose=True,
            backstory=(
                "You are an expert in gathering required details "
                "about the given research paper."
            ),
            llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2),
            tools=[scrape_tool],
        )

        def evaluator(output: TaskOutput):
            """Reject papers older than AGE_OF_RESEARCH_PAPER days."""
            article_info = json.loads(output.exported_output)
            try:
                date_obj = datetime.strptime(
                    article_info['published_on'], "%d/%m/%Y")

                start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)

                if date_obj < start_date:
                    # Was `raise BaseException`; Exception is the correct
                    # base for an application-level failure.
                    raise Exception(
                        f"{date_obj} Older than given timeframe {start_date}")

            except ValueError:
                print("Invalid date format. Please use dd/mm/yyyy.")
                return False
            # NOTE(review): no explicit True is returned on success (falls
            # through to None) -- confirm what CrewAI expects from callbacks.

        information_gathering_task = Task(
            description=(
                "Here is the information of a research paper: title {title}, "
                "url: {url} and content: {content}.\n"
                "Gather following information about the research paper: "
                "1. When was the research paper published and present it in dd/mm/yyyy format. "
                "2. Who is the author of the research paper. "
            ),
            expected_output=(
                "Following details of the research paper: title, url, "
                "content/summary, date it was published and author."
            ),
            agent=information_gatherer,
            # Fixed keyword typo: was `async_exection`, which CrewAI's Task
            # does not recognize.
            async_execution=False,
            output_json=ResearchPaper,
            callback=evaluator,
        )

        pitcher = Agent(
            role="Curiosity Catalyst",
            goal="To pique the user's curiosity to read the research paper.",
            verbose=True,
            backstory=(
                "As a Curiosity Catalyst, you know exactly how to pique the user's curiosity "
                "to read the research paper."
            ),
            llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2),
            tools=[scrape_tool],
        )

        create_pitch = Task(
            description=(
                "Craft the pitch so to that it teases the research paper's most intriguing aspects, "
                "by posing questions that the research paper might answer or "
                "highlighting surprising facts to pique the user's curiosity "
                " to read the research paper so that he is up-to-date with latest research."
            ),
            expected_output=(
                "All the details of the research paper along with the pitch."
            ),
            tools=[scrape_tool],
            agent=pitcher,
            context=[information_gathering_task],
            output_json=ResearchPaperWithPitch,
        )

        crew = Crew(
            agents=[information_gatherer, pitcher],
            tasks=[information_gathering_task, create_pitch],
            verbose=True,
            max_rpm=4,
        )

        return crew
243
+
244
+
245
class ResearchPaper(BaseModel):
    """Structured details extracted from a research paper page.

    Used as the JSON schema for ``JsonOutputParser``: the Field
    descriptions are injected verbatim into the LLM's format
    instructions, so the typos ("publised", "foramt") were fixed.
    """
    title: str
    url: str
    summary: str
    author: str = Field(description="author of the article")
    published_on: str = Field(
        description="Date the article was published on in format dd/mm/yyyy")
252
+
253
+
254
class ResearchPaperWithPitch(BaseModel):
    """ResearchPaper details extended with a curiosity pitch.

    Used as the JSON schema for the pitching chain's output parser; the
    Field-description typos ("publised", "foramt") were fixed because
    they feed the LLM's format instructions verbatim.
    """
    title: str
    url: str
    summary: str
    author: str = Field(description="author of the article")
    published_on: str = Field(
        description="Date the article was published on in format dd/mm/yyyy")
    # The teaser text meant to make the user want to read the paper.
    pitch: str
til.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain import callbacks
2
+ from langchain import hub
3
+ from langchain.agents import AgentExecutor, create_react_agent
4
+ from langchain_community.tools.tavily_search import TavilyAnswer
5
+ from langchain_core.messages import SystemMessage
6
+ from langchain_core.output_parsers import JsonOutputParser
7
+ from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate
8
+ from langchain_openai import ChatOpenAI
9
+ from pydantic import BaseModel, Field, UUID4
10
+ from typing import List, Optional
11
+ import os
12
+ import pprint
13
+
14
class TilCrew:
    """Reviews a user's TILs (Today-I-Learned notes).

    An LLM chain rates each TIL on four metrics (insightfulness,
    factuality, simplicity, grammar); the ratings are then collapsed
    into a single ok/not_ok verdict per TIL.
    """

    def kickoff(self, inputs=None):
        """Entry point.

        Args:
            inputs: dict with a required ``"content"`` key holding the TILs.

        Returns:
            dict with ``"feedback"`` (list of per-TIL verdicts) and
            ``"run_id"`` (LangSmith trace id).
        """
        # `inputs=None` instead of a mutable `{}` default (shared across calls).
        inputs = {} if inputs is None else inputs
        print("Human Message:")
        pprint.pp(inputs)
        self.content = inputs["content"]
        self._gather_feedback()
        return self._final_call_on_feedback()

    def _final_call_on_feedback(self):
        """Collapse the four metric ratings into one verdict per TIL.

        Priority order: factuality, insightfulness, simplicity, grammar.
        Factuality and insightfulness must be rated 'High'; simplicity and
        grammar only fail when rated 'Low' (and then carry a rewrite
        suggestion).
        """
        final_results = []
        for feedback in self.feedback_results:
            print("Final analysis of:")
            pprint.pp(feedback)
            result = {
                "til": feedback.get('til', ""),
                "feedback": "not_ok",
            }
            if feedback["factuality_categorization"] != 'High':
                result["feedback_criteria"] = "factuality_feedback"
                result["reason"] = feedback["factuality_reason"]
                final_results.append(result)
                continue

            if feedback["insightful_categorization"] != 'High':
                result["feedback_criteria"] = "insightful_feedback"
                result["reason"] = feedback["insightful_reason"]
                final_results.append(result)
                continue

            if feedback["simplicity_categorization"] == 'Low':
                result["feedback_criteria"] = "simplicity_feedback"
                result["reason"] = feedback["simplicity_reason"]
                result["suggestion"] = feedback["final_suggestion"]
                final_results.append(result)
                continue

            if feedback["grammatical_categorization"] == 'Low':
                result["feedback_criteria"] = "grammatical_feedback"
                result["reason"] = feedback["grammatical_reason"]
                result["suggestion"] = feedback["final_suggestion"]
                final_results.append(result)
                continue

            # All metrics passed.
            result["feedback"] = "ok"
            final_results.append(result)

        response = {"feedback": final_results, "run_id": self.run_id}
        print("Final Results:")
        pprint.pp(response)
        return response

    def _gather_feedback(self):
        """Run the feedback chain and capture the LangSmith run id."""
        feedback_chain = self._build_feedback_chain()
        pprint.pp("Analysing the TIL.....")
        with callbacks.collect_runs() as cb:
            self.feedback_results = feedback_chain.invoke(
                {"til_content": self.content})['tils']
            self.run_id = cb.traced_runs[0].id
            print("Run ID: ", self.run_id)

        print("Feedback: ")
        pprint.pp(self.feedback_results)

    # Deprecated: Not using this as we are getting similar results by using
    # or without using this
    def _gather_facts(self):
        """Use a ReAct agent with Tavily to gather facts about the TIL topics."""
        facts_prompt = PromptTemplate.from_template(
            "What are the facts on the topics mentioned the following user's TILs: {content}")
        tools = [TavilyAnswer()]
        llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)
        prompt = hub.pull("hwchase17/react")
        agent = create_react_agent(llm, tools, prompt)
        agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
        self.facts = agent_executor.invoke(
            {"input": facts_prompt.format(content=self.content)})['output']
        print("Gathered Facts: ")
        pprint.pp(self.facts)

    def _build_feedback_chain(self):
        """Build the prompt | llm | parser chain that reviews the TILs."""
        feedback_parser = JsonOutputParser(pydantic_object=TilFeedbackResults)
        feedback_prompt = ChatPromptTemplate.from_messages([
            SystemMessage(
                "You are a 'Personal TIL Reviewer' who works in a Product Engineering Services company. "
                "You are an expert in writing TILs which are Insightful, Factually correct, Easy to read and grammatically correct."
                "Your goal is to review user's TILs and categorize their correctness as High, Medium, or Low based on the following metrics:"
                "1. Is the TIL insightful?"
                "2. Is the TIL factually correct and accurate?"
                "3. Is the TIL written in simple english?"
                "4. Is the TIL grammatically correct?\n\n"

                "The criteria to use for assessing if they are insightful or not are:\n"
                "* They TIL shouldn't just be a outright statement, it should contain even the reason on why the statement is true."
                "* It should showcase the understanding of the user on the subject.\n\n"

                "The criteria to use for assessing if they are factual or not are:\n"
                "* They are related to facts."
                "* You are able to find a source which agrees to the fact from reputable websites.\n\n"

                "Give reason for your assessment in one or two sentences for each metric and And also rewrite the TIL if you were given the option to write it. "
                "Evaluate each TIL in the context of all the user's TILs."
                f"Formatting Instructions: {feedback_parser.get_format_instructions()}"
            ),
            HumanMessagePromptTemplate.from_template("{til_content}")
        ])
        print("Prompt: ")
        pprint.pp(feedback_prompt, width=80)
        llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], temperature=0.2)
        analysis_chain = (feedback_prompt | llm | feedback_parser).with_config({
            "tags": ["til"], "run_name": "Analysing TIL",
            "metadata": {
                # Fixed tracing-metadata key typo: was "versoin".
                "version": "v1.0.0",
                "growth_activity": "til",
                "env": os.environ["ENV"],
                "model": os.environ["OPENAI_MODEL"],
            }
        })

        return analysis_chain
129
+
130
+
131
class TilFeedbackResult(BaseModel):
    """Per-TIL review schema: a High/Medium/Low rating plus a one-to-two
    sentence reason for each of the four metrics, and a suggested rewrite.

    Used as the pydantic schema for ``JsonOutputParser`` in TilCrew, so
    the Field descriptions become the LLM's format instructions.
    """
    til: str = Field(description="TIL as exactly captured by the user without any modifications.")
    insightful_categorization: str = Field(
        description="TIL categorization as High/Medium/Low based on correctness on the insightful metric.")
    insightful_reason: str = Field(description="Reason for your assessment in one or two sentences on insightful metric for the user.")
    factuality_categorization: str = Field(
        description="TIL categorization as High/Medium/Low based on correctness on the factuality metric.")
    factuality_reason: str = Field(description="Reason for your assessment in one or two sentences on factuality metric for the user.")
    simplicity_categorization: str = Field(
        description="TIL categorization as High/Medium/Low based on correctness on the simplicity metric.")
    simplicity_reason: str = Field(description="Reason for your assessment in one or two sentences on simplicity metric for the user.")
    grammatical_categorization: str = Field(
        description="TIL categorization as High/Medium/Low based on correctness on the grammatical metric.")
    grammatical_reason: str = Field(description="Reason for your assessment in one or two sentences on grammatical metric for the user.")
    final_suggestion: str = Field(
        description="Rewrite the TIL if you were given the option to write it which should score High on all the metrics.")
147
+
148
+
149
class TilFeedbackResults(BaseModel):
    """Top-level LLM output: one TilFeedbackResult per reviewed TIL."""
    tils: List[TilFeedbackResult]
151
+
152
class TilFinalFeedback(BaseModel):
    """Collapsed verdict for a single TIL as produced by
    TilCrew._final_call_on_feedback.

    When feedback is "ok", the optional fields stay None; otherwise
    feedback_criteria names the failing metric, reason explains it, and
    suggestion (simplicity/grammar failures only) carries a rewrite.
    """
    til: str
    feedback: str  # "ok" or "not_ok"
    feedback_criteria: Optional[str] = None
    reason: Optional[str] = None
    suggestion: Optional[str] = None
158
+
159
class TilFeedbackResponse(BaseModel):
    """Overall response: the LangSmith trace id plus all per-TIL verdicts."""
    run_id: UUID4
    feedback: List[TilFinalFeedback]