Spaces:

snrspeaks
/

ResearchAgent

Runtime error

File size: 7,335 Bytes

d7d4404
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75d6091
 
d7d4404
 
 
f04acf1
 
 
 
 
 
 
 
 
 
d7d4404
 
75d6091
 
f04acf1
75d6091
 
 
 
 
d7d4404
 
f04acf1
d7d4404
f04acf1
d7d4404
 
f04acf1
 
75d6091
 
 
 
 
f04acf1
75d6091
 
 
d7d4404
f04acf1
 
d7d4404
f04acf1
 
 
 
 
 
 
d7d4404
 
f04acf1
d7d4404
 
 
f04acf1
d7d4404
 
 
 
 
 
 
 
 
f04acf1
 
 
 
 
 
 
 
 
 
6358572
d7d4404
f04acf1
 
d7d4404
f04acf1
d7d4404
 
 
 
 
f04acf1
d7d4404
f04acf1
 
d7d4404
f04acf1
d7d4404
 
 
f04acf1
 
d7d4404
 
 
f04acf1
 
 
d7d4404
 
 
 
 
f04acf1
d7d4404
 
 
f04acf1
d7d4404
 
 
 
f04acf1
 
 
 
 
 
d7d4404
f04acf1
d7d4404
 
 
 
 
 
 
 
 
369423f
d5daebf
 
 
 
 
d7d4404
 
 
 
 
 
f04acf1
d7d4404
f04acf1

import os
from langchain import PromptTemplate
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.prompts import MessagesPlaceholder
from langchain.memory import ConversationSummaryBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
from typing import Type
import requests
import json
from langchain.schema import SystemMessage
import chainlit as cl
from newsplease import NewsPlease
import time
from duckduckgo_search import DDGS
from itertools import islice


def search(query, max_retries=5):
    """
    Search the given query using DuckDuckGo, retrying when an attempt fails.

    Args:
    - query (str): The search query.
    - max_retries (int): Maximum number of attempts before giving up.

    Returns:
    - list[dict] | None: Up to 20 results, each with 'title' and 'url' keys,
      or None when every attempt failed (or max_retries is not positive).
    """
    for attempt in range(max_retries):
        try:
            # A new DDGS session is opened for every attempt.
            with DDGS() as ddgs:
                response = ddgs.text(query, region='wt-wt', safesearch='Off', timelimit='y')
                # Consume the results while the session is still open and
                # keep only the first 20.
                return [
                    {'title': r['title'], 'url': r['href']}
                    for r in islice(response, 20)
                ]

        except Exception as e:
            # requests.RequestException is an Exception subclass, so a single
            # handler covers both cases; only the wording of the log differs.
            if isinstance(e, requests.RequestException):
                message = f"Attempt {attempt + 1} raised an error: {e}."
            else:
                message = f"An unexpected error occurred on attempt {attempt + 1}: {e}."

            if attempt < max_retries - 1:
                print(f"{message} Retrying...")
                time.sleep(1)
            else:
                # Last attempt: do not announce a retry that will not happen.
                print(message)

    # Reached only when no attempt returned a result.
    print("Max retries reached. Exiting...")
    return None

def scrape_website(objective: str, url: str):
    """
    Scrape and potentially summarize the content of a website based on a given objective.

    Args:
    - objective (str): The objective & task that users give to the agent.
    - url (str): The URL of the website to be scraped.

    Returns:
    - str | None: The article's main text, or a summary of it when the text
      is longer than 10000 characters. None when the page could not be
      scraped or yielded no main text.
    """
    print("Scraping website...")
    try:
        # Use NewsPlease to scrape the website.
        article = NewsPlease.from_url(url)
        if article is None:
            print(f"No article could be extracted from {url}")
            return None

        print(f'{article.title} - {article.url}')
        text = article.maintext
        if text is None:
            print(f"No main text found at {url}")
            return None

        # Summarize if content is too large.
        if len(text) > 10000:
            return summary(objective, text)
        return text
    except Exception as e:
        # Best-effort tool: report the failure and return None instead of
        # raising. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        print(f"Failed to scrape {url}: {e}")
        return None

def summary(objective, content):
    """
    Summarize a text with respect to an objective using a map-reduce chain.

    Args:
    - objective (str): What the summary should focus on.
    - content (str): The text to summarize.

    Returns:
    - str: The output of the summarize chain.
    """
    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613", streaming=True)

    # Cut the text on blank lines / newlines into overlapping chunks.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"],
        chunk_size=10000,
        chunk_overlap=500,
    )
    documents = splitter.create_documents([content])

    prompt_text = """
    Write a summary of the following text for {objective}:
    "{text}"
    SUMMARY:
    """
    prompt = PromptTemplate(template=prompt_text, input_variables=["text", "objective"])

    # The same prompt drives both the per-chunk (map) and the combine step.
    chain = load_summarize_chain(
        llm=chat_model,
        chain_type='map_reduce',
        map_prompt=prompt,
        combine_prompt=prompt,
        verbose=True,
    )
    return chain.run(input_documents=documents, objective=objective)

class ScrapeWebsiteInput(BaseModel):
    """Argument schema for a scrape_website call: an objective and a url."""

    objective: str = Field(
        description="The objective & task that users give to the agent",
    )
    url: str = Field(
        description="The url of the website to be scraped",
    )

class ScrapeWebsiteTool(BaseTool):
    """
    A tool that exposes the scrape_website function to the agent.

    Arguments are described by ScrapeWebsiteInput (objective and url).
    Only synchronous execution is supported.
    """
    # Explicit annotations on the overridden fields: pydantic v2 based
    # versions of BaseTool reject un-annotated field overrides, and the
    # annotations are harmless on pydantic v1.
    name: str = "scrape_website"
    description: str = "useful when you need to get data from a website url, passing both url and objective to the function; DO NOT make up any url, the url should only be from the search results"
    args_schema: Type[BaseModel] = ScrapeWebsiteInput

    def _run(self, objective: str, url: str):
        """Scrape url for the given objective and return the text (or None)."""
        return scrape_website(objective, url)

    def _arun(self, url: str, objective: str = ""):
        """
        Asynchronous counterpart of _run; not implemented.

        Accepts the same arguments as _run (objective is optional here so
        existing url-only calls keep working), so an async invocation fails
        with NotImplementedError rather than a TypeError about an unexpected
        keyword argument.

        Raises:
        - NotImplementedError: always.
        """
        raise NotImplementedError("scrape_website does not support async execution")

@cl.langchain_factory(use_async=False)
def run():
    """
    Initialize and return a langchain agent with search and scraping tools.

    The agent uses OpenAI function calling, a researcher system prompt, and a
    summary-buffer conversation memory exposed to the prompt under the
    "memory" key.

    Returns:
    - Agent: Initialized langchain agent.
    """
    tools = [
        Tool(name="Search", func=search, description="useful for when you need to answer questions about current events, data. You should ask targeted questions"),
        ScrapeWebsiteTool(),
    ]

    # Rule 2 refers to the "None" that the scraping tool yields when a page
    # cannot be scraped.
    system_message = SystemMessage(
        content="""You are a world class researcher, who can do detailed research on any topic and produce facts based results;
                you do not make things up, you will try as hard as possible to gather facts & data to back up the research

                Please make sure you complete the objective above with the following rules:
                1/ You should do enough research to gather as much information as possible about the objective
                2/ If scraping a URL returns "None" go to the next URL. Keep on iterating over URLs until you find substantial information.
                3/ If there are url of relevant links & articles, you will scrape it to gather more information
                4/ After scraping & search, you should think "is there any new things i should search & scraping based on the data I collected to increase research quality?" If answer is yes, continue; But don't do this more than 3 iterations
                5/ You should not make things up, you should only write facts & data that you have gathered
                6/ In the final output, You should include all reference data & links to back up your research"""
    )
    agent_kwargs = {
        # Placeholder through which the conversation memory is injected.
        "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
        "system_message": system_message,
    }

    # Initialize the ChatOpenAI model.
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613", streaming=True)
    # memory_key must match the MessagesPlaceholder variable name above.
    memory = ConversationSummaryBufferMemory(memory_key="memory", return_messages=True, llm=llm)

    # Initialize the agent with tools and other configurations.
    return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True, agent_kwargs=agent_kwargs, memory=memory)