Spaces:

snrspeaks
/

ResearchAgent

Runtime error

App Files Files Community

ResearchAgent / app.py

snrspeaks

Update app.py

369423f over 2 years ago

raw

history blame contribute delete

7.34 kB

	import os
	from langchain import PromptTemplate
	from langchain.agents import initialize_agent, Tool
	from langchain.agents import AgentType
	from langchain.chat_models import ChatOpenAI
	from langchain.prompts import MessagesPlaceholder
	from langchain.memory import ConversationSummaryBufferMemory
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.chains.summarize import load_summarize_chain
	from langchain.tools import BaseTool
	from pydantic import BaseModel, Field
	from typing import Type
	import requests
	import json
	from langchain.schema import SystemMessage
	import chainlit as cl
	from newsplease import NewsPlease
	import time
	from duckduckgo_search import DDGS
	from itertools import islice


	def search(query, max_retries=5, max_results=20):
	    """
	    Search the given query using DuckDuckGo, retrying when a search fails.

	    Args:
	    - query (str): The search query.
	    - max_retries (int): Maximum number of attempts before giving up.
	    - max_results (int): Maximum number of results kept from one search.

	    Returns:
	    - list[dict] | None: Search results, each with 'title' and 'url' keys,
	      or None when every attempt failed (or max_retries is not positive).
	    """
	    for attempt in range(max_retries):
	        try:
	            # The DDGS session is closed by the context manager even when we
	            # return from inside it.
	            with DDGS() as ddgs:
	                response = ddgs.text(query, region='wt-wt', safesearch='Off', timelimit='y')
	                return [
	                    {'title': r['title'], 'url': r['href']}
	                    for r in islice(response, max_results)
	                ]
	        except requests.RequestException as e:
	            failure = f"Attempt {attempt + 1} raised an error: {e}."
	        except Exception as e:
	            # Deliberately broad: this is a best-effort agent tool, so any
	            # other failure is reported and retried rather than propagated.
	            failure = f"An unexpected error occurred on attempt {attempt + 1}: {e}."

	        # Only announce (and wait for) a retry when one will actually happen.
	        if attempt < max_retries - 1:
	            print(f"{failure} Retrying...")
	            time.sleep(1)
	        else:
	            print(failure)

	    print("Max retries reached. Exiting...")
	    return None

	def scrape_website(objective: str, url: str):
	    """
	    Scrape a website and, when its text is long, summarize it for the objective.

	    Args:
	    - objective (str): The objective & task that users give to the agent.
	    - url (str): The URL of the website to be scraped.

	    Returns:
	    - str | None: The article's main text, a summary of it when the text is
	      longer than 10000 characters, or None when scraping or summarizing
	      failed or no main text was extracted.
	    """
	    print("Scraping website...")
	    try:
	        # Use NewsPlease to scrape the website.
	        article = NewsPlease.from_url(url)
	        print(f'{article.title} - {article.url}')
	        text = article.maintext
	        if text is None:
	            # Nothing was extracted; say so explicitly instead of relying on
	            # len(None) raising and being swallowed.
	            print(f"No main text extracted from {url}")
	            return None
	        # Summarize if content is too large.
	        if len(text) > 10000:
	            return summary(objective, text)
	        return text
	    except Exception as e:
	        # Scraping stays best-effort (the agent's system prompt tells it to
	        # move on when a scrape returns None), but the failure is reported
	        # rather than silently discarded, and KeyboardInterrupt/SystemExit
	        # are no longer caught.
	        print(f"Failed to scrape {url}: {e}")
	        return None

	def summary(objective, content):
	    """
	    Summarize content with a map-reduce summarize chain, guided by objective.

	    Args:
	    - objective (str): What the summary should focus on.
	    - content (str): The text to summarize.

	    Returns:
	    - str: The output of the summarize chain.
	    """
	    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613", streaming=True)

	    # Cut the text into overlapping chunks, splitting on paragraph breaks
	    # first and single newlines second.
	    splitter = RecursiveCharacterTextSplitter(
	        separators=["\n\n", "\n"],
	        chunk_size=10000,
	        chunk_overlap=500,
	    )
	    chunks = splitter.create_documents([content])

	    prompt_text = """
	Write a summary of the following text for {objective}:
	"{text}"
	SUMMARY:
	"""
	    prompt = PromptTemplate(template=prompt_text, input_variables=["text", "objective"])

	    # The same prompt is used for both the per-chunk (map) step and the
	    # final combine step.
	    chain = load_summarize_chain(
	        llm=chat_model,
	        chain_type='map_reduce',
	        map_prompt=prompt,
	        combine_prompt=prompt,
	        verbose=True,
	    )
	    return chain.run(input_documents=chunks, objective=objective)

	class ScrapeWebsiteInput(BaseModel):
	    """Inputs for scrape_website function."""

	    # What the scrape is for; forwarded to scrape_website as `objective`.
	    objective: str = Field(
	        description="The objective & task that users give to the agent",
	    )
	    # Page to fetch; forwarded to scrape_website as `url`.
	    url: str = Field(
	        description="The url of the website to be scraped",
	    )

	class ScrapeWebsiteTool(BaseTool):
	    """
	    A tool that provides functionality to scrape a website.

	    Arguments are described by ScrapeWebsiteInput (objective and url) and the
	    work is delegated to scrape_website. Only synchronous use is supported.
	    """
	    name: str = "scrape_website"
	    description: str = "useful when you need to get data from a website url, passing both url and objective to the function; DO NOT make up any url, the url should only be from the search results"
	    args_schema: Type[BaseModel] = ScrapeWebsiteInput

	    def _run(self, objective: str, url: str):
	        """Run scrape_website and return whatever it returns."""
	        return scrape_website(objective, url)

	    def _arun(self, objective: str, url: str):
	        """
	        Asynchronous counterpart of _run (not implemented).

	        Takes the same arguments as _run so that a call made with the
	        args_schema fields reaches the NotImplementedError below instead of
	        failing on an unexpected 'objective' argument.

	        Raises:
	        - NotImplementedError: Always.
	        """
	        raise NotImplementedError("scrape_website does not support async execution")

	@cl.langchain_factory(use_async=False)
	def run():
	    """
	    Build the research agent that Chainlit will drive.

	    The agent gets two tools (DuckDuckGo search and website scraping), a
	    researcher system prompt, and a summary-buffer conversation memory.

	    Returns:
	    - Agent: The object returned by initialize_agent.
	    """
	    search_tool = Tool(
	        name="Search",
	        func=search,
	        description="useful for when you need to answer questions about current events, data. You should ask targeted questions",
	    )
	    toolbox = [search_tool, ScrapeWebsiteTool()]

	    # NOTE(review): this prompt is sent to the model verbatim; it contains
	    # typos and rule 7 repeats rule 6. Left untouched here because editing it
	    # would change runtime behavior.
	    researcher_rules = """You are a world class researcher, who can do detailed research on any topic and produce facts based results;
	you do not make things up, you will try as hard as possible to gather facts & data to back up the research

	Please make sure you complete the objective above with the following rules:
	1/ You should do enough research to gather as much information as possible about the objective
	2/ If scraping a URL returns "None" go to the next URL. Keep on iterating over URLs untill you find sunstantial information.
	3/ If there are url of relevant links & articles, you will scrape it to gather more information
	4/ After scraping & search, you should think "is there any new things i should search & scraping based on the data I collected to increase research quality?" If answer is yes, continue; But don't do this more than 3 iteratins
	5/ You should not make things up, you should only write facts & data that you have gathered
	6/ In the final output, You should include all reference data & links to back up your research; You should include all reference data & links to back up your research
	7/ In the final output, You should include all reference data & links to back up your research; You should include all reference data & links to back up your research"""

	    # The "memory" placeholder name must match the memory_key used below.
	    prompt_extras = {
	        "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
	        "system_message": SystemMessage(content=researcher_rules),
	    }

	    # One chat model serves both the agent and the memory summarizer.
	    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613", streaming=True)
	    conversation_memory = ConversationSummaryBufferMemory(
	        memory_key="memory",
	        return_messages=True,
	        llm=chat_model,
	    )

	    return initialize_agent(
	        toolbox,
	        chat_model,
	        agent=AgentType.OPENAI_FUNCTIONS,
	        verbose=True,
	        agent_kwargs=prompt_extras,
	        memory=conversation_memory,
	    )