import requests
import html2text
from readability import Document
from urllib.parse import urlparse, parse_qs, urlunparse

from langchain.agents import Tool
from langchain.text_splitter import RecursiveCharacterTextSplitter


def fetch_page(url, model_name='gpt-3.5-turbo', timeout_sec=10):
    """Tool to fetch the content of a web page from a given URL.

    - Returns `status` and `page_content` (`title`, `content`, and a `has_next`
      indicator). `content` is returned in markdown format.
    - By default, only up to 1,000 tokens of content are retrieved per call.
    - If there is more content available on the page, `has_next` will be True.
    - To read the continuation, increment the `page` query parameter on the same
      URL and call the tool again (paging starts at 1, so the next page is 2).

    Returns
    -------
    Dict[str, Any]:
        - status: int
        - page_content: Dict[str, Any]
            - title: str
            - content: str
            - has_next: bool
    """
    # The `page` query parameter is 1-based for callers; convert it to a
    # 0-based chunk index, rejecting values that are not integers.
    parsed_url = urlparse(url)
    parsed_qs = parse_qs(parsed_url.query)
    try:
        page = int(parsed_qs.get("page", ["1"])[0]) - 1
    except ValueError:
        return {
            "status": 500,
            "page_content": {'error_message': 'page parameter looks invalid. Please try to fetch other pages.'}
        }
    # Strip the query string and fragment so the page itself is fetched.
    url = urlunparse(
        (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "")
    )

    try:
        response = requests.get(url, timeout=timeout_sec)
        # Force UTF-8 decoding; mis-declared charsets otherwise garble the markdown.
        response.encoding = 'utf-8'
    except requests.exceptions.Timeout:
        return {
            "status": 500,
            "page_content": {'error_message': 'Could not download page due to Timeout Error. Please try to fetch other pages.'}
        }
    except requests.exceptions.RequestException:
        # Connection errors, invalid URLs, etc. would otherwise crash the tool.
        return {
            "status": 500,
            "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
        }

    if response.status_code != 200:
        return {
            "status": response.status_code,
            "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
        }

    # Extract the main article with readability, then convert it to markdown.
    try:
        doc = Document(response.text)
        title = doc.title()
        html_content = doc.summary()
        content = html2text.html2text(html_content)
    except Exception:
        # A bare `except` here would also swallow KeyboardInterrupt and SystemExit.
        return {
            "status": 500,
            "page_content": {'error_message': 'Could not parse page. Please try to fetch other pages.'}
        }

    # Split the markdown into ~1,000-token chunks; each chunk is one "page".
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=1000,
        chunk_overlap=0,
    )
    chunks = text_splitter.split_text(content)
    if page < 0 or page >= len(chunks):
        return {
            "status": 500,
            "page_content": {'error_message': 'page parameter looks invalid. Please try to fetch other pages.'}
        }
    return {
        "status": 200,
        "page_content": {
            "title": title,
            "content": chunks[page],
            "has_next": page < len(chunks) - 1
        }
    }
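

# A minimal usage sketch of the paging protocol above. The helper name and the
# URL (taken from the tool description below) are illustrative only; this is
# not a tested, guaranteed-live demo.
def _demo_fetch_page_paging():
    url = "https://www.obamalibrary.gov/obamas/president-barack-obama"
    result = fetch_page(url)  # first ~1,000-token page
    if result["status"] == 200 and result["page_content"]["has_next"]:
        # Continuations are requested with the 1-based `page` query parameter.
        result = fetch_page(url + "?page=2")
    return result

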
def get_fetch_page_tool():
    fetch_page_tool_description = """
    Tool to fetch the content of a web page from a given URL.

    This returns `status` and `page_content` (`title`, `content`, and a `has_next` indicator).
    If `status` is not 200, there was an error fetching the page; don't give up, and try fetching other pages instead.

    By default, only up to 1,000 tokens of content are retrieved per call. If there is more content available on the page, the `has_next` value will be True.
    To read the continuation, increment the `page` query parameter on the same URL and call the tool again (paging starts at 1, so the next page is 2).
    e.g. https://www.obamalibrary.gov/obamas/president-barack-obama?page=2
    """
    return Tool(
        name='fetch_page',
        func=fetch_page,
        description=fetch_page_tool_description
    )
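

# A minimal sketch of wiring the tool into an agent, assuming the legacy
# `initialize_agent` API from the same LangChain generation as the imports
# above, plus an OpenAI API key in the environment. The model and agent type
# are illustrative choices, not requirements of this module.
if __name__ == "__main__":
    from langchain.agents import initialize_agent, AgentType
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    agent = initialize_agent(
        tools=[get_fetch_page_tool()],
        llm=llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
    )
    # The agent can now call fetch_page itself and page through long articles.
    print(agent.run("Summarize https://www.obamalibrary.gov/obamas/president-barack-obama"))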