import requests
import html2text
from readability import Document
from urllib.parse import urlparse, parse_qs, urlunparse

from langchain.agents import Tool
from langchain.text_splitter import RecursiveCharacterTextSplitter


def fetch_page(url, model_name='gpt-3.5-turbo', timeout_sec=10):
    """Tool to fetch the content of a web page from a given URL.

    - Returns `status` and `page_content` (`title`, `content`, and a `has_next`
      indicator). `content` is returned in markdown format.
    - By default, only up to 1,000 tokens of content are retrieved per call.
    - If there is more content available on the page, `has_next` will be True.
    - To read the continuation, increment the `page` query parameter on the same
      URL and call the tool again (paging starts at 1, so the next page is 2).

    Returns
    -------
    Dict[str, Any]:
        - status: int
        - page_content: Dict[str, Any]
            - title: str
            - content: str
            - has_next: bool
    """
    # The `page` query parameter is 1-based for callers; convert it to a
    # 0-based chunk index, rejecting values that are not integers.
    parsed_url = urlparse(url)
    parsed_qs = parse_qs(parsed_url.query)
    try:
        page = int(parsed_qs.get("page", ["1"])[0]) - 1
    except ValueError:
        return {
            "status": 500,
            "page_content": {'error_message': 'page parameter looks invalid. Please try to fetch other pages.'}
        }
    # Strip the query string and fragment so the page itself is fetched.
    url = urlunparse(
        (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "")
    )

    try:
        response = requests.get(url, timeout=timeout_sec)
        # Force UTF-8 decoding; mis-declared charsets otherwise garble the markdown.
        response.encoding = 'utf-8'
    except requests.exceptions.Timeout:
        return {
            "status": 500,
            "page_content": {'error_message': 'Could not download page due to Timeout Error. Please try to fetch other pages.'}
        }
    except requests.exceptions.RequestException:
        # Connection errors, invalid URLs, etc. would otherwise crash the tool.
        return {
            "status": 500,
            "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
        }

    if response.status_code != 200:
        return {
            "status": response.status_code,
            "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
        }

    # Extract the main article with readability, then convert it to markdown.
    try:
        doc = Document(response.text)
        title = doc.title()
        html_content = doc.summary()
        content = html2text.html2text(html_content)
    except Exception:
        # A bare `except` here would also swallow KeyboardInterrupt and SystemExit.
        return {
            "status": 500,
            "page_content": {'error_message': 'Could not parse page. Please try to fetch other pages.'}
        }

    # Split the markdown into ~1,000-token chunks; each chunk is one "page".
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name,
        chunk_size=1000,
        chunk_overlap=0,
    )
    chunks = text_splitter.split_text(content)
    if page < 0 or page >= len(chunks):
        return {
            "status": 500,
            "page_content": {'error_message': 'page parameter looks invalid. Please try to fetch other pages.'}
        }
    return {
        "status": 200,
        "page_content": {
            "title": title,
            "content": chunks[page],
            "has_next": page < len(chunks) - 1
        }
    }
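

# A minimal usage sketch of the paging protocol above. The helper name and the
# URL (taken from the tool description below) are illustrative only; this is
# not a tested, guaranteed-live demo.
def _demo_fetch_page_paging():
    url = "https://www.obamalibrary.gov/obamas/president-barack-obama"
    result = fetch_page(url)  # first ~1,000-token page
    if result["status"] == 200 and result["page_content"]["has_next"]:
        # Continuations are requested with the 1-based `page` query parameter.
        result = fetch_page(url + "?page=2")
    return result

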
def get_fetch_page_tool():
    fetch_page_tool_description = """
    Tool to fetch the content of a web page from a given URL.

    This returns `status` and `page_content` (`title`, `content`, and a `has_next` indicator).
    If `status` is not 200, there was an error fetching the page; don't give up, and try fetching other pages instead.

    By default, only up to 1,000 tokens of content are retrieved per call. If there is more content available on the page, the `has_next` value will be True.
    To read the continuation, increment the `page` query parameter on the same URL and call the tool again (paging starts at 1, so the next page is 2).
    e.g. https://www.obamalibrary.gov/obamas/president-barack-obama?page=2
    """
    return Tool(
        name='fetch_page',
        func=fetch_page,
        description=fetch_page_tool_description
    )
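

# A minimal sketch of wiring the tool into an agent, assuming the legacy
# `initialize_agent` API from the same LangChain generation as the imports
# above, plus an OpenAI API key in the environment. The model and agent type
# are illustrative choices, not requirements of this module.
if __name__ == "__main__":
    from langchain.agents import initialize_agent, AgentType
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    agent = initialize_agent(
        tools=[get_fetch_page_tool()],
        llm=llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
    )
    # The agent can now call fetch_page itself and page through long articles.
    print(agent.run("Summarize https://www.obamalibrary.gov/obamas/president-barack-obama"))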