Spaces:
Running
Running
| import httpx | |
| from langchain.document_loaders import AsyncHtmlLoader | |
| from langchain.document_transformers import BeautifulSoupTransformer | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from ..tools.tool import Tool | |
class WebBrowser(Tool):
    """Tool that fetches web pages and returns their extracted text.

    The pages are downloaded with LangChain's ``AsyncHtmlLoader``, reduced to
    text with ``BeautifulSoupTransformer`` and, optionally, split into
    token-sized chunks.

    Config keys read from ``self.cfg``:
        split_url_into_chunk (bool): when true, each page is split into
            chunks of at most 1000 tokens; defaults to ``False``.
    """

    # Fixed: the previous value ('生成艺术字纹理图片', i.e. "generate art-text
    # texture images") was copied from an unrelated tool and misdescribed
    # this one to whatever consumes the tool metadata.
    description = 'browse the given urls and return the text content of the web pages'
    name = 'web_browser'
    parameters: list = [{
        'name': 'urls',
        'description': 'the urls that the user wants to browse',
        'required': True
    }]

    def __init__(self, cfg=None):
        """Initialise the tool.

        Args:
            cfg: optional configuration mapping forwarded to ``Tool``. ``None``
                is treated as an empty dict (avoids a shared mutable default).
        """
        super().__init__({} if cfg is None else cfg)
        # presumably ``self.cfg`` is populated by ``Tool.__init__`` -- verify
        # against the base class.
        self.split_url_into_chunk = self.cfg.get('split_url_into_chunk', False)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)'
        }
        # NOTE(review): ``verify=False`` disables TLS certificate verification,
        # and this client is neither used by the methods visible in this class
        # nor ever closed. Kept only so the public attribute stays available;
        # confirm whether it can be removed or closed explicitly.
        self.client = httpx.Client(
            headers=self.headers, verify=False, timeout=30.0)

    def _local_call(self, *args, **kwargs):
        """Download the requested urls and return their text content.

        Keyword Args:
            urls (str | list[str]): one url or a list of urls to browse.

        Returns:
            dict: ``{'result': [{'url': ..., 'content': ...}, ...]}`` with one
            entry per page (or per chunk when ``split_url_into_chunk`` is
            set), or ``{'result': ''}`` when no urls were supplied.
        """
        _, parsed_kwargs = self._local_parse_input(*args, **kwargs)
        urls = parsed_kwargs['urls']
        # Covers both ``None`` and the empty list that the input parser
        # produces when no urls are given, so the loader is never started
        # with nothing to fetch.
        if not urls:
            return {'result': ''}

        # Load the raw html of every url.
        docs = AsyncHtmlLoader(urls).load()

        # Reduce the html to text; extraction is restricted to <span> tags.
        bs_transformer = BeautifulSoupTransformer()
        docs_transformed = bs_transformer.transform_documents(
            docs, tags_to_extract=['span'])

        # Optionally split each page into chunks to get fine-grained results.
        if self.split_url_into_chunk:
            # Chunks of at most 1000 tokens (tiktoken encoder), no overlap.
            splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=1000, chunk_overlap=0)
            splits = splitter.split_documents(docs_transformed)
        else:
            splits = docs_transformed

        search_results = [{
            'url': item.metadata['source'],
            'content': item.page_content
        } for item in splits]
        return {'result': search_results}

    def _local_parse_input(self, *args, **kwargs):
        """Normalise the ``urls`` keyword argument to a list.

        A missing or ``None`` value becomes an empty list and a single string
        is wrapped in a one-element list; any other value is passed through
        unchanged.

        Returns:
            tuple: ``(args, kwargs)`` with ``kwargs['urls']`` normalised.
        """
        urls = kwargs.get('urls')
        if urls is None:
            urls = []
        elif isinstance(urls, str):
            urls = [urls]
        kwargs['urls'] = urls
        return args, kwargs
if __name__ == '__main__':
    # Manual smoke test: browse a single page and print the tool's output.
    browser = WebBrowser()
    target_urls = ['https://blog.sina.com.cn/zhangwuchang']
    print(browser._local_call(urls=target_urls))