Spaces:
Runtime error
Runtime error
| # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
| import os | |
| from typing import Any, Optional | |
| from warnings import warn | |
| from camel.types.enums import JinaReturnFormat | |
| JINA_ENDPOINT = "https://r.jina.ai/" | |
| class JinaURLReader: | |
| r"""URL Reader provided by Jina AI. The output is cleaner and more | |
| LLM-friendly than the URL Reader of UnstructuredIO. Can be configured to | |
| replace the UnstructuredIO URL Reader in the pipeline. | |
| Args: | |
| api_key (Optional[str], optional): The API key for Jina AI. If not | |
| provided, the reader will have a lower rate limit. Defaults to | |
| None. | |
| return_format (ReturnFormat, optional): The level of detail | |
| of the returned content, which is optimized for LLMs. For | |
| now screenshots are not supported. Defaults to | |
| ReturnFormat.DEFAULT. | |
| json_response (bool, optional): Whether to return the response | |
| in JSON format. Defaults to False. | |
| timeout (int, optional): The maximum time in seconds to wait for | |
| the page to be rendered. Defaults to 30. | |
| **kwargs (Any): Additional keyword arguments, including proxies, | |
| cookies, etc. It should align with the HTTP Header field and | |
| value pairs listed in the reference. | |
| References: | |
| https://jina.ai/reader | |
| """ | |
| def __init__( | |
| self, | |
| api_key: Optional[str] = None, | |
| return_format: JinaReturnFormat = JinaReturnFormat.DEFAULT, | |
| json_response: bool = False, | |
| timeout: int = 30, | |
| **kwargs: Any, | |
| ) -> None: | |
| api_key = api_key or os.getenv('JINA_API_KEY') | |
| if not api_key: | |
| warn( | |
| "JINA_API_KEY not set. This will result in a low rate limit " | |
| "of Jina URL Reader. Get API key here: https://jina.ai/reader." | |
| ) | |
| # if the following field not provided, it will be None | |
| api_field = f"Bearer {api_key}" if api_key else None | |
| json_field = "application/json" if json_response else None | |
| raw_headers = { | |
| "Authorization": api_field, | |
| "X-Return-Format": return_format.value, | |
| "Accept": json_field, | |
| "X-Timeout": str(timeout), | |
| **kwargs, | |
| } | |
| # eliminate None values | |
| self._headers = {k: v for k, v in raw_headers.items() if v} | |
| def read_content(self, url: str) -> str: | |
| r"""Reads the content of a URL and returns it as a string with | |
| given form. | |
| Args: | |
| url (str): The URL to read. | |
| Returns: | |
| str: The content of the URL. | |
| """ | |
| import requests | |
| full_url = f"{JINA_ENDPOINT}{url}" | |
| try: | |
| resp = requests.get(full_url, headers=self._headers) | |
| resp.raise_for_status() | |
| except Exception as e: | |
| raise ValueError(f"Failed to read content from {url}: {e}") from e | |
| return resp.text | |