Spaces:
Runtime error
Runtime error
| import logging | |
| from typing import Any, List | |
| import requests | |
| from langchain.docstore.document import Document | |
| from langchain.document_loaders.base import BaseLoader | |
| logger = logging.getLogger(__name__) | |
class DiffbotLoader(BaseLoader):
    """Load web pages as Documents through the `Diffbot` Article API.

    Each configured URL is sent to the Diffbot REST API, and the ``text``
    field of the first returned object becomes the content of one Document.
    """

    def __init__(
        self, api_token: str, urls: List[str], continue_on_failure: bool = True
    ):
        """Initialize with a Diffbot API token and the URLs to load.

        Args:
            api_token: Diffbot API token.
            urls: List of URLs to load.
            continue_on_failure: Whether to continue loading other URLs if one
                fails. Defaults to True.
        """
        self.api_token = api_token
        self.urls = urls
        self.continue_on_failure = continue_on_failure

    def _diffbot_api_url(self, diffbot_api: str) -> str:
        """Return the v3 endpoint URL for the named Diffbot API."""
        return f"https://api.diffbot.com/v3/{diffbot_api}"

    def _get_diffbot_data(self, url: str) -> Any:
        """Fetch the Diffbot Article API result for ``url``.

        Args:
            url: The page to be processed by Diffbot.

        Returns:
            The decoded JSON body when the response is OK, otherwise an
            empty dict.
        """
        # TODO: Add support for other Diffbot APIs
        diffbot_url = self._diffbot_api_url("article")
        params = {
            "token": self.api_token,
            "url": url,
        }
        response = requests.get(diffbot_url, params=params, timeout=10)
        if not response.ok:
            # Surface the failure instead of silently yielding an empty
            # result. Only the target URL and status are logged: the request
            # URL carries the API token in its query string.
            logger.warning(
                "Diffbot request for %s failed with HTTP status %s",
                url,
                response.status_code,
            )
            return {}
        return response.json()

    def load(self) -> List[Document]:
        """Extract text from Diffbot for every URL and return Documents.

        A result without an ``"objects"`` key (including the empty dict
        produced for a non-OK response) yields a Document with empty text.

        Returns:
            One Document per successfully processed URL, with the URL stored
            under the ``"source"`` metadata key.

        Raises:
            Exception: Whatever fetching or processing raised, but only when
                ``continue_on_failure`` is False; otherwise the error is
                logged and the URL is skipped.
        """
        docs: List[Document] = []
        for url in self.urls:
            try:
                data = self._get_diffbot_data(url)
                text = data["objects"][0]["text"] if "objects" in data else ""
                metadata = {"source": url}
                docs.append(Document(page_content=text, metadata=metadata))
            except Exception as e:
                if not self.continue_on_failure:
                    # Bare raise keeps the original traceback intact.
                    raise
                logger.error(
                    "Error fetching or processing %s, exception: %s", url, e
                )
        return docs