Spaces:
Sleeping
Sleeping
| """ | |
| Requests data from the Wayback Machine API. | |
| """ | |
| from typing import Any, Dict, Optional | |
| from rich import print as rprint | |
| from waybacktweets.config.config import config | |
| from waybacktweets.exceptions.exceptions import ( | |
| ConnectionError, | |
| EmptyResponseError, | |
| GetResponseError, | |
| HTTPError, | |
| ReadTimeoutError, | |
| ) | |
| from waybacktweets.utils.utils import get_response | |
class WaybackTweets:
    """
    Class responsible for requesting data from the Wayback CDX Server API.

    Args:
        username (str): The username associated with the tweets.
        collapse (str, optional): The field to collapse duplicate lines on.
        timestamp_from (str, optional): The timestamp to start retrieving tweets from.
        timestamp_to (str, optional): The timestamp to stop retrieving tweets at.
        limit (int, optional): The maximum number of results to return.
        offset (int, optional): The number of lines to skip in the results.
        matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains.
    """  # noqa: E501

    def __init__(
        self,
        username: str,
        collapse: Optional[str] = None,
        timestamp_from: Optional[str] = None,
        timestamp_to: Optional[str] = None,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
        matchtype: Optional[str] = None,
    ):
        # Plain storage only; no validation or network access happens here.
        self.username = username
        self.collapse = collapse
        self.timestamp_from = timestamp_from
        self.timestamp_to = timestamp_to
        self.limit = limit
        self.offset = offset
        self.matchtype = matchtype

    def _build_params(self) -> Dict[str, Any]:
        """
        Builds the query parameters for the CDX request from the instance attributes.

        Returns:
            A dict that always contains ``url`` and ``output`` and, for every
            optional attribute holding a truthy value, the matching CDX parameter.
        """  # noqa: E501
        # When a match type is given the "/*" wildcard is left out of the URL;
        # presumably matchType then defines the matching scope on its own.
        wildcard_pathname = "" if self.matchtype else "/*"

        params: Dict[str, Any] = {
            "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
            "output": "json",
        }

        # (CDX parameter name, attribute value) pairs, in the order they are
        # added to the request. Falsy values (None, "", 0) are omitted.
        optional_params = (
            ("collapse", self.collapse),
            ("from", self.timestamp_from),
            ("to", self.timestamp_to),
            ("limit", self.limit),
            ("offset", self.offset),
            ("matchType", self.matchtype),
        )
        for key, value in optional_params:
            if value:
                params[key] = value

        return params

    def get(self) -> Optional[Dict[str, Any]]:
        """
        Sends a GET request to the Internet Archive's CDX API to retrieve archived tweets.

        The errors handled below are not re-raised; a message is printed for
        them only when ``config.verbose`` is set.

        Returns:
            The response from the CDX API in JSON format, if successful. Otherwise, None.
        """  # noqa: E501
        # NOTE(review): the return annotation says Dict, but the CDX server's
        # `output=json` payload is presumably a list of rows - confirm and
        # adjust the annotation if so.
        url = "https://web.archive.org/cdx/search/cdx"
        params = self._build_params()

        try:
            response = get_response(url=url, params=params)
            return response.json()
        except ReadTimeoutError:
            if config.verbose:
                rprint("[red]Connection to web.archive.org timed out.")
        except ConnectionError:
            if config.verbose:
                rprint(
                    "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
                )
        except HTTPError as e:
            if config.verbose:
                rprint(f"[red]HTTP error occurred: {e!s}")
        except EmptyResponseError:
            if config.verbose:
                rprint("[red]No data was saved due to an empty response.")
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {e!s}")

        # Reached only after one of the handled errors above.
        return None