# yuyutsu07's picture
# Upload 43 files
# cbb84f2 verified
"""
Requests data from the Wayback Machine API.
"""
from typing import Any, Dict, Optional
from rich import print as rprint
from waybacktweets.config.config import config
from waybacktweets.exceptions.exceptions import (
ConnectionError,
EmptyResponseError,
GetResponseError,
HTTPError,
ReadTimeoutError,
)
from waybacktweets.utils.utils import get_response
class WaybackTweets:
    """
    Class responsible for requesting data from the Wayback CDX Server API.

    Optional filters that are unset (``None``) or otherwise falsy (empty
    string, ``0``) are left out of the request entirely.

    Args:
        username (str): The username associated with the tweets.
        collapse (str, optional): The field to collapse duplicate lines on.
        timestamp_from (str, optional): The timestamp to start retrieving tweets from.
        timestamp_to (str, optional): The timestamp to stop retrieving tweets at.
        limit (int, optional): The maximum number of results to return.
        offset (int, optional): The number of lines to skip in the results.
        matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains.
    """  # noqa: E501

    def __init__(
        self,
        username: str,
        collapse: Optional[str] = None,
        timestamp_from: Optional[str] = None,
        timestamp_to: Optional[str] = None,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
        matchtype: Optional[str] = None,
    ):
        self.username = username
        self.collapse = collapse
        self.timestamp_from = timestamp_from
        self.timestamp_to = timestamp_to
        self.limit = limit
        self.offset = offset
        self.matchtype = matchtype

    def _build_params(self) -> Dict[str, Any]:
        """
        Assembles the query parameters for the CDX request from the instance attributes.

        Returns:
            A dict that always holds ``url`` and ``output``, plus one entry for each optional filter whose value is truthy.
        """  # noqa: E501
        # When a match type is given, the "/*" wildcard suffix is dropped from
        # the URL and the "matchType" parameter is sent instead.
        wildcard_pathname = "" if self.matchtype else "/*"

        params: Dict[str, Any] = {
            "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
            "output": "json",
        }

        # (query key, value) pairs, in the order they are added to the request.
        optional_filters = (
            ("collapse", self.collapse),
            ("from", self.timestamp_from),
            ("to", self.timestamp_to),
            ("limit", self.limit),
            ("offset", self.offset),
            ("matchType", self.matchtype),
        )
        for key, value in optional_filters:
            # Truthiness check on purpose: None, "" and 0 are all omitted.
            if value:
                params[key] = value

        return params

    def get(self) -> Optional[Dict[str, Any]]:
        """
        Sends a GET request to the Internet Archive's CDX API to retrieve archived tweets.

        The request errors handled below are not propagated: each one is
        reported with ``rprint`` only when ``config.verbose`` is set, and the
        method then returns None.

        Returns:
            The response from the CDX API in JSON format, if successful. Otherwise, None.

        NOTE(review): the return annotation says dict, but with ``output=json``
        the payload is presumably a list of rows - confirm against the CDX
        server documentation before relying on the annotated type.
        """  # noqa: E501
        url = "https://web.archive.org/cdx/search/cdx"
        params = self._build_params()

        try:
            response = get_response(url=url, params=params)
            return response.json()
        except ReadTimeoutError:
            if config.verbose:
                rprint("[red]Connection to web.archive.org timed out.")
        except ConnectionError:
            if config.verbose:
                rprint(
                    "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
                )
        except HTTPError as e:
            if config.verbose:
                rprint(f"[red]HTTP error occurred: {str(e)}")
        except EmptyResponseError:
            if config.verbose:
                rprint("[red]No data was saved due to an empty response.")
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {str(e)}")

        # Reached only after one of the handled errors above.
        return None