# Extraction residue (not part of the module): Spaces: Sleeping | File size: 3,525 Bytes | cbb84f2
"""
Requests data from the Wayback Machine API.
"""
from typing import Any, Dict, Optional
from rich import print as rprint
from waybacktweets.config.config import config
from waybacktweets.exceptions.exceptions import (
ConnectionError,
EmptyResponseError,
GetResponseError,
HTTPError,
ReadTimeoutError,
)
from waybacktweets.utils.utils import get_response
class WaybackTweets:
    """
    Class responsible for requesting data from the Wayback CDX Server API.

    Args:
        username (str): The username associated with the tweets.
        collapse (str, optional): The field to collapse duplicate lines on.
        timestamp_from (str, optional): The timestamp to start retrieving tweets from.
        timestamp_to (str, optional): The timestamp to stop retrieving tweets at.
        limit (int, optional): The maximum number of results to return.
        offset (int, optional): The number of lines to skip in the results.
        matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains.
    """  # noqa: E501

    def __init__(
        self,
        username: str,
        collapse: Optional[str] = None,
        timestamp_from: Optional[str] = None,
        timestamp_to: Optional[str] = None,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
        matchtype: Optional[str] = None,
    ):
        # The filters are only stored here; they are turned into query
        # parameters when ``get`` is called.
        self.username = username
        self.collapse = collapse
        self.timestamp_from = timestamp_from
        self.timestamp_to = timestamp_to
        self.limit = limit
        self.offset = offset
        self.matchtype = matchtype

    def _build_params(self) -> Dict[str, Any]:
        """Build the query parameters sent to the CDX endpoint."""
        # When an explicit match type is requested, the "/*" wildcard is
        # dropped from the target URL and the match type is sent instead.
        wildcard_pathname = "" if self.matchtype else "/*"

        params: Dict[str, Any] = {
            "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
            "output": "json",
        }

        # Optional filters are forwarded only when truthy, so ``None``, ``0``
        # and ``""`` all mean "do not send this parameter".
        optional_filters = (
            ("collapse", self.collapse),
            ("from", self.timestamp_from),
            ("to", self.timestamp_to),
            ("limit", self.limit),
            ("offset", self.offset),
            ("matchType", self.matchtype),
        )
        for key, value in optional_filters:
            if value:
                params[key] = value

        return params

    def get(self) -> Optional[Dict[str, Any]]:
        """
        Sends a GET request to the Internet Archive's CDX API to retrieve archived tweets.

        Every handled failure is reported through ``rprint`` only when
        ``config.verbose`` is set; in all of those cases ``None`` is returned.

        Returns:
            The response from the CDX API in JSON format, if successful. Otherwise, None.
        """  # noqa: E501
        # NOTE(review): the CDX server presumably answers ``output=json`` with
        # a list of rows rather than a mapping - confirm before relying on the
        # ``Dict`` return annotation.
        url = "https://web.archive.org/cdx/search/cdx"
        params = self._build_params()

        try:
            response = get_response(url=url, params=params)
            try:
                return response.json()
            except ValueError:
                # A body that is not valid JSON previously escaped as an
                # unhandled exception, contradicting the documented
                # "Otherwise, None" contract.
                if config.verbose:
                    rprint(
                        "[red]The response from web.archive.org could not be decoded as JSON."  # noqa: E501
                    )
                return None
        except ReadTimeoutError:
            if config.verbose:
                rprint("[red]Connection to web.archive.org timed out.")
        except ConnectionError:
            if config.verbose:
                rprint(
                    "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
                )
        except HTTPError as e:
            if config.verbose:
                rprint(f"[red]HTTP error occurred: {str(e)}")
        except EmptyResponseError:
            if config.verbose:
                rprint("[red]No data was saved due to an empty response.")
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {str(e)}")

        return None
|