File size: 3,525 Bytes
cbb84f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
Requests data from the Wayback Machine API.
"""

from typing import Any, Dict, Optional

from rich import print as rprint

from waybacktweets.config.config import config
from waybacktweets.exceptions.exceptions import (
    ConnectionError,
    EmptyResponseError,
    GetResponseError,
    HTTPError,
    ReadTimeoutError,
)
from waybacktweets.utils.utils import get_response


class WaybackTweets:
    """
    Class responsible for requesting data from the Wayback CDX Server API.

    The constructor only stores the query settings; no request is made until
    :meth:`get` is called.

    Args:
        username (str): The username associated with the tweets.
        collapse (str, optional): The field to collapse duplicate lines on.
        timestamp_from (str, optional): The timestamp to start retrieving tweets from.
        timestamp_to (str, optional): The timestamp to stop retrieving tweets at.
        limit (int, optional): The maximum number of results to return.
        offset (int, optional): The number of lines to skip in the results.
        matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains.
    """  # noqa: E501

    def __init__(
        self,
        username: str,
        collapse: Optional[str] = None,
        timestamp_from: Optional[str] = None,
        timestamp_to: Optional[str] = None,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
        matchtype: Optional[str] = None,
    ) -> None:
        self.username = username
        self.collapse = collapse
        self.timestamp_from = timestamp_from
        self.timestamp_to = timestamp_to
        self.limit = limit
        self.offset = offset
        self.matchtype = matchtype

    def _build_params(self) -> Dict[str, Any]:
        """
        Assembles the query parameters sent to the CDX API.

        Returns:
            A dict that always holds ``url`` and ``output``, followed by each
            optional filter whose value is truthy.
        """
        # The trailing wildcard is dropped whenever a match type is given
        # (presumably because matchType already defines the match scope).
        wildcard_pathname = "" if self.matchtype else "/*"

        params: Dict[str, Any] = {
            "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
            "output": "json",
        }

        optional_params = (
            ("collapse", self.collapse),
            ("from", self.timestamp_from),
            ("to", self.timestamp_to),
            ("limit", self.limit),
            ("offset", self.offset),
            ("matchType", self.matchtype),
        )
        # Falsy values (None, "" and 0) are left out of the request entirely.
        params.update({key: value for key, value in optional_params if value})

        return params

    def get(self) -> Optional[Dict[str, Any]]:
        """
        Sends a GET request to the Internet Archive's CDX API to retrieve archived tweets.

        Every handled failure is reported on the console only when
        ``config.verbose`` is set, and results in ``None`` being returned
        instead of an exception.

        Returns:
            The response from the CDX API in JSON format, if successful. Otherwise, None.
        """  # noqa: E501
        # NOTE(review): with output=json the CDX server appears to answer with
        # a list of rows rather than a dict - confirm before tightening the
        # return annotation.
        url = "https://web.archive.org/cdx/search/cdx"
        params = self._build_params()

        try:
            response = get_response(url=url, params=params)
            try:
                return response.json()
            except ValueError:
                # JSON decoding errors are ValueError subclasses; a body that
                # is not JSON is treated like any other failed request.
                # Assumes a requests-style response object - TODO confirm.
                if config.verbose:
                    rprint(
                        "[red]The response from web.archive.org could not be decoded as JSON."  # noqa: E501
                    )
        except ReadTimeoutError:
            if config.verbose:
                rprint("[red]Connection to web.archive.org timed out.")
        except ConnectionError:
            if config.verbose:
                rprint(
                    "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
                )
        except HTTPError as e:
            if config.verbose:
                rprint(f"[red]HTTP error occurred: {str(e)}")
        except EmptyResponseError:
            if config.verbose:
                rprint("[red]No data was saved due to an empty response.")
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {str(e)}")

        return None