Spaces:
Sleeping
Sleeping
File size: 4,609 Bytes
cbb84f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
"""
CLI functions for retrieving archived tweets.
"""
from datetime import datetime
from typing import Any, Optional
import click
from rich import print as rprint
from waybacktweets.api.export import TweetsExporter
from waybacktweets.api.parse import TweetsParser
from waybacktweets.api.request import WaybackTweets
from waybacktweets.config.config import config
def _parse_date(
    ctx: Optional[Any] = None, param: Optional[Any] = None, value: Optional[str] = None
) -> Optional[str]:
    """
    Validates a date option and returns it in the "YYYYMMDD" format.

    The signature follows the click callback convention ``(ctx, param, value)``;
    ``ctx`` and ``param`` are accepted but not used by this function.

    Args:
        ctx: Necessary when used with the click package. Unused. Defaults to None.
        param: Necessary when used with the click package. Unused. Defaults to None.
        value: A date string in the "YYYYMMDD" format. Defaults to None.

    Returns:
        The date formatted as "YYYYMMDD", or None if no date string was provided.

    Raises:
        click.BadParameter: If the value is not exactly eight digits or does not
            name a real calendar date.
    """  # noqa: E501
    if value is None:
        return None

    try:
        # strptime on its own is lenient: "%m" and "%d" also match a single
        # digit, so inputs such as "202411" or "2024115" would be accepted and
        # silently rewritten to a different date. Require exactly eight digits
        # first so only genuine YYYYMMDD values reach the parser. The isinstance
        # check turns a non-string value into the same user-facing error
        # instead of an uncaught TypeError.
        if not (isinstance(value, str) and len(value) == 8 and value.isdecimal()):
            raise ValueError(f"not an 8-digit date: {value!r}")

        # Round-trip through datetime to reject impossible dates (e.g. Feb 30).
        return datetime.strptime(value, "%Y%m%d").strftime("%Y%m%d")
    except ValueError as err:
        # Chain the cause so the underlying parsing problem is not lost.
        raise click.BadParameter("Date must be in format YYYYmmdd") from err
def _parse_collapse(
    ctx: Optional[Any] = None, param: Optional[Any] = None, value: Optional[str] = None
) -> Optional[str]:
    """
    Validates the ``--collapse`` option and returns it in normalized form.

    Accepted values (case-insensitive) are "urlkey", "digest" and
    "timestamp:XX", where XX is an integer from 1 to 14.

    Args:
        ctx: Necessary when used with the click package. Unused. Defaults to None.
        param: Necessary when used with the click package. Unused. Defaults to None.
        value: The raw option value. Defaults to None.

    Returns:
        The lower-cased value ("timestamp:XX" with XX as a plain integer), or
        None if the option was not provided.

    Raises:
        click.BadParameter: If the value is not one of the accepted forms.
    """
    if value is None:
        return None

    normalized = str(value).lower()
    if normalized in ("urlkey", "digest"):
        return normalized

    field, separator, digits = normalized.partition(":")
    if (
        field == "timestamp"
        and separator
        and digits.isascii()
        and digits.isdigit()
        and 1 <= int(digits) <= 14
    ):
        return f"timestamp:{int(digits)}"

    raise click.BadParameter(
        "Collapse must be one of: urlkey, digest, timestamp:XX (XX from 1 to 14)"
    )


@click.command()
@click.argument("username", type=str)
@click.option(
    "-c",
    "--collapse",
    # A click.Choice containing the literal "timestamp:XX" rejects real values
    # such as "timestamp:4", so the value is validated by a callback instead
    # (same pattern as --from / --to). The metavar keeps the help output shape.
    type=click.UNPROCESSED,
    metavar="[urlkey|digest|timestamp:XX]",
    callback=_parse_collapse,
    default=None,
    help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.",  # noqa: E501
)
@click.option(
    "-f",
    "--from",
    "timestamp_from",
    type=click.UNPROCESSED,
    metavar="DATE",
    callback=_parse_date,
    default=None,
    help="Filtering by date range from this date. Format: YYYYmmdd",
)
@click.option(
    "-t",
    "--to",
    "timestamp_to",
    type=click.UNPROCESSED,
    metavar="DATE",
    callback=_parse_date,
    default=None,
    help="Filtering by date range up to this date. Format: YYYYmmdd",
)
@click.option(
    "-l",
    "--limit",
    type=int,
    metavar="INTEGER",
    default=None,
    help="Query result limits.",
)
@click.option(
    "-o",
    "--offset",
    type=int,
    metavar="INTEGER",
    default=None,
    help="Allows for a simple way to scroll through the results.",
)
@click.option(
    "-mt",
    "--matchtype",
    type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False),
    default=None,
    help="Results matching a certain prefix, a certain host or all subdomains.",  # noqa: E501
)
@click.option(
    "-v",
    "--verbose",
    "verbose",
    is_flag=True,
    default=False,
    help="Shows the error log.",
)
def main(
    username: str,
    collapse: Optional[str],
    timestamp_from: Optional[str],
    timestamp_to: Optional[str],
    limit: Optional[int],
    offset: Optional[int],
    matchtype: Optional[str],
    verbose: Optional[bool],
) -> None:
    """
    Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data.

    USERNAME: The Twitter username without @.
    """  # noqa: E501
    # NOTE: click renders the docstring above as the command's --help text, so
    # implementation notes are kept in comments rather than in the docstring.
    #
    # Flow: store the verbose flag on the shared config, request the archived
    # tweets, and, only when the request returned something truthy, parse them
    # and hand the result to the exporter (CSV, JSON and HTML savers).
    # Any exception is reported in red instead of propagating, and the help
    # hint is always printed last.

    # Imported locally so the escaping fix below is self-contained.
    from rich.markup import escape

    try:
        config.verbose = verbose
        api = WaybackTweets(
            username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
        )
        print(f"Waybacking @{username}'s archived tweets...")
        archived_tweets = api.get()
        if archived_tweets:
            # The same field list is given to both the parser and the exporter.
            field_options = [
                "archived_urlkey",
                "archived_timestamp",
                "parsed_archived_timestamp",
                "archived_tweet_url",
                "parsed_archived_tweet_url",
                "original_tweet_url",
                "parsed_tweet_url",
                "available_tweet_text",
                "available_tweet_is_RT",
                "available_tweet_info",
                "archived_mimetype",
                "archived_statuscode",
                "archived_digest",
                "archived_length",
            ]
            parser = TweetsParser(archived_tweets, username, field_options)
            parsed_tweets = parser.parse(print_progress=True)
            exporter = TweetsExporter(parsed_tweets, username, field_options)
            exporter.save_to_csv()
            exporter.save_to_json()
            exporter.save_to_html()
    except Exception as e:
        # Escape the message: Rich would otherwise interpret bracketed text in
        # it as markup, dropping it from the output or raising a MarkupError.
        rprint(f"[red]{escape(str(e))}")
    finally:
        rprint(
            "[yellow]\nNeed help? Read the docs: https://claromes.github.io/waybacktweets"  # noqa: E501
        )
|