yuyutsu07 commited on
Commit
cbb84f2
·
verified ·
1 Parent(s): 001810e

Upload 43 files

Browse files
.github/FUNDING.yml ADDED
@@ -0,0 +1 @@
 
 
1
+ github: [claromes]
.github/workflows/docs.yml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: docs
2
+
3
+ on: [push, pull_request, workflow_dispatch]
4
+
5
+ permissions:
6
+ contents: write
7
+
8
+ jobs:
9
+ docs:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: actions/setup-python@v5
14
+ with:
15
+ python-version: 3.11
16
+ - name: Install Poetry
17
+ run: |
18
+ curl -sSL https://install.python-poetry.org | python3 -
19
+ - name: Install dependencies
20
+ run: |
21
+ poetry install
22
+ - name: Sphinx build
23
+ run: |
24
+ mkdir gh-pages
25
+ touch gh-pages/.nojekyll
26
+ cd docs/
27
+ poetry run sphinx-build -b html . _build
28
+ cp -r _build/* ../gh-pages/
29
+ - name: Deploy documentation
30
+ if: ${{ github.event_name == 'push' }}
31
+ uses: JamesIves/github-pages-deploy-action@4.1.4
32
+ with:
33
+ branch: gh-pages
34
+ folder: gh-pages
.streamlit/config.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [theme]
2
+ base = "light"
3
+ primaryColor = "black"
4
+ secondaryBackgroundColor = "gainsboro"
5
+ textColor = "black"
6
+ backgroundColor = "whitesmoke"
7
+ font = "serif"
8
+
9
+ [client]
10
+ toolbarMode = "minimal"
11
+
12
+ [server]
13
+ port = 8501
app/app.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from datetime import datetime, timedelta
3
+
4
+ import streamlit as st
5
+ import streamlit.components.v1 as components
6
+
7
+ from waybacktweets.api.export import TweetsExporter
8
+ from waybacktweets.api.parse import TweetsParser
9
+ from waybacktweets.api.request import WaybackTweets
10
+ from waybacktweets.api.visualize import HTMLTweetsVisualizer
11
+ from waybacktweets.config import FIELD_OPTIONS, config
12
+
13
+ # ------ Initial Settings ------ #
14
+
15
+ PAGE_ICON = "assets/parthenon.png"
16
+ TITLE = "assets/waybacktweets.png"
17
+ DOWNLOAD = "assets/download.svg"
18
+
19
+ collapse = None
20
+ matchtype = None
21
+ start_date = datetime.now() - timedelta(days=30 * 6)
22
+ end_date = datetime.now()
23
+ min_date = datetime(2006, 1, 1)
24
+
25
+ # ------ Verbose Mode Configuration ------ #
26
+
27
+ config.verbose = False
28
+
29
+ # ------ Page Configuration ------ #
30
+
31
+ st.set_page_config(
32
+ page_title="Wayback Tweets",
33
+ page_icon=PAGE_ICON,
34
+ layout="centered",
35
+ menu_items={
36
+ "About": f"""
37
+ [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![License](https://img.shields.io/github/license/claromes/waybacktweets)](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)
38
+
39
+ The application is a prototype hosted on Streamlit Cloud, serving as an alternative to the command line tool.
40
+
41
+ © 2023 - {end_date.year}, [Claromes](https://claromes.com)
42
+
43
+ ---
44
+ """, # noqa: E501
45
+ "Report a bug": "https://github.com/claromes/waybacktweets/issues",
46
+ },
47
+ )
48
+
49
+ # ------ Set States and Params ------ #
50
+
51
+ if "current_username" not in st.session_state:
52
+ st.session_state.current_username = ""
53
+
54
+ if "count" not in st.session_state:
55
+ st.session_state.count = False
56
+
57
+ if "archived_timestamp_filter" not in st.session_state:
58
+ st.session_state.archived_timestamp_filter = (start_date, end_date)
59
+
60
+ if "username_value" not in st.session_state:
61
+ st.session_state.username_value = ""
62
+
63
+ if "expanded_value" not in st.session_state:
64
+ st.session_state.expanded_value = False
65
+
66
+ if "query" not in st.session_state:
67
+ st.session_state.query = False
68
+
69
+ if "update_component" not in st.session_state:
70
+ st.session_state.update_component = 0
71
+
72
+ if "username" not in st.query_params:
73
+ st.query_params["username"] = ""
74
+
75
+ # ------ Add Custom CSS Style ------ #
76
+
77
+ st.html(
78
+ """
79
+ <style>
80
+ header[data-testid="stHeader"] {
81
+ opacity: 0.5;
82
+ }
83
+ iframe {
84
+ border: 1px solid #dddddd;
85
+ border-radius: 0.5rem;
86
+ }
87
+ div[data-testid="InputInstructions"] {
88
+ visibility: hidden;
89
+ }
90
+ button[data-testid="StyledFullScreenButton"] {
91
+ display: none;
92
+ }
93
+ div[class="st-emotion-cache-1v0mbdj e115fcil1"] {
94
+ max-width: 100%;
95
+ }
96
+ </style>
97
+ """
98
+ )
99
+
100
+ # ------ Requestings ------ #
101
+
102
+
103
@st.cache_data(ttl=600, show_spinner=False)
def wayback_tweets(
    username,
    collapse,
    timestamp_from,
    timestamp_to,
    limit,
    offset,
    matchtype,
):
    """Fetch archived tweet CDX records for *username* from the Wayback Machine.

    Thin wrapper around :class:`WaybackTweets` so that Streamlit can cache
    the response (10-minute TTL) for identical query parameters.

    Returns whatever ``WaybackTweets.get()`` yields for the given filters.
    """
    client = WaybackTweets(
        username,
        collapse,
        timestamp_from,
        timestamp_to,
        limit,
        offset,
        matchtype,
    )
    return client.get()
125
+
126
+
127
@st.cache_data(ttl=600, show_spinner=False)
def tweets_parser(archived_tweets, username, field_options):
    """Parse raw archived tweet records into the requested fields.

    Cached by Streamlit (10-minute TTL); delegates entirely to
    :class:`TweetsParser`.
    """
    return TweetsParser(archived_tweets, username, field_options).parse()
133
+
134
+
135
@st.cache_data(ttl=600, show_spinner=False)
def tweets_exporter(parsed_tweets, username, field_options):
    """Build the export artifacts for the parsed tweets (cached, 10-minute TTL).

    Returns:
        A ``(dataframe, filename)`` tuple produced by :class:`TweetsExporter`.
    """
    exporter = TweetsExporter(parsed_tweets, username, field_options)
    return exporter.dataframe, exporter.filename
143
+
144
+
145
+ # ------ Custom JavaScript ------ #
146
+
147
+
148
def scroll_page():
    """Scroll Streamlit's main section into view via an injected script.

    The ``st.session_state.update_component`` counter is interpolated into
    the snippet so that bumping the counter changes the component's content
    and forces Streamlit to re-render it, re-running the scroll.
    """
    script = f"""
    <script>
        window.parent.document.querySelector('section.main').scrollTo(700, 700);
        let update_component = {st.session_state.update_component} // Force component update to generate scroll
    </script>
    """  # noqa: E501

    components.html(script, width=0, height=0)
157
+
158
+
159
+ # ------ Query Param ------ #
160
+
161
+ if st.query_params.username != "":
162
+ st.session_state.username_value = st.query_params.username
163
+ st.session_state.expanded_value = True
164
+ st.session_state.query = True
165
+
166
+ st.session_state.update_component += 1
167
+ scroll_page()
168
+
169
+ # ------ User Interface Settings ------ #
170
+
171
+ st.info(
172
+ "🥳 [**Pre-release 1.0x: Python module, CLI, and new Streamlit app**](https://github.com/claromes/waybacktweets/releases)" # noqa: E501
173
+ )
174
+
175
+ st.image(TITLE, use_column_width="never")
176
+ st.caption(
177
+ "[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)" # noqa: E501
178
+ )
179
+ st.write(
180
+ "Retrieves archived tweets CDX data in HTML (for easy viewing of the tweets using the `iframe` tag), CSV, and JSON formats." # noqa: E501
181
+ )
182
+
183
+ st.write(
184
+ "This application uses the Wayback Tweets Python package, which can be used either as a module or as a standalone command-line tool. [Read the documentation](https://claromes.github.io/waybacktweets) for more information." # noqa: E501
185
+ )
186
+
187
+ st.write(
188
+ "To access the legacy version of Wayback Tweets, [click here](https://waybacktweets-legacy.streamlit.app)." # noqa: E501
189
+ )
190
+
191
+ st.divider()
192
+
193
+ # -- Filters -- #
194
+
195
+ username = st.text_input(
196
+ "Username *",
197
+ value=st.session_state.username_value,
198
+ key="username",
199
+ placeholder="Without @",
200
+ )
201
+
202
+ with st.expander("Filtering", expanded=st.session_state.expanded_value):
203
+
204
+ st.session_state.archived_timestamp_filter = st.date_input(
205
+ "Tweets saved between",
206
+ (start_date, end_date),
207
+ min_date,
208
+ end_date,
209
+ format="YYYY/MM/DD",
210
+ help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
211
+ )
212
+ st.caption(
213
+ ":orange[note: large date range takes a long time to process, and the app's resources may not be sufficient. Try to perform searches with smaller ranges to get faster results.]" # noqa: E501
214
+ )
215
+
216
+ col1, col2 = st.columns(2)
217
+
218
+ with col1:
219
+ limit = st.text_input(
220
+ "Limit",
221
+ key="limit",
222
+ help="Query result limits",
223
+ )
224
+
225
+ with col2:
226
+ offset = st.text_input(
227
+ "Offset",
228
+ key="offset",
229
+ help="Allows for a simple way to scroll through the results",
230
+ )
231
+
232
+ unique = st.checkbox(
233
+ "Only unique Wayback Machine URLs",
234
+ key="unique",
235
+ help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
236
+ )
237
+ st.caption(
238
+ ":orange[note: according to the official documentation of the Wayback CDX Server API, the query to retrieve unique URLs may be slow at the moment.]" # noqa: E501
239
+ )
240
+
241
+
242
+ query = st.button("Query", type="primary", use_container_width=True)
243
+
244
+ if st.query_params.username == "":
245
+ st.query_params.clear()
246
+ st.session_state.query = query
247
+
248
+ # ------ Results ------ #
249
+
250
+ if username != st.session_state.current_username:
251
+ st.session_state.current_username = username
252
+
253
+ if st.session_state.query or st.session_state.count:
254
+ if unique:
255
+ collapse = "urlkey"
256
+ matchtype = "prefix"
257
+
258
+ try:
259
+ with st.spinner(
260
+ f"Waybacking @{st.session_state.current_username}'s archived tweets"
261
+ ):
262
+ wayback_tweets = wayback_tweets(
263
+ st.session_state.current_username,
264
+ collapse,
265
+ st.session_state.archived_timestamp_filter[0],
266
+ st.session_state.archived_timestamp_filter[1],
267
+ limit,
268
+ offset,
269
+ matchtype,
270
+ )
271
+
272
+ if not wayback_tweets:
273
+ st.error("No data was saved due to an empty response.")
274
+ st.stop()
275
+
276
+ with st.spinner(
277
+ f"Parsing @{st.session_state.current_username}'s archived tweets"
278
+ ):
279
+ parsed_tweets = tweets_parser(
280
+ wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
281
+ )
282
+
283
+ df, file_name = tweets_exporter(
284
+ parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
285
+ )
286
+
287
+ csv_data = df.to_csv(index=False)
288
+ json_data = df.to_json(orient="records", lines=False)
289
+ html = HTMLTweetsVisualizer(username, json_data)
290
+ html_content = html.generate()
291
+
292
+ # -- Rendering -- #
293
+
294
+ if csv_data and json_data and html_content:
295
+ st.session_state.count = len(df)
296
+ st.write(f"**{st.session_state.count} URLs have been captured**")
297
+
298
+ # -- HTML -- #
299
+
300
+ st.header("HTML", divider="gray", anchor=False)
301
+ st.write(
302
+ f"Visualize tweets more efficiently through `iframes`. Download the @{st.session_state.current_username}'s archived tweets in HTML." # noqa: E501
303
+ )
304
+
305
+ col5, col6 = st.columns([1, 18])
306
+
307
+ with col5:
308
+ st.image(DOWNLOAD, width=22)
309
+
310
+ with col6:
311
+ b64_html = base64.b64encode(html_content.encode()).decode()
312
+ href_html = f"data:text/html;base64,{b64_html}"
313
+
314
+ st.markdown(
315
+ f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>', # noqa: E501
316
+ unsafe_allow_html=True,
317
+ )
318
+
319
+ # -- CSV -- #
320
+
321
+ st.header("CSV", divider="gray", anchor=False)
322
+ st.write(
323
+ "Check the data returned in the dataframe below and download the file."
324
+ )
325
+
326
+ col7, col8 = st.columns([1, 18])
327
+
328
+ with col7:
329
+ st.image(DOWNLOAD, width=22)
330
+
331
+ with col8:
332
+ b64_csv = base64.b64encode(csv_data.encode()).decode()
333
+ href_csv = f"data:file/csv;base64,{b64_csv}"
334
+
335
+ st.markdown(
336
+ f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>', # noqa: E501
337
+ unsafe_allow_html=True,
338
+ )
339
+
340
+ st.dataframe(df, use_container_width=True)
341
+
342
+ # -- JSON -- #
343
+
344
+ st.header("JSON", divider="gray", anchor=False)
345
+ st.write(
346
+ "Check the data returned in JSON format below and download the file."
347
+ )
348
+
349
+ col9, col10 = st.columns([1, 18])
350
+
351
+ with col9:
352
+ st.image(DOWNLOAD, width=22)
353
+
354
+ with col10:
355
+ b64_json = base64.b64encode(json_data.encode()).decode()
356
+ href_json = f"data:file/json;base64,{b64_json}"
357
+
358
+ st.markdown(
359
+ f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>', # noqa: E501
360
+ unsafe_allow_html=True,
361
+ )
362
+
363
+ st.json(json_data, expanded=False)
364
+ except TypeError as e:
365
+ st.error(
366
+ f"""
367
+ {e}. Refresh this page and try again.
368
+
369
+ If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues).""" # noqa: E501
370
+ )
371
+ st.stop()
372
+ except Exception as e:
373
+ st.error(str(e))
374
+ st.stop()
app/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ streamlit==1.36.0
2
+ waybacktweets==1.0a5
assets/download.svg ADDED
assets/parthenon.png ADDED
assets/waybacktweets.png ADDED
assets/waybacktweets_title.png ADDED
docs/Makefile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Minimal makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line, and also
5
+ # from the environment for the first two.
6
+ SPHINXOPTS ?=
7
+ SPHINXBUILD ?= sphinx-build
8
+ SOURCEDIR = .
9
+ BUILDDIR = _build
10
+
11
+ # Put it first so that "make" without argument is like "make help".
12
+ help:
13
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14
+
15
+ .PHONY: help Makefile
16
+
17
+ # Catch-all target: route all unknown targets to Sphinx using the new
18
+ # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19
+ %: Makefile
20
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
docs/_static/card.png ADDED
docs/_static/css/custom.css ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ body {
2
+ font-family: Georgia, 'Times New Roman', Times, serif;
3
+ background-color: whitesmoke;
4
+ }
5
+
6
+ a:hover {
7
+ background-color: whitesmoke !important;
8
+ }
9
+
10
+ #cli #usage #waybacktweets h3,
11
+ #cli .admonition-title,
12
+ .sphinxsidebarwrapper li ul li ul:has(a[href="#waybacktweets"]):last-child {
13
+ display: none;
14
+ }
docs/_templates/page.html ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% extends "!page.html" %}
2
+
3
+ {% block extrahead %}
4
+ {{ super() }}
5
+ <meta name="description" content="Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data">
6
+
7
+ <meta property="og:title" content="{{ title|e }}" />
8
+ <meta property="og:description" content="Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data">
9
+ <meta property="og:image" content="https://claromes.github.io/waybacktweets/_static/card.png" />
10
+
11
+ <meta name="twitter:title" content="{{ title|e }}">
12
+ <meta name="twitter:description" content="Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data">
13
+ <meta property="twitter:image" content="https://claromes.github.io/waybacktweets/_static/card.png" />
14
+ {% endblock %}
docs/api.rst ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ API
2
+ ====
3
+
4
+ Request
5
+ ---------
6
+
7
+ .. automodule:: waybacktweets.api.request
8
+
9
+ .. autoclass:: WaybackTweets
10
+ :members:
11
+
12
+ .. _parser:
13
+
14
+ Parse
15
+ ---------
16
+
17
+ .. automodule:: waybacktweets.api.parse
18
+
19
+ .. autoclass:: TweetsParser
20
+ :members:
21
+ :private-members:
22
+
23
+ .. autoclass:: TwitterEmbed
24
+ :members:
25
+
26
+ .. autoclass:: JsonParser
27
+ :members:
28
+
29
+ .. _exporter:
30
+
31
+ Export
32
+ ---------
33
+
34
+ .. automodule:: waybacktweets.api.export
35
+
36
+ .. autoclass:: TweetsExporter
37
+ :members:
38
+ :private-members:
39
+
40
+ Visualize
41
+ -----------
42
+
43
+ .. automodule:: waybacktweets.api.visualize
44
+
45
+ .. autoclass:: HTMLTweetsVisualizer
46
+ :members:
47
+ :private-members:
48
+
49
+ .. _utils:
50
+
51
+ Utils
52
+ -------
53
+
54
+ .. automodule:: waybacktweets.utils.utils
55
+
56
+ .. autofunction:: check_double_status
57
+ .. autofunction:: check_pattern_tweet
58
+ .. autofunction:: check_url_scheme
59
+ .. autofunction:: clean_tweet_url
60
+ .. autofunction:: clean_wayback_machine_url
61
+ .. autofunction:: delete_tweet_pathnames
62
+ .. autofunction:: get_response
63
+ .. autofunction:: is_tweet_url
64
+ .. autofunction:: semicolon_parser
65
+ .. autofunction:: timestamp_parser
66
+
67
+
68
+ Config
69
+ ------------
70
+
71
+ .. automodule:: waybacktweets.config.config
72
+ :members:
docs/cli.rst ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CLI
2
+ ================
3
+
4
+ Usage
5
+ ---------
6
+
7
+ .. click:: waybacktweets._cli:main
8
+ :prog: waybacktweets
9
+ :nested: full
10
+
11
+ Collapsing
12
+ ------------
13
+
14
+ The Wayback Tweets command line tool recommends the use of three types of "collapse": the ``urlkey``, ``digest``, and ``timestamp`` fields.
15
+
16
+ - ``urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing.
17
+
18
+ - ``digest``: (`str`) The ``SHA1`` hash digest of the content, excluding the headers. It's usually a base-32-encoded string.
19
+
20
+ - ``timestamp``: (`datetime`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format. We recommend ``YYYYMMDD``.
21
+
22
+ However, it is possible to use it with other options. Read the text below, extracted from the official Wayback CDX Server API (Beta) documentation.
23
+
24
+ .. note::
25
+
26
+ A new form of filtering is the option to "collapse" results based on a field, or a substring of a field. Collapsing is done on adjacent CDX lines where all captures after the first one that are duplicate are filtered out. This is useful for filtering out captures that are "too dense" or when looking for unique captures.
27
+
28
+ To use collapsing, add one or more ``collapse=field`` or ``collapse=field:N`` where ``N`` is the first ``N`` characters of field to test.
29
+
30
+ - Ex: Only show at most 1 capture per hour (compare the first 10 digits of the ``timestamp`` field). Given 2 captures ``20130226010000`` and ``20130226010800``, since first 10 digits ``2013022601`` match, the 2nd capture will be filtered out:
31
+
32
+ http://web.archive.org/cdx/search/cdx?url=google.com&collapse=timestamp:10
33
+
34
+ The calendar page at `web.archive.org` uses this filter by default: `http://web.archive.org/web/*/archive.org`
35
+
36
+ - Ex: Only show unique captures by ``digest`` (note that only adjacent digest are collapsed, duplicates elsewhere in the cdx are not affected):
37
+
38
+ http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=digest
39
+
40
+ - Ex: Only show unique urls in a prefix query (filtering out captures except first capture of a given url). This is similar to the old prefix query in wayback (note: this query may be slow at the moment):
41
+
42
+ http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=urlkey&matchType=prefix
43
+
44
+
45
+ URL Match Scope
46
+ -----------------
47
+
48
+ The CDX Server can return results matching a certain prefix, a certain host or all subdomains by using the ``matchType`` param.
49
+
50
+ The package ``waybacktweets`` uses the pathname ``/status`` followed by the wildcard '*' at the end of the URL to retrieve only tweets. However, if a value is provided for this parameter, the search will be made from the URL `twitter.com/<USERNAME>`.
51
+
52
+ Read the text below, extracted from the official Wayback CDX Server API (Beta) documentation.
53
+
54
+ .. note::
55
+
56
+ For example, if given the url: archive.org/about/ and:
57
+
58
+ - ``matchType=exact`` (default if omitted) will return results matching exactly archive.org/about/
59
+
60
+ - ``matchType=prefix`` will return results for all results under the path archive.org/about/
61
+
62
+ http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=prefix&limit=1000
63
+
64
+ - ``matchType=host`` will return results from host archive.org
65
+
66
+ http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=host&limit=1000
67
+
68
+ - ``matchType=domain`` will return results from host archive.org and all subhosts \*.archive.org
69
+
70
+ http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=domain&limit=1000
71
+
72
+ The matchType may also be set implicitly by using wildcard '*' at end or beginning of the url:
73
+
74
+ - If the url ends in '/\*', e.g. url=archive.org/\* the query is equivalent to url=archive.org/&matchType=prefix
75
+ - If url starts with '\*.', eg url=\*.archive.org/ the query is equivalent to url=archive.org/&matchType=domain
76
+
77
+ (Note: The domain mode is only available if the CDX is in `SURT <http://crawler.archive.org/articles/user_manual/glossary.html#surt>`_-order format.)
docs/conf.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Sphinx configuration for the Wayback Tweets documentation."""

import datetime

from pallets_sphinx_themes import ProjectLink, get_version

# -- Project information -----------------------------------------------------

project = "Wayback Tweets"
release, version = get_version("waybacktweets")
rst_epilog = f".. |release| replace:: v{release}"
copyright = f"2023 - {datetime.datetime.now().year}, Claromes · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License · Pre-release: v{release}"  # noqa: E501
author = "Claromes"

# -- General configuration ---------------------------------------------------

extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.napoleon",
    "sphinx.ext.extlinks",
    "sphinx.ext.intersphinx",
    "pallets_sphinx_themes",
    "sphinxcontrib.mermaid",
    "sphinx_new_tab_link",
    "sphinx_click.ext",
    "sphinx_autodoc_typehints",
]

templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# Render type hints inside the parameter descriptions instead of signatures.
autodoc_typehints = "description"

# -- Options for HTML output -------------------------------------------------

html_theme = "flask"
html_static_path = ["_static"]
html_css_files = ["css/custom.css"]
# Sidebar "project links" rendered by the Pallets theme.
html_context = {
    "project_links": [
        ProjectLink("PyPI Releases", "https://pypi.org/project/waybacktweets/"),
        ProjectLink("Source Code", "https://github.com/claromes/waybacktweets/"),
        ProjectLink(
            "Issue Tracker", "https://github.com/claromes/waybacktweets/issues/"
        ),
        ProjectLink("Mastodon", "https://ruby.social/@claromes"),
        ProjectLink("Bluesky", "https://bsky.app/profile/claromes.com"),
    ]
}
html_sidebars = {
    "index": ["project.html", "localtoc.html", "searchbox.html"],
    "**": ["localtoc.html", "relations.html", "searchbox.html"],
}
html_favicon = "../assets/parthenon.png"
html_logo = "../assets/parthenon.png"
html_title = f"Wayback Tweets Documentation ({version})"
html_show_sourcelink = False
docs/contribute.rst ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Contribute
2
+ ================
3
+
4
+ Here are all the ways you can contribute to this project.
5
+
6
+ Testing
7
+ ---------
8
+
9
+ The best way to help is by using the package, either on the command line or as a module, suggesting improvements and reporting bugs. You're very welcome to `open an issue <https://github.com/claromes/waybacktweets/issues/>`_.
10
+
11
+
12
+ Hacking
13
+ ---------
14
+
15
+ If you have Python skills, contribute to the `code <https://github.com/claromes/waybacktweets/>`_.
16
+
17
+ These are the prerequisites:
18
+
19
+ - Python 3.10+
20
+ - Poetry
21
+
22
+ Install from the source, following the :ref:`installation` instructions.
23
+
24
+ Brief explanation about the code under the Wayback Tweets directory:
25
+
26
+ - ``app``: Streamlit application code
27
+ - ``assets``: Title and logo images
28
+ - ``docs``: Documentation generated with Sphinx
29
+ - ``legacy_app``: Legacy Streamlit application code
30
+ - ``waybacktweets/api``: Main package modules
31
+ - ``waybacktweets/config``: Global configuration module
32
+ - ``waybacktweets/exceptions``: Wayback Tweets Exceptions
33
+ - ``waybacktweets/utils``: Helper functions used in the package
34
+
35
+ Sponsoring
36
+ ------------
37
+
38
+ You can also donate to the project's developer and maintainer, `Claromes <https://claromes.com>`_, via `GitHub Sponsor <https://github.com/sponsors/claromes>`_ or if you are interested in sponsoring the project you can contact via email at support at claromes dot com.
docs/exceptions.rst ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Exceptions
2
+ ================
3
+
4
+ These are the most common errors and are handled by the ``waybacktweets`` package.
5
+
6
+ ReadTimeoutError
7
+ ------------------
8
+
9
+ This error occurs when a request to the web.archive.org server takes too long to respond. The server could be overloaded or there could be network issues.
10
+
11
+ The output message from the package would be: ``Connection to web.archive.org timed out.``
12
+
13
+ ConnectionError
14
+ ------------------
15
+
16
+ This error is raised when the package fails to establish a new connection with web.archive.org. This could be due to network issues or the server being down.
17
+
18
+ The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.``
19
+
20
+
21
+ This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.
22
+
23
+ The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
24
+
25
+ HTTPError
26
+ ------------------
27
+
28
+ This error occurs when the Internet Archive services are temporarily offline. This could be due to maintenance or server issues.
29
+
30
+ The output message from the package would be: ``Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information.``
31
+
32
+ EmptyResponseError
33
+ ---------------------
34
+
35
+ This exception is raised for empty responses.
36
+
37
+ The output message from the package would be: ``No data was saved due to an empty response.``
38
+
39
+ Warning
40
+ ------------------
41
+
42
+ It is possible to encounter the following warning when running the ``TweetsParser`` class (:ref:`parser`): ``<TWEET_URL> not available on the user's Twitter account, but the CDX data was saved.``
43
+
44
+ This occurs when the original tweet is no longer available on Twitter and has possibly been deleted.
docs/field_options.rst ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. _field_options:
2
+
3
+ Field Options
4
+ ================
5
+
6
+ The package performs several parses to facilitate the analysis of archived tweets and types of tweets. The fields below are available, which can be passed to the :ref:`parser` and :ref:`exporter`, in addition, the command line tool returns all these fields.
7
+
8
+ - ``archived_urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing.
9
+
10
+ - ``archived_timestamp``: (`str`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format.
11
+
12
+ - ``parsed_archived_timestamp``: (`str`) The ``archived_timestamp`` in human-readable format.
13
+
14
+ - ``archived_tweet_url``: (`str`) The archived URL.
15
+
16
+ - ``parsed_archived_tweet_url``: (`str`) The archived URL after parsing. It is not guaranteed that this option will be archived, it is just a facilitator, as the originally archived URL does not always exist, due to changes in URLs and web services of the social network Twitter. Check the :ref:`utils`.
17
+
18
+ - ``original_tweet_url``: (`str`) The original tweet URL.
19
+
20
+ - ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary. Check the :ref:`utils`.
21
+
22
+ - ``available_tweet_text``: (`str`) The tweet text extracted from the URL that is still available on the Twitter account.
23
+
24
+ - ``available_tweet_is_RT``: (`bool`) Whether the tweet from the ``available_tweet_text`` field is a retweet or not.
25
+
26
+ - ``available_tweet_info``: (`str`) Name and date of the tweet from the ``available_tweet_text`` field.
27
+
28
+ - ``archived_mimetype``: (`str`) The mimetype of the archived content, which can be one of these:
29
+
30
+ - ``text/html``
31
+
32
+ - ``warc/revisit``
33
+
34
+ - ``application/json``
35
+
36
+ - ``unk``
37
+
38
+ - ``archived_statuscode``: (`str`) The HTTP status code of the snapshot. If the mimetype is ``warc/revisit``, the value returned for the ``statuscode`` key can be blank, but the actual value is the same as that of any other entry that has the same ``digest`` as this entry. If the mimetype is ``application/json``, the value is usually empty or ``-``.
39
+
40
+ - ``archived_digest``: (`str`) The ``SHA1`` hash digest of the content, excluding the headers. It's usually a base-32-encoded string.
41
+
42
+ - ``archived_length``: (`int`) The compressed byte size of the corresponding WARC record, which includes WARC headers, HTTP headers, and content payload.
docs/index.rst ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. rst-class:: hide-header
2
+
3
+ Wayback Tweets
4
+ ================
5
+
6
+ .. image:: ../assets/waybacktweets_title.png
7
+ :alt: Wayback Tweets
8
+ :align: center
9
+
10
+ Pre-release: |release|
11
+
12
+ Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing (see :ref:`field_options`), and saves the data in HTML (for easy viewing of the tweets using the ``iframe`` tag), CSV, and JSON formats.
13
+
14
+ .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.12528448.svg
15
+ :target: https://doi.org/10.5281/zenodo.12528448
16
+
17
+ .. note::
18
+ Intensive queries can lead to rate limiting, resulting in a temporary ban of a few minutes from web.archive.org.
19
+
20
+
21
+ User Guide
22
+ ------------
23
+
24
+ .. toctree::
25
+ :maxdepth: 2
26
+
27
+ installation
28
+ quickstart
29
+ workflow
30
+ field_options
31
+ outputs
32
+ exceptions
33
+ contribute
34
+ todo
35
+
36
+
37
+ Command-Line Interface
38
+ ------------------------
39
+ .. toctree::
40
+ :maxdepth: 2
41
+
42
+ cli
43
+
44
+ Streamlit Web App
45
+ -------------------
46
+
47
+ .. toctree::
48
+ :maxdepth: 2
49
+
50
+ streamlit
51
+
52
+
53
+ API Reference
54
+ ---------------
55
+
56
+ .. toctree::
57
+ :maxdepth: 2
58
+
59
+ api
60
+
61
+
62
+ Additional Information
63
+ -----------------------
64
+
65
+ .. toctree::
66
+ :maxdepth: 1
67
+
68
+ .. raw:: html
69
+
70
+ <ul>
71
+ <li><a href="https://github.com/claromes/waybacktweets/blob/main/LICENSE.md" target="_blank">GPL-3.0 license</a></li>
72
+ <li><a href="https://github.com/claromes/waybacktweets/releases" target="_blank">Changes</a></li>
73
+ </ul>
74
+
75
+ Indices and tables
76
+ ----------------------
77
+
78
+ .. toctree::
79
+ :maxdepth: 2
80
+
81
+ genindex
82
+ modindex
83
+ search
docs/installation.rst ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. _installation:
2
+
3
+ Installation
4
+ ================
5
+
6
+
7
+ Using pip
8
+ ------------
9
+
10
+ .. code-block:: shell
11
+
12
+ pip install waybacktweets
13
+
14
+ From source
15
+ -------------
16
+
17
+ Clone the repository:
18
+
19
+ .. code-block:: shell
20
+
21
+ git clone git@github.com:claromes/waybacktweets.git
22
+
23
+ Change directory:
24
+
25
+ .. code-block:: shell
26
+
27
+ cd waybacktweets
28
+
29
+ Install poetry, if you haven't already:
30
+
31
+ .. code-block:: shell
32
+
33
+ pip install poetry
34
+
35
+
36
+ Install the dependencies:
37
+
38
+ .. code-block:: shell
39
+
40
+ poetry install
41
+
42
+ Run the CLI:
43
+
44
+ .. code-block:: shell
45
+
46
+ poetry run waybacktweets [SUBCOMMANDS]
47
+
48
+ Run the Streamlit App:
49
+
50
+ .. code-block:: shell
51
+
52
+ streamlit run app/app.py
53
+
54
+ Build the docs:
55
+
56
+ .. code-block:: shell
57
+
58
+ cd docs
59
+
60
+ .. code-block:: shell
61
+
62
+ make clean html
63
+
64
+ `Read the Poetry CLI documentation <https://python-poetry.org/docs/cli/>`_.
docs/make.bat ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @ECHO OFF
2
+
3
+ pushd %~dp0
4
+
5
+ REM Command file for Sphinx documentation
6
+
7
+ if "%SPHINXBUILD%" == "" (
8
+ set SPHINXBUILD=sphinx-build
9
+ )
10
+ set SOURCEDIR=.
11
+ set BUILDDIR=_build
12
+
13
+ %SPHINXBUILD% >NUL 2>NUL
14
+ if errorlevel 9009 (
15
+ echo.
16
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17
+ echo.installed, then set the SPHINXBUILD environment variable to point
18
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
19
+ echo.may add the Sphinx directory to PATH.
20
+ echo.
21
+ echo.If you don't have Sphinx installed, grab it from
22
+ echo.https://www.sphinx-doc.org/
23
+ exit /b 1
24
+ )
25
+
26
+ if "%1" == "" goto help
27
+
28
+ %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29
+ goto end
30
+
31
+ :help
32
+ %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33
+
34
+ :end
35
+ popd
docs/outputs.rst ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Outputs
2
+ ==========
3
+
4
+ It is possible to save the CDX data in three formats. In the command line tool, these three formats are saved automatically.
5
+
6
+ HTML
7
+ --------
8
+
9
+ This format allows for easy viewing of the archived tweets, through the use of the ``iframe`` tag. Each tweet contains four viewing options, which render when clicking on the accordion:
10
+
11
+ - ``archived_tweet_url``: (`str`) The archived URL.
12
+
13
+ - ``parsed_archived_tweet_url``: (`str`) The archived URL after parsing. It is not guaranteed that this option will be archived, it is just a facilitator, as the originally archived URL does not always exist, due to changes in URLs and web services of the social network Twitter. Check the :ref:`utils`.
14
+
15
+ - ``original_tweet_url``: (`str`) The original tweet URL.
16
+
17
+ - ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary. Check the :ref:`utils`.
18
+
19
+ Additionally, other fields are displayed.
20
+
21
+ CSV
22
+ --------
23
+
24
+ Option to analyze the CDX data in comma-separated values.
25
+
26
+ JSON
27
+ --------
28
+
29
+ Option to analyze the data in JavaScript Object Notation.
docs/quickstart.rst ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Quickstart
2
+ ================
3
+
4
+ CLI
5
+ -------------
6
+
7
+ Using Wayback Tweets as a standalone command line tool.
8
+
9
+ waybacktweets [OPTIONS] USERNAME
10
+
11
+ .. code-block:: shell
12
+
13
+ waybacktweets --from 20150101 --to 20191231 --limit 250 jack
14
+
15
+ Web App
16
+ -------------
17
+
18
+ Using Wayback Tweets as a Streamlit Web App.
19
+
20
+ `Open the application <https://waybacktweets.streamlit.app>`_, a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud.
21
+
22
+ Module
23
+ -------------
24
+
25
+ Using Wayback Tweets as a Python Module.
26
+
27
+ .. code-block:: python
28
+
29
+ from waybacktweets import WaybackTweets, TweetsParser, TweetsExporter
30
+
31
+ USERNAME = "jack"
32
+
33
+ api = WaybackTweets(USERNAME)
34
+ archived_tweets = api.get()
35
+
36
+ if archived_tweets:
37
+ field_options = [
38
+ "archived_timestamp",
39
+ "original_tweet_url",
40
+ "archived_tweet_url",
41
+ "archived_statuscode",
42
+ ]
43
+
44
+ parser = TweetsParser(archived_tweets, USERNAME, field_options)
45
+ parsed_tweets = parser.parse()
46
+
47
+ exporter = TweetsExporter(parsed_tweets, USERNAME, field_options)
48
+ exporter.save_to_csv()
docs/streamlit.rst ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Web App
2
+ =========
3
+
4
+ The application is a prototype hosted on Streamlit Cloud, serving as an alternative to the command line tool.
5
+
6
+ `Open the application <https://waybacktweets.streamlit.app>`_.
7
+
8
+
9
+ Filters
10
+ ----------
11
+
12
+ - Filtering by date range: Using the ``from`` and ``to`` filters
13
+
14
+ - Limit: Query result limits.
15
+
16
+ - Offset: Allows for a simple way to scroll through the results.
17
+
18
+ - Only unique Wayback Machine URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix``
19
+
20
+
21
+ Username Query Parameter
22
+ --------------------------
23
+
24
+ An alternative way to access the application is by using the ``username`` query parameter. This automatically fills in the Username input and runs the search. Additionally, when the ``username`` parameter is sent, the accordion with the filters will already be open.
25
+
26
+ Example URL format:
27
+
28
+ ``https://waybacktweets.streamlit.app?username=<USERNAME>``
29
+
30
+
31
+ Community Comments
32
+ --------------------
33
+
34
+ .. raw:: html
35
+
36
+ <ul>
37
+ <li>"We're always delighted when we see our community members create tools for open source research." <a href="https://twitter.com/bellingcat/status/1728085974138122604" target="_blank">Bellingcat</a></li>
38
+ <br>
39
+ <li>"#myOSINTtip Clarissa Mendes launched a new tool for accessing old tweets via archive.org called the Wayback Tweets app. For those who love to look deeper at #osint tools, it is available on GitHub and uses the Wayback CDX Server API server (which is a hidden gem for accessing archive.org data!)" <a href="https://www.linkedin.com/posts/my-osint-training_myosinttip-osint-activity-7148425933324963841-0Q2n/" target="_blank">My OSINT Training</a></li>
40
+ <br>
41
+ <li>"Original way to find deleted tweets." <a href="https://twitter.com/henkvaness/status/1693298101765701676" target="_blank">Henk Van Ess</a></li>
42
+ <br>
43
+ <li>"This is an excellent tool to use now that most Twitter API-based tools have gone down with changes to the pricing structure over at X." <a href="https://osintnewsletter.com/p/22#%C2%A7osint-community" target="_blank">The OSINT Newsletter - Issue #22</a></li>
44
+ <br>
45
+ <li>"One of the keys to using the Wayback Machine effectively is knowing what it can and can't archive. It can, and has, archived many, many Twitter accounts... Utilize fun tools such as Wayback Tweets to do so more effectively." <a href="https://memeticwarfareweekly.substack.com/p/mww-paradise-by-the-telegram-dashboard" target="_blank">Ari Ben Am</a></li>
46
+ <br>
47
+ <li>"Want to see archived tweets on Wayback Machine in bulk? You can use Wayback Tweets." <a href="https://twitter.com/DailyOsint/status/1695065018662855102" target="_blank">Daily OSINT</a></li>
48
+ <br>
49
+ <li>"Untuk mempermudah penelusuran arsip, gunakan Wayback Tweets." <a href="https://twitter.com/gijnIndonesia/status/1685912219408805888" target="_blank">GIJN Indonesia</a></li>
50
+ <br>
51
+ <li>"A tool to quickly view tweets saved on archive.org." <a href="https://irinatechtips.substack.com/p/irina_tech_tips-newsletter-3-2023#%C2%A7wayback-tweets" target="_blank">Irina_Tech_Tips Newsletter #3</a></li>
52
+ <br>
53
+ </ul>
54
+
55
+ Legacy App
56
+ -------------
57
+
58
+ To access the legacy version of Wayback Tweets `click here <https://waybacktweets-legacy.streamlit.app>`_.
59
+
60
+ .. note::
61
+
62
+ If the application is down, please check the `Streamlit Cloud Status <https://www.streamlitstatus.com/>`_.
63
+
docs/todo.rst ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TODO
2
+ ================
3
+
4
+ .. |uncheck| raw:: html
5
+
6
+ <input type="checkbox">
7
+
8
+ |uncheck| Unit Tests
9
+
10
+ |uncheck| JSON Parser: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse.py:110``), and avoid rate limiting
11
+
12
+ |uncheck| Download images when tweet URL has extensions like JPG or PNG
13
+
14
+ |uncheck| Implement logging system (remove print statements)
15
+
16
+ |uncheck| Mapping and parsing of other Twitter-related URLs
17
+
18
+ |uncheck| Develop a scraper to download snapshots from https://archive.today
docs/workflow.rst ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. _flowchart:
2
+
3
+ Workflow
4
+ ================
5
+
6
+ The tool was written following a proposal not only to retrieve data from archived tweets, but also to facilitate the reading of these tweets. Therefore, a flow is defined to obtain these results in the best possible way.
7
+
8
+ Due to limitations of the Wayback CDX Server API, it is not always possible to parse the results with the mimetype ``application/json``; regardless, the data in CDX format is saved.
9
+
10
+ Use the mouse to zoom in and out of the flowchart.
11
+
12
+ .. mermaid::
13
+ :zoom:
14
+ :align: center
15
+
16
+ flowchart TB
17
+ A[input Username]--> B[(Wayback Machine)]
18
+ B--> B1[save Archived Tweets CDX data]
19
+ B1--> |parsing| C{embed Tweet URL\nvia Twitter Publisher}
20
+ C--> |2xx/3xx| D[return Tweet text]
21
+ C--> |4xx| E[return None]
22
+ E--> F{request Archived\nTweet URL}
23
+ F--> |4xx| G[return Only CDX data]
24
+ F--> |TODO: 2xx/3xx: application/json| J[return JSON text]
25
+ F--> |2xx/3xx: text/html, warc/revisit, unk| K[return HTML iframe tag]
legacy_app/legacy_app.py ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import re
3
+ from urllib.parse import unquote
4
+
5
+ import requests
6
+ import streamlit as st
7
+ import streamlit.components.v1 as components
8
+
9
year = datetime.datetime.now().year

st.set_page_config(
    page_title="Wayback Tweets",
    page_icon="🏛️",
    layout="centered",
    menu_items={
        "About": """
        ## 🏛️ Wayback Tweets

        Tool that displays, via Wayback CDX Server API, multiple archived tweets on Wayback Machine to avoid opening each link manually. Users can apply filters based on specific years and view tweets that do not have the original URL available.

        This tool is a prototype, please feel free to send your [feedbacks](https://github.com/claromes/waybacktweets/issues). Created by [@claromes](https://claromes.com).

        -------
        """,  # noqa: E501
    },
)

# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
hide_streamlit_style = """
<style>
    header[data-testid="stHeader"] {
        opacity: 0.5;
    }
    iframe {
        border: 1px solid #dddddd;
        border-radius: 0.5rem;
    }
    div[data-testid="InputInstructions"] {
        visibility: hidden;
    }
</style>
"""

st.markdown(hide_streamlit_style, unsafe_allow_html=True)

# One-time defaults for every session-state key the app reads or writes.
_session_defaults = {
    "current_handle": "",
    "prev_disabled": False,
    "next_disabled": False,
    "next_button": False,
    "prev_button": False,
    "update_component": 0,
    "offset": 0,
    "saved_at": (2006, year),
    "count": False,
}

for _key, _default in _session_defaults.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
72
+
73
+
74
def scroll_into_view():
    """Scroll the Streamlit main panel back to the top.

    Renders an invisible HTML component whose script performs the scroll;
    ``update_component`` is interpolated so each call yields a new
    component and the script runs again.
    """
    markup = f"""
    <script>
        window.parent.document.querySelector('section.main').scrollTo(0, 0);
        let update_component = {st.session_state.update_component} // Force component update to generate scroll
    </script>
    """  # noqa: E501

    components.html(markup, width=0, height=0)
83
+
84
+
85
def clean_tweet(tweet):
    """Rebuild a canonical tweet URL for the current handle, if possible.

    Returns the input unchanged when no ``/status/<id>`` segment is found
    or the current handle does not appear in the URL.
    """
    current = st.session_state.current_handle
    lowered = tweet.lower()

    status_re = re.compile(r"/status/(\d+)")
    lower_match = status_re.search(lowered)
    original_match = status_re.search(tweet)

    if not (lower_match and current.lower() in lowered):
        return tweet

    return f"https://twitter.com/{current}/status/{original_match.group(1)}"  # noqa: E501
97
+
98
+
99
def clean_link(link):
    """Rebuild a canonical Wayback Machine URL for the current handle.

    NOTE(review): relies on the module-level ``timestamp`` list and loop
    index ``i`` being set by the main script before this is called.
    Falls back to the *lowercased* input when no status id is found.
    """
    current = st.session_state.current_handle
    lowered = link.lower()

    match = re.compile(r"/status/(\d+)").search(lowered)
    if match and current.lower() in lowered:
        return f"https://web.archive.org/web/{timestamp[i]}/https://twitter.com/{current}/status/{match.group(1)}"  # noqa: E501

    return lowered
110
+
111
+
112
def pattern_tweet(tweet):
    """Unwrap quoted targets embedded after ``/status/``.

    Handles replies (``/status//``), links (``/status///``) and twimg
    URLs (``/status/https://pbs``); anything else passes through as-is.
    """
    quoted = re.search(r'/status/"([^"]+)"', tweet)
    return quoted.group(1).lstrip("/") if quoted else tweet
124
+
125
+
126
def pattern_tweet_id(tweet):
    """Strip any sub-endpoint (``/photo``, ``/likes``, ...) from a status URL.

    Returns the input unchanged when it does not look like a
    ``https://twitter.com/<user>/status/<id>`` URL.
    """
    user_match = re.match(r"https://twitter\.com/([^/]+)/status/\d+", tweet)
    id_match = re.search(r"https://twitter.com/\w+/status/(\d+)", tweet)

    if not (user_match and id_match):
        return tweet

    return f"https://twitter.com/{user_match.group(1)}/status/{id_match.group(1)}"
140
+
141
+
142
def check_double_status(url_wb, url_tweet):
    """Detect nested archiving: two ``/status/`` segments in the Wayback
    URL while the embedded URL is not a twitter.com link."""
    nested = url_wb.count("/status/") == 2
    return nested and "twitter.com" not in url_tweet
147
+
148
+
149
def embed(tweet):
    """Fetch tweet text and author via Twitter's oEmbed endpoint.

    Args:
        tweet: Tweet URL (cleaned through ``clean_tweet`` before use).

    Returns:
        ``(status_code, tweet_content, user_info, is_RT)`` on success,
        ``False`` when the endpoint does not answer 200/302, or ``None``
        when a handled request error occurs.
    """
    try:
        url = f"https://publish.twitter.com/oembed?url={clean_tweet(tweet)}"
        # Fix: a timeout is required for requests.exceptions.Timeout below
        # to ever fire; without one the request can hang indefinitely.
        response = requests.get(url, timeout=30)

        regex = r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>'  # noqa: E501
        regex_author = r"^(.*?)\s*\("

        if response.status_code == 200 or response.status_code == 302:
            status_code = response.status_code
            html = response.json()["html"]
            author_name = response.json()["author_name"]

            matches_html = re.findall(regex, html, re.DOTALL)

            tweet_content = []
            user_info = []
            is_RT = []

            for match in matches_html:
                # Drop anchor tags but keep their inner text.
                tweet_content_match = re.sub(r"<a[^>]*>|<\/a>", "", match[0].strip())
                tweet_content_match = tweet_content_match.replace("<br>", "\n")

                user_info_match = re.sub(r"<a[^>]*>|<\/a>", "", match[1].strip())
                user_info_match = user_info_match.replace(")", "), ")

                match_author = re.search(regex_author, user_info_match)
                author_tweet = match_author.group(1)

                if tweet_content_match:
                    tweet_content.append(tweet_content_match)
                if user_info_match:
                    user_info.append(user_info_match)

                # A differing author means this capture is a retweet.
                is_RT_match = False
                if author_name != author_tweet:
                    is_RT_match = True

                is_RT.append(is_RT_match)

            return status_code, tweet_content, user_info, is_RT
        else:
            return False
    except requests.exceptions.Timeout:
        st.error("Connection to web.archive.org timed out.")
    except requests.exceptions.ConnectionError:
        st.error("Failed to establish a new connection with web.archive.org.")
    except UnboundLocalError:
        st.empty()
198
+
199
+
200
@st.cache_data(ttl=1800, show_spinner=False)
def tweets_count(handle, saved_at):
    """Count archived tweet captures for ``handle``.

    Args:
        handle: Twitter username.
        saved_at: ``(from_year, to_year)`` tuple for the CDX date filter.

    Returns:
        Number of captures, 0 when none, or ``None`` on a handled error
        or a non-200 response.
    """
    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&from={saved_at[0]}&to={saved_at[1]}"  # noqa: E501
    try:
        # Fix: timeout added so the Timeout handler below can actually
        # trigger instead of the request hanging forever.
        response = requests.get(url, timeout=30)

        if response.status_code == 200:
            data = response.json()
            if data and len(data) > 1:
                # First row of the CDX JSON output is the header.
                total_tweets = len(data) - 1
                return total_tweets
            else:
                return 0
    except requests.exceptions.Timeout:
        st.error("Connection to web.archive.org timed out.")
        st.stop()
    except requests.exceptions.ConnectionError:
        st.error("Failed to establish a new connection with web.archive.org.")
    except UnboundLocalError:
        st.empty()
220
+
221
+
222
@st.cache_data(ttl=1800, show_spinner=False)
def query_api(handle, limit, offset, saved_at):
    """Query the Wayback CDX API for archived tweet captures.

    Args:
        handle: Twitter username (stops the script when empty).
        limit: Maximum number of rows to return.
        offset: Pagination offset.
        saved_at: ``(from_year, to_year)`` tuple for the CDX date filter.

    Returns:
        Parsed CDX JSON on success, or ``None`` on a handled error.
    """
    if not handle:
        st.warning("username, please!")
        st.stop()

    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&limit={limit}&offset={offset}&from={saved_at[0]}&to={saved_at[1]}"  # noqa: E501
    try:
        # Fix: timeout added so the Timeout handler below can actually
        # trigger instead of the request hanging forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        if response.status_code == 200 or response.status_code == 304:
            return response.json()
    except requests.exceptions.Timeout:
        st.error("Connection to web.archive.org timed out.")
    except requests.exceptions.ConnectionError:
        st.error("Failed to establish a new connection with web.archive.org.")
    except UnboundLocalError:
        st.empty()
    except requests.exceptions.HTTPError:
        st.error(
            """
            **Temporarily Offline**

            Internet Archive services are temporarily offline. Please check Internet Archive [Twitter feed](https://twitter.com/internetarchive/) for the latest information.
            """  # noqa: E501
        )
        st.stop()
250
+
251
+
252
@st.cache_data(ttl=1800, show_spinner=False)
def parse_links(links):
    """Split raw CDX rows into parallel lists.

    Skips the CDX header row (``links[0]``) and returns
    ``(archived_urls, tweet_urls, mimetypes, timestamps)``.
    """
    archived_urls = []
    stamps = []
    tweet_urls = []
    mimetypes = []

    for row in links[1:]:
        # row layout: [urlkey, timestamp, original, mimetype, ...]
        raw_url = unquote(row[2]).replace("’", "")
        cleaned = pattern_tweet(raw_url).strip('"')

        archived_urls.append(f"https://web.archive.org/web/{row[1]}/{raw_url}")
        stamps.append(row[1])
        tweet_urls.append(cleaned)
        mimetypes.append(row[3])

    return archived_urls, tweet_urls, mimetypes, stamps
271
+
272
+
273
def attr(i):
    """Render the metadata line (links, mimetype, date) for capture ``i``.

    NOTE(review): reads the module-level ``tweet_links``, ``status``,
    ``link``, ``mimetype`` and ``timestamp`` set by the main script.
    """
    candidate = tweet_links[i]

    if status:
        original_tweet = pattern_tweet_id(f"https://twitter.com/{candidate}")
    elif "://" not in candidate:
        original_tweet = pattern_tweet_id(f"https://{candidate}")
    else:
        original_tweet = pattern_tweet_id(clean_tweet(candidate))

    st.markdown(
        f'{i+1 + st.session_state.offset}. [**archived url**]({link}) · [**original url**]({original_tweet}) · **MIME Type:** {mimetype[i]} · **Saved at:** {datetime.datetime.strptime(timestamp[i], "%Y%m%d%H%M%S")}'  # noqa: E501
    )
284
+
285
+
286
def display_tweet():
    """Show the embedded tweet text for capture ``i``.

    Reads the module-level ``mimetype``, ``is_RT``, ``tweet_content`` and
    ``user_info`` globals set by the main script; warns when the
    mimetype is not one the app knows how to render.
    """
    known_types = ("application/json", "text/html", "unk", "warc/revisit")

    if mimetype[i] in known_types:
        if is_RT[0] is True:
            st.info("*Retweet*")
        st.write(tweet_content[0])
        st.write(f"**{user_info[0]}**")
    else:
        st.warning("MIME Type was not parsed.")

    st.divider()
303
+
304
+
305
def display_not_tweet():
    """Render capture ``i`` when no oEmbed text is available.

    HTML-like mimetypes are shown in an iframe; ``application/json``
    captures are fetched and pretty-printed. Reads the module-level
    ``tweet_links``, ``status``, ``mimetype`` and ``link`` globals set
    by the main script.
    """
    original_link = pattern_tweet_id(clean_tweet(tweet_links[i]))

    if status:
        original_link = pattern_tweet_id(f"https://twitter.com/{tweet_links[i]}")
    elif "://" not in tweet_links[i]:
        original_link = pattern_tweet_id(f"https://{tweet_links[i]}")

    # Fix: timeout added — without one this request can hang indefinitely.
    # NOTE(review): this call is outside the try below, so connection
    # errors here still propagate — confirm that is intended.
    response_html = requests.get(original_link, timeout=30)

    if (
        mimetype[i] == "text/html"
        or mimetype[i] == "warc/revisit"
        or mimetype[i] == "unk"
    ):
        if (
            ".jpg" in tweet_links[i] or ".png" in tweet_links[i]
        ) and response_html.status_code == 200:
            # Image captures that are still reachable at the original URL.
            components.iframe(tweet_links[i], height=500, scrolling=True)
        elif "/status/" not in original_link:
            st.info("This isn't a status or is not available")
        elif status or f"{st.session_state.current_handle}" not in original_link:
            st.info(f"Replying to {st.session_state.current_handle}")
        else:
            components.iframe(clean_link(link), height=500, scrolling=True)

        st.divider()
    elif mimetype[i] == "application/json":
        try:
            # Fix: timeout added so the Timeout handler below can fire.
            response_json = requests.get(link, timeout=30)

            if response_json.status_code == 200:
                json_data = response_json.json()

                # The archived payload schema varies; prefer the nested
                # "data"/"text" fields when present.
                if "data" in json_data:
                    if "text" in json_data["data"]:
                        json_text = json_data["data"]["text"]
                    else:
                        json_text = json_data["data"]
                else:
                    if "text" in json_data:
                        json_text = json_data["text"]
                    else:
                        json_text = json_data

                st.code(json_text)
                st.json(json_data, expanded=False)

                st.divider()
            else:
                st.error(response_json.status_code)

                st.divider()
        except requests.exceptions.Timeout:
            st.error("Connection to web.archive.org timed out.")
            st.divider()
        except requests.exceptions.ConnectionError:
            st.error("Failed to establish a new connection with web.archive.org.")
            st.divider()
        except UnboundLocalError:
            st.empty()
    else:
        st.warning("MIME Type was not parsed.")
        st.divider()
369
+
370
+
371
def prev_page():
    """Page backwards and scroll the view back to the top."""
    st.session_state.offset -= tweets_per_page

    # Bump the counter so the scroll component re-renders and its JS runs.
    st.session_state.update_component += 1
    scroll_into_view()
377
+
378
+
379
def next_page():
    """Page forwards and scroll the view back to the top."""
    st.session_state.offset += tweets_per_page

    # Bump the counter so the scroll component re-renders and its JS runs.
    st.session_state.update_component += 1
    scroll_into_view()
385
+
386
+
387
# ------------------------------------------------------------------ UI --
st.title(
    "Wayback Tweets [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)",  # noqa: E501
    anchor=False,
    help="v0.4.3",
)
st.write(
    "Display multiple archived tweets on Wayback Machine and avoid opening each link manually"  # noqa: E501
)

handle = st.text_input("Username", placeholder="jack")

st.session_state.saved_at = st.slider("Tweets saved between", 2006, year, (2006, year))

not_available = st.checkbox(
    "Original URLs not available",
    help="Due to changes in X, it is possible to find available tweets if you are logged into X",  # noqa: E501
)

query = st.button("Query", type="primary", use_container_width=True)

# Reset pagination whenever the handle changes.
if handle != st.session_state.current_handle:
    st.session_state.current_handle = handle
    st.session_state.offset = 0

if query or st.session_state.count:
    # Fixed page size: the CDX API rate limit makes this non-configurable.
    tweets_per_page = 25

    st.session_state.count = tweets_count(handle, st.session_state.saved_at)

    st.caption(
        "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
    )
    st.write(f"**{st.session_state.count} URLs have been captured**")

    if st.session_state.count:
        if tweets_per_page > st.session_state.count:
            tweets_per_page = st.session_state.count

    try:
        progress = st.empty()
        links = query_api(
            handle, tweets_per_page, st.session_state.offset, st.session_state.saved_at
        )

        parse = parse_links(links)
        parsed_links = parse[0]
        tweet_links = parse[1]
        mimetype = parse[2]
        timestamp = parse[3]

        if links:
            st.divider()

            st.session_state.current_handle = handle

            return_none_count = 0

            start_index = st.session_state.offset
            end_index = min(st.session_state.count, start_index + tweets_per_page)

            with st.spinner("Fetching..."):
                for i in range(tweets_per_page):
                    try:
                        if tweet_links[i]:
                            link = parsed_links[i]
                            tweet = embed(tweet_links[i])

                            status = check_double_status(link, tweet_links[i])

                            if not not_available:
                                attr(i)

                                if tweet:
                                    status_code = tweet[0]
                                    tweet_content = tweet[1]
                                    user_info = tweet[2]
                                    is_RT = tweet[3]

                                    display_tweet()
                                elif not tweet:
                                    display_not_tweet()

                            if not_available:
                                # Only list captures whose oEmbed lookup failed.
                                if not tweet:
                                    return_none_count += 1
                                    attr(i)

                                    display_not_tweet()

                                progress.write(
                                    f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}"  # noqa: E501
                                )

                        if start_index <= 0:
                            st.session_state.prev_disabled = True
                        else:
                            st.session_state.prev_disabled = False

                        if i + 1 == st.session_state.count:
                            st.session_state.next_disabled = True
                        else:
                            st.session_state.next_disabled = False
                    except IndexError:
                        # Fewer rows than tweets_per_page: last page reached.
                        if start_index <= 0:
                            st.session_state.prev_disabled = True
                        else:
                            st.session_state.prev_disabled = False

                        st.session_state.next_disabled = True

            # Fix: renamed from ``prev``/``next`` — ``next`` shadowed the
            # builtin of the same name.
            prev_col, _, next_col = st.columns([3, 4, 3])

            prev_col.button(
                "Previous",
                disabled=st.session_state.prev_disabled,
                key="prev_button_key",
                on_click=prev_page,
                type="primary",
                use_container_width=True,
            )
            next_col.button(
                "Next",
                disabled=st.session_state.next_disabled,
                key="next_button_key",
                on_click=next_page,
                type="primary",
                use_container_width=True,
            )

        if not links:
            st.error("Unable to query the Wayback Machine API.")
    except TypeError as e:
        st.error(
            f"""
            {e}. Refresh this page and try again.
            """  # noqa: E501
        )
        st.session_state.offset = 0
legacy_app/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ requests==2.30.0
2
+ streamlit==1.27.0
waybacktweets/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # flake8: noqa: F401
2
+
3
+ from waybacktweets.api.export import TweetsExporter
4
+ from waybacktweets.api.parse import JsonParser, TweetsParser, TwitterEmbed
5
+ from waybacktweets.api.request import WaybackTweets
6
+ from waybacktweets.api.visualize import HTMLTweetsVisualizer
waybacktweets/_cli.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CLI functions for retrieving archived tweets.
3
+ """
4
+
5
+ from datetime import datetime
6
+ from typing import Any, Optional
7
+
8
+ import click
9
+ from rich import print as rprint
10
+
11
+ from waybacktweets.api.export import TweetsExporter
12
+ from waybacktweets.api.parse import TweetsParser
13
+ from waybacktweets.api.request import WaybackTweets
14
+ from waybacktweets.config.config import config
15
+
16
+
17
+ def _parse_date(
18
+ ctx: Optional[Any] = None, param: Optional[Any] = None, value: Optional[str] = None
19
+ ) -> Optional[str]:
20
+ """
21
+ Parses a date string and returns it in the format "YYYYMMDD".
22
+
23
+ Args:
24
+ ctx: Necessary when used with the click package. Defaults to None.
25
+ param: Necessary when used with the click package. Defaults to None.
26
+ value: A date string in the "YYYYMMDD" format. Defaults to None.
27
+
28
+ Returns:
29
+ The input date string formatted in the "YYYYMMDD" format, or None if no date string was provided.
30
+ """ # noqa: E501
31
+ try:
32
+ if value is None:
33
+ return None
34
+
35
+ date = datetime.strptime(value, "%Y%m%d")
36
+
37
+ return date.strftime("%Y%m%d")
38
+ except ValueError:
39
+ raise click.BadParameter("Date must be in format YYYYmmdd")
40
+
41
+
42
@click.command()
@click.argument("username", type=str)
@click.option(
    "-c",
    "--collapse",
    type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False),
    default=None,
    help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.",  # noqa: E501
)
@click.option(
    "-f",
    "--from",
    "timestamp_from",
    type=click.UNPROCESSED,
    metavar="DATE",
    callback=_parse_date,
    default=None,
    help="Filtering by date range from this date. Format: YYYYmmdd",
)
@click.option(
    "-t",
    "--to",
    "timestamp_to",
    type=click.UNPROCESSED,
    metavar="DATE",
    callback=_parse_date,
    default=None,
    help="Filtering by date range up to this date. Format: YYYYmmdd",
)
@click.option(
    "-l",
    "--limit",
    type=int,
    metavar="INTEGER",
    default=None,
    help="Query result limits.",
)
@click.option(
    "-o",
    "--offset",
    type=int,
    metavar="INTEGER",
    default=None,
    help="Allows for a simple way to scroll through the results.",
)
@click.option(
    "-mt",
    "--matchtype",
    type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False),
    default=None,
    help="Results matching a certain prefix, a certain host or all subdomains.",  # noqa: E501
)
@click.option(
    "-v",
    "--verbose",
    "verbose",
    is_flag=True,
    default=False,
    help="Shows the error log.",
)
def main(
    username: str,
    collapse: Optional[str],
    timestamp_from: Optional[str],
    timestamp_to: Optional[str],
    limit: Optional[int],
    offset: Optional[int],
    matchtype: Optional[str],
    verbose: Optional[bool],
) -> None:
    """
    Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data.

    USERNAME: The Twitter username without @.
    """  # noqa: E501
    try:
        # Propagate the CLI flag to the global config before any request is
        # made, so every module's error reporting honors it.
        config.verbose = verbose

        api = WaybackTweets(
            username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
        )

        print(f"Waybacking @{username}'s archived tweets...")
        archived_tweets = api.get()

        # api.get() returns None on failure; skip parsing/export in that case.
        if archived_tweets:
            # Full set of fields to parse and export (must be a subset of
            # FIELD_OPTIONS — TweetsParser validates this).
            field_options = [
                "archived_urlkey",
                "archived_timestamp",
                "parsed_archived_timestamp",
                "archived_tweet_url",
                "parsed_archived_tweet_url",
                "original_tweet_url",
                "parsed_tweet_url",
                "available_tweet_text",
                "available_tweet_is_RT",
                "available_tweet_info",
                "archived_mimetype",
                "archived_statuscode",
                "archived_digest",
                "archived_length",
            ]

            parser = TweetsParser(archived_tweets, username, field_options)
            parsed_tweets = parser.parse(print_progress=True)

            exporter = TweetsExporter(parsed_tweets, username, field_options)

            # Exports to all three formats; save_to_html reuses the JSON file.
            exporter.save_to_csv()
            exporter.save_to_json()
            exporter.save_to_html()
    except Exception as e:
        # Top-level CLI boundary: report the error instead of a traceback.
        rprint(f"[red]{e}")
    finally:
        rprint(
            "[yellow]\nNeed help? Read the docs: https://claromes.github.io/waybacktweets"  # noqa: E501
        )
waybacktweets/api/__init__.py ADDED
File without changes
waybacktweets/api/export.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Exports the parsed archived tweets.
3
+ """
4
+
5
+ import datetime
6
+ import os
7
+ import re
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ import pandas as pd
11
+
12
+ from waybacktweets.api.visualize import HTMLTweetsVisualizer
13
+
14
+
15
class TweetsExporter:
    """
    Class responsible for exporting parsed archived tweets.

    Args:
        data (Dict[str, List[Any]]): The parsed archived tweets data.
        username (str): The username associated with the tweets.
        field_options (List[str]): The fields to be included in the exported data. For more details on each option, visit :ref:`field_options`.
    """  # noqa: E501

    def __init__(
        self, data: Dict[str, List[Any]], username: str, field_options: List[str]
    ):
        self.data = data
        self.username = username
        self.field_options = field_options
        self.formatted_datetime = self._datetime_now()
        # Timestamped base name shared by the CSV, JSON and HTML outputs,
        # e.g. "jack_tweets_20240101120000".
        self.filename = f"{self.username}_tweets_{self.formatted_datetime}"
        self.dataframe = self._create_dataframe()

    @staticmethod
    def _datetime_now() -> str:
        """
        Returns the current datetime, formatted as a string.

        Returns:
            The current datetime in the "YYYYmmddHHMMSS" format.
        """
        now = datetime.datetime.now()
        formatted_now = now.strftime("%Y%m%d%H%M%S")
        # Defensive: strip any non-alphanumeric character so the value is
        # always safe to embed in a filename.
        formatted_now = re.sub(r"\W+", "", formatted_now)

        return formatted_now

    @staticmethod
    def _transpose_matrix(
        data: Dict[str, List[Any]], fill_value: Optional[Any] = None
    ) -> List[List[Any]]:
        """
        Transposes a matrix, filling in missing values with a specified fill value if needed.

        Args:
            data (Dict[str, List[Any]]): The matrix to be transposed, as a mapping of column name to column values.
            fill_value (Optional[Any]): The value to fill in missing values with.

        Returns:
            The transposed matrix (one row per index). An empty input yields an empty list.
        """  # noqa: E501
        # default=0 keeps an empty parse result from raising ValueError on
        # max() over an empty sequence.
        max_length = max((len(sublist) for sublist in data.values()), default=0)

        # Pad shorter columns so every row of the transposed matrix is complete.
        filled_data = {
            key: value + [fill_value] * (max_length - len(value))
            for key, value in data.items()
        }

        data_transposed = [list(row) for row in zip(*filled_data.values())]

        return data_transposed

    def _create_dataframe(self) -> pd.DataFrame:
        """
        Creates a DataFrame from the transposed data.

        Returns:
            The DataFrame representation of the data, one column per field option.
        """
        data_transposed = self._transpose_matrix(self.data)

        df = pd.DataFrame(data_transposed, columns=self.field_options)

        return df

    def save_to_csv(self) -> None:
        """
        Saves the DataFrame to a CSV file.
        """
        csv_file_path = f"{self.filename}.csv"
        self.dataframe.to_csv(csv_file_path, index=False)

        print(f"Saved to {csv_file_path}")

    def save_to_json(self) -> None:
        """
        Saves the DataFrame to a JSON file (a single array of record objects).
        """
        json_path = f"{self.filename}.json"
        self.dataframe.to_json(json_path, orient="records", lines=False)

        print(f"Saved to {json_path}")

    def save_to_html(self) -> None:
        """
        Saves the DataFrame to an HTML file.

        Generates the JSON export first if it does not exist yet, since the
        HTML visualizer is fed from the JSON file.
        """
        json_path = f"{self.filename}.json"

        if not os.path.exists(json_path):
            self.save_to_json()

        html_file_path = f"{self.filename}.html"

        html = HTMLTweetsVisualizer(self.username, json_path, html_file_path)

        html_content = html.generate()
        html.save(html_content)

        print(f"Saved to {html_file_path}")
waybacktweets/api/parse.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Parses the returned data from the Wayback CDX Server API.
3
+ """
4
+
5
+ import re
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from contextlib import nullcontext
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+ from urllib.parse import unquote
10
+
11
+ from rich import print as rprint
12
+ from rich.progress import Progress
13
+
14
+ from waybacktweets.config.config import config
15
+ from waybacktweets.config.field_options import FIELD_OPTIONS
16
+ from waybacktweets.exceptions.exceptions import (
17
+ ConnectionError,
18
+ GetResponseError,
19
+ HTTPError,
20
+ )
21
+ from waybacktweets.utils.utils import (
22
+ check_double_status,
23
+ check_pattern_tweet,
24
+ check_url_scheme,
25
+ clean_tweet_url,
26
+ delete_tweet_pathnames,
27
+ get_response,
28
+ is_tweet_url,
29
+ semicolon_parser,
30
+ timestamp_parser,
31
+ )
32
+
33
+
34
class TwitterEmbed:
    """
    This class is responsible for parsing tweets using the Twitter Publish service.

    Args:
        tweet_url (str): The URL of the tweet to be parsed.
    """

    def __init__(self, tweet_url: str):
        # Original (twitter.com) tweet URL, not the Wayback Machine snapshot.
        self.tweet_url = tweet_url

    def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]:
        """
        Parses the archived tweets when they are still available.

        This function goes through each archived tweet and checks if it is still available. If the tweet is available, it extracts the necessary information and adds it to the respective lists. The function returns a tuple of three lists:

        - The first list contains the tweet texts.
        - The second list contains boolean values indicating whether each tweet is a retweet.
        - The third list contains the author information of the tweets.

        Returns:
            A tuple of three lists containing the tweet texts, retweet flags, and author info, respectively. If no tweets are available or the request fails, returns None.
        """  # noqa: E501
        try:
            # publish.twitter.com/oembed only answers for tweets that still
            # exist; deleted tweets surface as HTTPError below.
            url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
            response = get_response(url=url)
            if response:
                json_response = response.json()
                html = json_response["html"]
                author_name = json_response["author_name"]

                # Captures the tweet body (<p>...</p>) and the trailing
                # "&mdash; author" attribution from the oEmbed blockquote.
                regex = re.compile(
                    r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>',  # noqa
                    re.DOTALL,
                )
                # The author display name is everything before "(@handle)".
                regex_author = re.compile(r"^(.*?)\s*\(")

                matches_html = regex.findall(html)

                tweet_content = []
                user_info = []
                is_RT = []

                for match in matches_html:
                    # Strip anchor tags; normalize <br> to newlines.
                    tweet_content_match = re.sub(
                        r"<a[^>]*>|<\/a>", "", match[0].strip()
                    ).replace("<br>", "\n")
                    user_info_match = re.sub(
                        r"<a[^>]*>|<\/a>", "", match[1].strip()
                    ).replace(")", "), ")
                    match_author = regex_author.search(user_info_match)
                    author_tweet = match_author.group(1) if match_author else ""

                    if tweet_content_match:
                        tweet_content.append(tweet_content_match)
                    if user_info_match:
                        user_info.append(user_info_match)
                        # A differing author means the URL's owner retweeted
                        # someone else's tweet.
                        is_RT.append(author_name != author_tweet)

                return tweet_content, is_RT, user_info
        except ConnectionError:
            if config.verbose:
                rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
        except HTTPError:
            if config.verbose:
                rprint(
                    f"[yellow]{self.tweet_url} not available on the user's Twitter account, but the CDX data was saved."  # noqa: E501
                )
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {str(e)}")

        return None
108
+
109
+
110
class JsonParser:
    """
    This class is responsible for parsing tweets when the mimetype is application/json.

    Note: This class is in an experimental phase.

    Args:
        archived_tweet_url (str): The URL of the archived tweet to be parsed.
    """  # noqa: E501

    def __init__(self, archived_tweet_url: str):
        self.archived_tweet_url = archived_tweet_url

    def parse(self) -> Optional[Any]:
        """
        Parses the archived tweets in JSON format.

        Returns:
            The parsed tweet text when the payload carries a "text" field;
            otherwise the raw JSON payload. Returns None when the request
            fails or yields no response.
        """
        try:
            response = get_response(url=self.archived_tweet_url)

            if response:
                json_data = response.json()

                # Newer API payloads nest the tweet under "data".
                if "data" in json_data:
                    return json_data["data"].get("text", json_data["data"])

                # Retweets carry the original text under "retweeted_status".
                if "retweeted_status" in json_data:
                    return json_data["retweeted_status"].get(
                        "text", json_data["retweeted_status"]
                    )

                return json_data.get("text", json_data)
        except ConnectionError:
            if config.verbose:
                rprint(
                    f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
                )
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {str(e)}")

        return None
155
+
156
+
157
class TweetsParser:
    """
    This class is responsible for the overall parsing of archived tweets.

    Args:
        archived_tweets_response (List[str]): The response from the archived tweets.
        username (str): The username associated with the tweets.
        field_options (List[str]): The fields to be included in the parsed data. For more details on each option, visit :ref:`field_options`.
    """  # noqa: E501

    def __init__(
        self,
        archived_tweets_response: List[str],
        username: str,
        field_options: List[str],
    ):
        # Fail fast on typos: every requested field must be a known option.
        if not all(option in FIELD_OPTIONS for option in field_options):
            raise ValueError("Some field options are not valid.")

        self.archived_tweets_response = archived_tweets_response
        self.username = username
        self.field_options = field_options
        # One list per requested field; _process_response appends one value
        # per field for each CDX row.
        self.parsed_tweets = {option: [] for option in self.field_options}

    def _add_field(self, key: str, value: Any) -> None:
        """
        Appends a value to a list in the parsed data structure.

        Args:
            key (str): The key in the parsed data structure.
            value (Any): The value to be appended.
        """
        # Fields the caller did not request are silently skipped.
        if key in self.parsed_tweets:
            self.parsed_tweets[key].append(value)

    def _process_response(self, response: List[str]) -> None:
        """
        Processes the archived tweet's response and adds the relevant CDX data.

        Args:
            response (List[str]): The response from the archived tweet — a CDX
                row whose indices are used as: [0] urlkey, [1] timestamp,
                [2] original URL, [3] mimetype, [4] statuscode, [5] digest,
                [6] length.
        """
        # Percent-decode the original URL and drop right single quotes that
        # break downstream URL handling.
        tweet_remove_char = unquote(response[2]).replace("’", "")
        cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')

        wayback_machine_url = (
            f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}"
        )
        original_tweet = delete_tweet_pathnames(
            clean_tweet_url(cleaned_tweet, self.username)
        )

        double_status = check_double_status(wayback_machine_url, original_tweet)

        if double_status:
            # The archived URL embedded a second "/status/" path — rebuild an
            # absolute twitter.com URL.
            original_tweet = delete_tweet_pathnames(
                f"https://twitter.com{original_tweet}"
            )
        elif "://" not in original_tweet:
            # Schemeless URL — assume https.
            original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")

        parsed_wayback_machine_url = (
            f"https://web.archive.org/web/{response[1]}/{original_tweet}"
        )

        # Encode semicolons and normalize URL schemes for all four variants.
        encoded_archived_tweet = check_url_scheme(semicolon_parser(wayback_machine_url))
        encoded_parsed_archived_tweet = check_url_scheme(
            semicolon_parser(parsed_wayback_machine_url)
        )
        encoded_tweet = check_url_scheme(semicolon_parser(response[2]))
        encoded_parsed_tweet = check_url_scheme(semicolon_parser(original_tweet))

        # Defaults used when the tweet is no longer live on Twitter.
        available_tweet_text = None
        available_tweet_is_RT = None
        available_tweet_info = None

        is_tweet = is_tweet_url(encoded_tweet)

        if is_tweet:
            # Only attempt the oEmbed lookup for URLs that look like tweets.
            embed_parser = TwitterEmbed(encoded_tweet)
            content = embed_parser.embed()

            if content:
                available_tweet_text = semicolon_parser(content[0][0])
                available_tweet_is_RT = content[1][0]
                available_tweet_info = semicolon_parser(content[2][0])

        self._add_field("available_tweet_text", available_tweet_text)
        self._add_field("available_tweet_is_RT", available_tweet_is_RT)
        self._add_field("available_tweet_info", available_tweet_info)

        self._add_field("archived_urlkey", response[0])
        self._add_field("archived_timestamp", response[1])
        self._add_field("parsed_archived_timestamp", timestamp_parser(response[1]))
        self._add_field("archived_tweet_url", encoded_archived_tweet)
        self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
        self._add_field("original_tweet_url", encoded_tweet)
        self._add_field("parsed_tweet_url", encoded_parsed_tweet)
        self._add_field("archived_mimetype", response[3])
        self._add_field("archived_statuscode", response[4])
        self._add_field("archived_digest", response[5])
        self._add_field("archived_length", response[6])

    def parse(self, print_progress: bool = False) -> Dict[str, List[Any]]:
        """
        Parses the archived tweets CDX data and structures it.

        Args:
            print_progress (bool): A boolean indicating whether to print progress or not.

        Returns:
            The parsed tweets data.
        """  # noqa: E501
        with ThreadPoolExecutor(max_workers=10) as executor:

            # Skips the first CDX row, which is the column header, not data.
            futures = {
                executor.submit(self._process_response, response): response
                for response in self.archived_tweets_response[1:]
            }

            # nullcontext keeps the with-block uniform when progress output
            # is disabled.
            progress_context = Progress() if print_progress else nullcontext()
            with progress_context as progress:
                task = None
                if print_progress:
                    task = progress.add_task(
                        f"Parsing @{self.username}'s archived tweets\n",
                        total=len(futures),
                    )

                for future in as_completed(futures):
                    try:
                        future.result()
                    except Exception as e:
                        # A single bad row should not abort the whole parse.
                        rprint(f"[red]{e}")

                    if print_progress:
                        progress.update(task, advance=1)

        # NOTE(review): rows complete in as_completed order, so field lists
        # are not guaranteed to be in CDX order — confirm this is acceptable.
        return self.parsed_tweets
waybacktweets/api/request.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Requests data from the Wayback Machine API.
3
+ """
4
+
5
+ from typing import Any, Dict, Optional
6
+
7
+ from rich import print as rprint
8
+
9
+ from waybacktweets.config.config import config
10
+ from waybacktweets.exceptions.exceptions import (
11
+ ConnectionError,
12
+ EmptyResponseError,
13
+ GetResponseError,
14
+ HTTPError,
15
+ ReadTimeoutError,
16
+ )
17
+ from waybacktweets.utils.utils import get_response
18
+
19
+
20
class WaybackTweets:
    """
    Class responsible for requesting data from the Wayback CDX Server API.

    Args:
        username (str): The username associated with the tweets.
        collapse (str, optional): The field to collapse duplicate lines on.
        timestamp_from (str, optional): The timestamp to start retrieving tweets from.
        timestamp_to (str, optional): The timestamp to stop retrieving tweets at.
        limit (int, optional): The maximum number of results to return.
        offset (int, optional): The number of lines to skip in the results.
        matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains.
    """  # noqa: E501

    def __init__(
        self,
        username: str,
        collapse: str = None,
        timestamp_from: str = None,
        timestamp_to: str = None,
        limit: int = None,
        offset: int = None,
        matchtype: str = None,
    ):
        self.username = username
        self.collapse = collapse
        self.timestamp_from = timestamp_from
        self.timestamp_to = timestamp_to
        self.limit = limit
        self.offset = offset
        self.matchtype = matchtype

    def get(self) -> Optional[Dict[str, Any]]:
        """
        Sends a GET request to the Internet Archive's CDX API to retrieve archived tweets.

        Returns:
            The response from the CDX API in JSON format, if successful. Otherwise, None.
        """  # noqa: E501
        url = "https://web.archive.org/cdx/search/cdx"

        # With a matchType the CDX server handles expansion itself; otherwise
        # use a wildcard to match every status pathname.
        wildcard_pathname = "/*"
        if self.matchtype:
            wildcard_pathname = ""

        params = {
            "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
            "output": "json",
        }

        # Only forward the filters the caller actually set.
        if self.collapse:
            params["collapse"] = self.collapse

        if self.timestamp_from:
            params["from"] = self.timestamp_from

        if self.timestamp_to:
            params["to"] = self.timestamp_to

        if self.limit:
            params["limit"] = self.limit

        if self.offset:
            params["offset"] = self.offset

        if self.matchtype:
            params["matchType"] = self.matchtype

        try:
            response = get_response(url=url, params=params)
            # Guard against a falsy response (consistent with TwitterEmbed and
            # JsonParser); calling .json() on None would raise an
            # AttributeError that none of the handlers below catch.
            if response:
                return response.json()
        except ReadTimeoutError:
            if config.verbose:
                rprint("[red]Connection to web.archive.org timed out.")
        except ConnectionError:
            if config.verbose:
                rprint(
                    "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
                )
        except HTTPError as e:
            if config.verbose:
                rprint(f"[red]HTTP error occurred: {str(e)}")
        except EmptyResponseError:
            if config.verbose:
                rprint("[red]No data was saved due to an empty response.")
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {str(e)}")

        return None
waybacktweets/api/visualize.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: E501
2
+ """
3
+ Generates an HTML file to visualize the parsed data.
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from typing import Any, Dict, List, Union
9
+
10
+ from waybacktweets.utils import timestamp_parser
11
+
12
+
13
class HTMLTweetsVisualizer:
    """
    Class responsible for generating an HTML file to visualize the parsed data.

    Args:
        username (str): The username associated with the tweets.
        json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
        html_file_path (str, optional): The path where the HTML file will be saved.
    """

    def __init__(
        self,
        username: str,
        json_path: Union[str, List[str]],
        html_file_path: str = None,
    ):
        self.username = username
        # Despite the name, this holds the LOADED records, not the path.
        self.json_path = self._json_loader(json_path)
        self.html_file_path = html_file_path

    @staticmethod
    def _json_loader(json_path: Union[str, List[str]]) -> List[Dict[str, Any]]:
        """
        Reads and loads JSON data from a specified file path or JSON string.

        Args:
            json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.

        Returns:
            The content of the JSON file or data.
        """
        # NOTE(review): when json_path is a list (per the type hint),
        # os.path.isfile raises TypeError — confirm callers always pass str.
        if os.path.isfile(json_path):
            with open(json_path, "r", encoding="utf-8") as f:
                return json.load(f)

        # Not a file on disk — treat the argument as a raw JSON string.
        return json.loads(json_path)

    def generate(self) -> str:
        """
        Generates an HTML string that represents the parsed data.

        The document is a self-contained, paginated gallery: each tweet record
        becomes a card, with accordion-loaded iframes for records whose live
        text is unavailable, and JavaScript-driven page navigation.

        Returns:
            The generated HTML string.
        """
        tweets_per_page = 24
        # Ceiling division: last partial page still gets its own page.
        total_pages = (len(self.json_path) + tweets_per_page - 1) // tweets_per_page

        html = "<!DOCTYPE html>\n"
        html += '<html lang="en">\n'
        html += "<!-- This document was generated by Wayback Tweets. Visit: https://claromes.github.io/waybacktweets -->\n"

        html += "<head>"
        html += '<meta charset="UTF-8">\n'
        html += (
            '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n'
        )
        html += f"<title>@{self.username}'s archived tweets</title>\n"

        # Adds styling
        html += "<style>\n"
        html += "body { font-family: monospace; background-color: whitesmoke; color: #1c1e21; margin: 0; padding: 20px; }\n"
        html += ".container { display: flex; flex-wrap: wrap; gap: 20px; }\n"
        html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #ffffff; border: 1px solid #e2e2e2; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; width: 600px; }\n"
        html += ".tweet strong { font-weight: bold; }\n"
        html += ".tweet a { color: #000000; text-decoration: none; }\n"
        html += ".content { color: #000000; }\n"
        html += ".source { font-size: 12px; text-align: center; }\n"
        html += ".tweet a:hover { text-decoration: underline; }\n"
        html += "h1, h3 { text-align: center; }\n"
        html += "iframe { width: 600px; height: 600px; }\n"
        html += "input { position: absolute; opacity: 0; z-index: -1; }\n"
        html += ".accordion { margin: 10px; border-radius: 5px; overflow: hidden; box-shadow: 0 4px 4px -2px rgba(0, 0, 0, 0.4); }\n"
        html += ".accordion-label { display: flex; justify-content: space-between; padding: 1em; font-weight: bold; cursor: pointer; background: #000000; color: #ffffff; }\n"
        html += ".accordion-content { max-height: 0; padding: 0 1em; background: white; transition: all 0.35s; }\n"
        html += (
            "input:checked ~ .accordion-content { max-height: 100vh; padding: 1em; }\n"
        )
        html += ".pagination { text-align: center; margin-top: 20px; }\n"
        html += ".pagination a { margin: 0 5px; text-decoration: none; color: #000000; padding: 1px 2px; border-radius: 5px; }\n"
        html += ".pagination a:hover { background-color: #e2e2e2; }\n"
        html += ".pagination a.selected { background-color: #e2e2e2; color: #000000; font-weight: bold; }\n"
        html += "</style>\n"

        html += "</head>\n<body>\n"

        html += f"<h1>@{self.username}'s archived tweets</h1>\n"

        html += (
            '<p id="loading_first_page">Building pagination with JavaScript...</p>\n'
        )

        for page in range(1, total_pages + 1):
            html += (
                f'<div id="page_{page}" style="display:none;">\n'  # Starts a new page
            )
            html += '<div class="container">\n'

            start_index = (page - 1) * tweets_per_page
            end_index = min(start_index + tweets_per_page, len(self.json_path))

            for index in range(start_index, end_index):
                tweet = self.json_path[index]
                html += '<div class="tweet">\n'

                # No live text available: offer the four URL variants as
                # lazily-loaded iframes behind accordions.
                if not tweet["available_tweet_text"]:
                    iframe_src = {
                        "Archived Tweet": tweet["archived_tweet_url"],
                        "Parsed Archived Tweet": tweet["parsed_archived_tweet_url"],
                        "Original Tweet": tweet["original_tweet_url"],
                        "Parsed Tweet": tweet["parsed_tweet_url"],
                    }

                    for key, value in iframe_src.items():
                        # HTML ids cannot contain spaces.
                        key_cleaned = key.replace(" ", "_")

                        html += '<div class="accordion">\n'
                        html += f'<input type="checkbox" id="tab_{index}_{key_cleaned}" />\n'
                        html += f'<label class="accordion-label" for="tab_{index}_{key_cleaned}">Click to load the iframe from {key}</label>\n'
                        html += '<div class="accordion-content">\n'

                        html += f'<div id="loading_{index}_{key_cleaned}" class="loading">Loading...</div>\n'
                        html += f'<iframe id="iframe_{index}_{key_cleaned}" frameborder="0" scrolling="auto" loading="lazy" style="display: none;" onload="document.getElementById(\'loading_{index}_{key_cleaned}\').style.display=\'none\'; this.style.display=\'block\';"></iframe>\n'
                        html += "</div>\n"
                        html += "</div>\n"

                        html += """
                        <script>
                            // Loads the src attribute of the iframe tag
                            document.getElementById('tab_{index}_{key_cleaned}').addEventListener('change', function() {{
                                if (this.checked) {{
                                    document.getElementById('loading_{index}_{key_cleaned}').style.display = 'block';
                                    document.getElementById('iframe_{index}_{key_cleaned}').src = '{url}';
                                }}
                            }});
                        </script>
                        """.format(
                            index=index, url=value, key_cleaned=key_cleaned
                        )

                if tweet["available_tweet_text"]:
                    html += "<br>\n"
                    html += f'<p><strong class="content">Available Tweet Content:</strong> {tweet["available_tweet_text"]}</p>\n'
                    html += f'<p><strong class="content">Available Tweet Is Retweet:</strong> {tweet["available_tweet_is_RT"]}</p>\n'
                    html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'

                # Metadata shown for every record regardless of availability.
                html += "<br>\n"
                html += f'<p><strong>Archived Tweet:</strong> <a href="{tweet["archived_tweet_url"]}" target="_blank">{tweet["archived_tweet_url"]}</a></p>\n'
                html += f'<p><strong>Parsed Archived Tweet:</strong> <a href="{tweet["parsed_archived_tweet_url"]}" target="_blank">{tweet["parsed_archived_tweet_url"]}</a></p>\n'
                html += f'<p><strong>Original Tweet:</strong> <a href="{tweet["original_tweet_url"]}" target="_blank">{tweet["original_tweet_url"]}</a></p>\n'
                html += f'<p><strong>Parsed Tweet:</strong> <a href="{tweet["parsed_tweet_url"]}" target="_blank">{tweet["parsed_tweet_url"]}</a></p>\n'
                html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
                html += f'<p><strong>Archived Timestamp:</strong> {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})</p>\n'
                html += f'<p><strong>Archived mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'
                html += f'<p><strong>Archived Statuscode:</strong> {tweet["archived_statuscode"]}</p>\n'
                html += (
                    f'<p><strong>Archived Digest:</strong> {tweet["archived_digest"]}\n'
                )
                html += f'<p><strong>Archived Length:</strong> {tweet["archived_length"]}</p>\n'
                html += "</div>\n"

            html += "</div>\n</div>\n"  # Closes the page div and the container

        html += "<br>\n"

        # Adds navigation for the pages
        html += '<div class="pagination">\n'
        for page in range(1, total_pages + 1):
            html += f'<a href="#" id="page_link_{page}" onclick="showPage({page})">{page}</a>\n'
        html += "</div>\n"

        html += '<br><p class="source">generated by <a href="https://claromes.github.io/waybacktweets/" target="_blank">Wayback Tweets↗</a></p>\n'

        html += """
        <script>
            // Function to show the selected page and hide the others
            function showPage(page) {{
                for (let i = 1; i <= {total_pages}; i++) {{
                    document.getElementById('page_' + i).style.display = 'none';
                    document.getElementById('page_link_' + i).classList.remove('selected');
                }}

                document.getElementById('page_' + page).style.display = 'block';
                document.getElementById('page_link_' + page).classList.add('selected');
            }}

            // Initializes the page to show only the first page
            document.addEventListener('DOMContentLoaded', (event) => {{
                showPage(1); // Shows only the first page on load
                document.getElementById('loading_first_page').style.display = 'none';
            }});
        </script>
        """.format(
            total_pages=total_pages
        )

        html += "</body>\n"
        html += "</html>"

        return html

    def save(self, html_content: str) -> None:
        """
        Saves the generated HTML string to a file.

        Args:
            html_content (str): The HTML string to be saved.
        """
        with open(self.html_file_path, "w", encoding="utf-8") as f:
            f.write(html_content)
waybacktweets/config/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # flake8: noqa: F401
2
+
3
+ from waybacktweets.config.config import config
4
+ from waybacktweets.config.field_options import FIELD_OPTIONS
waybacktweets/config/config.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration module.
3
+
4
+ Manages global configuration settings throughout the application.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+
9
+
10
@dataclass
class _Config:
    """
    A class used to represent the configuration settings.

    Private: consumers should use the module-level ``config`` instance
    rather than instantiating this class directly.

    Attributes:
        verbose (bool): Determines if verbose logging should be enabled.
    """

    # Verbose by default; entry points (CLI/app) may flip this at startup.
    verbose: bool = True


config = _Config()
"""
Global configuration instance.

Shared, mutable process-wide state: changing ``config.verbose`` affects
every module that reads it.

Attributes:
    verbose (bool): Determines if verbose logging should be enabled.
"""
waybacktweets/config/field_options.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ List of valid field options that can be used for parsing tweets.
3
+ """
4
+
5
+ FIELD_OPTIONS = [
6
+ "archived_urlkey",
7
+ "archived_timestamp",
8
+ "parsed_archived_timestamp",
9
+ "archived_tweet_url",
10
+ "parsed_archived_tweet_url",
11
+ "original_tweet_url",
12
+ "parsed_tweet_url",
13
+ "available_tweet_text",
14
+ "available_tweet_is_RT",
15
+ "available_tweet_info",
16
+ "archived_mimetype",
17
+ "archived_statuscode",
18
+ "archived_digest",
19
+ "archived_length",
20
+ ]
waybacktweets/exceptions/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: F401
2
+
3
+ from waybacktweets.exceptions.exceptions import (
4
+ ConnectionError,
5
+ EmptyResponseError,
6
+ GetResponseError,
7
+ HTTPError,
8
+ ReadTimeoutError,
9
+ )
waybacktweets/exceptions/exceptions.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Wayback Tweets Exceptions
3
+ """
4
+
5
+
6
class GetResponseError(Exception):
    """
    Base class for exceptions in get_response.

    Catching this type also catches every more specific error below.
    """
10
+
11
+
12
class ReadTimeoutError(GetResponseError):
    """
    Exception raised for read timeout errors.
    """
16
+
17
+
18
class ConnectionError(GetResponseError):
    """
    Exception raised for connection errors.

    NOTE(review): this name shadows the builtin ``ConnectionError`` in any
    module that imports it unqualified — import it explicitly from this
    package to avoid ambiguity.
    """
22
+
23
+
24
class HTTPError(GetResponseError):
    """
    Exception raised for HTTP errors.

    NOTE(review): this name collides with ``requests.exceptions.HTTPError``
    and ``urllib.error.HTTPError`` — import it explicitly from this package
    to avoid ambiguity.
    """
28
+
29
+
30
class EmptyResponseError(GetResponseError):
    """
    Exception raised for empty responses.
    """
waybacktweets/utils/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # flake8: noqa: F401
2
+
3
+ from waybacktweets.utils.utils import (
4
+ check_double_status,
5
+ check_pattern_tweet,
6
+ check_url_scheme,
7
+ clean_tweet_url,
8
+ clean_wayback_machine_url,
9
+ delete_tweet_pathnames,
10
+ get_response,
11
+ is_tweet_url,
12
+ semicolon_parser,
13
+ timestamp_parser,
14
+ )
waybacktweets/utils/utils.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for handling HTTP requests and manipulating URLs.
3
+ """
4
+
5
+ import html
6
+ import re
7
+ from datetime import datetime
8
+ from typing import Optional, Tuple
9
+
10
+ import requests
11
+ from requests.adapters import HTTPAdapter
12
+ from urllib3.util.retry import Retry
13
+
14
+ from waybacktweets.exceptions.exceptions import (
15
+ ConnectionError,
16
+ EmptyResponseError,
17
+ GetResponseError,
18
+ HTTPError,
19
+ ReadTimeoutError,
20
+ )
21
+
22
+
23
def get_response(
    url: str,
    params: Optional[dict] = None,
    timeout: float = 30.0,
) -> requests.Response:
    """
    Sends a GET request to the specified URL and returns the response.

    Args:
        url (str): The URL to send the GET request to.
        params (dict, optional): The parameters to include in the GET request.
        timeout (float, optional): Seconds to wait for the server before
            aborting the request. Defaults to 30.

    Returns:
        requests.Response: The response from the server. (The previous
        annotation declared a 3-tuple, but a bare Response has always been
        returned.)

    Raises:
        ReadTimeoutError: If a read timeout occurs.
        ConnectionError: If a connection error occurs.
        HTTPError: If an HTTP error occurs.
        EmptyResponseError: If the response is empty.
        GetResponseError: For any other request failure.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"  # noqa: E501
    }
    retry = Retry(connect=3, backoff_factor=0.3)
    adapter = HTTPAdapter(max_retries=retry)

    # Use the session as a context manager so the connection pool is always
    # released (the previous version leaked it).
    with requests.Session() as session:
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        try:
            # An explicit timeout prevents the call from hanging forever on
            # an unresponsive server.
            response = session.get(
                url, params=params, headers=headers, timeout=timeout
            )
            response.raise_for_status()

            # Only JSON bodies can signal "empty" with []; a non-JSON body
            # (e.g. HTML) must not be misreported as a request failure.
            try:
                is_empty = response.json() == []
            except ValueError:
                is_empty = False

            if is_empty:
                raise EmptyResponseError(
                    "No data was saved due to an empty response."
                )

            return response
        except requests.exceptions.ReadTimeout as err:
            raise ReadTimeoutError from err
        except requests.exceptions.ConnectionError as err:
            raise ConnectionError from err
        except requests.exceptions.HTTPError as err:
            raise HTTPError from err
        except requests.exceptions.RequestException as err:
            raise GetResponseError from err
67
+
68
+
69
def clean_tweet_url(tweet_url: str, username: str) -> str:
    """
    Cleans a tweet URL by ensuring it is associated with the correct username.

    Args:
        tweet_url (str): The tweet URL to clean.
        username (str): The username to associate with the tweet URL.

    Returns:
        The cleaned tweet URL, or the original URL when it does not contain
        a ``/status/<id>`` segment for that username.
    """
    tweet_lower = tweet_url.lower()

    pattern = re.compile(r"/status/(\d+)")
    match_lower_case = pattern.search(tweet_lower)
    # Search the original casing too: digits are case-invariant, so both
    # searches succeed or fail together.
    match_original_case = pattern.search(tweet_url)

    # Fix: compare the username case-insensitively — tweet_lower is already
    # lowercased, so a mixed-case username could never match before.
    if match_lower_case and username.lower() in tweet_lower:
        return f"https://twitter.com/{username}/status/{match_original_case.group(1)}"
    else:
        return tweet_url
90
+
91
+
92
def clean_wayback_machine_url(
    wayback_machine_url: str, archived_timestamp: str, username: str
) -> str:
    """
    Cleans a Wayback Machine URL by ensuring it is associated with the correct username and timestamp.

    Args:
        wayback_machine_url (str): The Wayback Machine URL to clean.
        archived_timestamp (str): The timestamp to associate with the Wayback Machine URL.
        username (str): The username to associate with the Wayback Machine URL.

    Returns:
        The cleaned Wayback Machine URL. When no ``/status/<id>`` segment for
        the username is found, the (lowercased) input URL is returned instead.
    """  # noqa: E501
    wayback_machine_url = wayback_machine_url.lower()

    pattern = re.compile(r"/status/(\d+)")
    match = pattern.search(wayback_machine_url)

    # Fix: compare the username case-insensitively — the URL was just
    # lowercased, so a mixed-case username could never match before.
    if match and username.lower() in wayback_machine_url:
        return f"https://web.archive.org/web/{archived_timestamp}/https://twitter.com/{username}/status/{match.group(1)}"  # noqa: E501
    else:
        return wayback_machine_url
115
+
116
+
117
def check_pattern_tweet(tweet_url: str) -> str:
    """
    Extracts the URL from a tweet URL with patterns such as:

    - Reply: /status//
    - Link: /status///
    - Twimg: /status/https://pbs

    Args:
        tweet_url (str): The tweet URL to extract the URL from.

    Returns:
        Only the extracted URL from a tweet, or the input unchanged when no
        embedded-URL pattern is present.
    """
    pattern = r'/status/((?:"(.*?)"|&quot;(.*?)(?=&|$)|&quot%3B(.*?)(?=&|$)))'
    match = re.search(pattern, tweet_url)

    if not match:
        return tweet_url

    # Groups 2-4 are the alternatives of the pattern; exactly one of them
    # (at most) captured the embedded URL. Fall back to "" when the capture
    # is empty.
    embedded_url = next((group for group in match.groups()[1:] if group), "")

    return html.unescape(embedded_url)
149
+
150
+
151
def delete_tweet_pathnames(tweet_url: str) -> str:
    """
    Removes any pathnames from a tweet URL.

    Args:
        tweet_url (str): The tweet URL to remove pathnames from.

    Returns:
        The tweet URL without any pathnames, or the input unchanged when it
        is not a recognizable tweet URL.
    """
    # Username is anchored at the start of the string; the tweet id may be
    # found anywhere. Both must be present to rebuild the canonical URL.
    username_match = re.match(
        r"https://twitter\.com/([^/]+)/status/\d+", tweet_url
    )
    id_match = re.search(r"https://twitter.com/\w+/status/(\d+)", tweet_url)

    if not (username_match and id_match):
        return tweet_url

    return (
        f"https://twitter.com/{username_match.group(1)}"
        f"/status/{id_match.group(1)}"
    )
173
+
174
+
175
def check_double_status(wayback_machine_url: str, original_tweet_url: str) -> bool:
    """
    Checks if a Wayback Machine URL contains two occurrences of "/status/" and if the original tweet does not contain "twitter.com".

    Args:
        wayback_machine_url (str): The Wayback Machine URL to check.
        original_tweet_url (str): The original tweet URL to check.

    Returns:
        True if the conditions are met, False otherwise.
    """  # noqa: E501
    has_doubled_status = wayback_machine_url.count("/status/") == 2
    lacks_twitter_domain = "twitter.com" not in original_tweet_url

    return has_doubled_status and lacks_twitter_domain
193
+
194
+
195
def semicolon_parser(string: str) -> str:
    """
    Replaces semicolons in a string with %3B.

    Args:
        string (str): The string to replace semicolons in.

    Returns:
        The string with semicolons replaced by %3B.
    """
    # str.replace performs the same character-for-substring substitution as
    # the previous per-character join, in a single C-level pass.
    return string.replace(";", "%3B")
206
+
207
+
208
def is_tweet_url(twitter_url: str) -> bool:
    """
    Checks if the provided URL is a Twitter status URL.

    This function checks if the provided URL contains "/status/" exactly once, which is a common pattern in Twitter status URLs.

    Args:
        twitter_url (str): The URL to check.

    Returns:
        True if the URL is a Twitter status URL, False otherwise.
    """  # noqa: E501
    return twitter_url.count("/status/") == 1
224
+
225
+
226
def timestamp_parser(timestamp):
    """
    Parses a timestamp into a formatted string.

    Args:
        timestamp (str): The timestamp string to parse.

    Returns:
        The parsed timestamp in the format "%Y/%m/%d %H:%M:%S", or None if the
        timestamp could not be parsed.
    """
    # Try from coarsest to finest granularity; strptime rejects a format
    # that leaves unconsumed characters, so the first exact match wins.
    candidate_formats = (
        "%Y",
        "%Y%m",
        "%Y%m%d",
        "%Y%m%d%H",
        "%Y%m%d%H%M",
        "%Y%m%d%H%M%S",
    )

    for candidate in candidate_formats:
        try:
            return datetime.strptime(timestamp, candidate).strftime(
                "%Y/%m/%d %H:%M:%S"
            )
        except ValueError:
            pass

    return None
256
+
257
+
258
def check_url_scheme(url):
    """
    Corrects the URL scheme if it contains more than two slashes following the scheme.

    This function uses a regular expression to find 'http:' or 'https:' followed by two or more slashes.
    It then replaces this with the scheme followed by exactly two slashes.

    Args:
        url (str): The URL to be corrected.

    Returns:
        The corrected URL.
    """  # noqa: E501
    # \1 keeps the matched scheme; any run of 2+ slashes after it collapses
    # to exactly two.
    return re.sub(r"(http:|https:)(/{2,})", r"\1//", url)