yuyutsu07 committed on
Commit
32fd335
·
verified ·
1 Parent(s): 0c50b6c

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +374 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from datetime import datetime, timedelta
3
+
4
+ import streamlit as st
5
+ import streamlit.components.v1 as components
6
+
7
+ from waybacktweets.api.export import TweetsExporter
8
+ from waybacktweets.api.parse import TweetsParser
9
+ from waybacktweets.api.request import WaybackTweets
10
+ from waybacktweets.api.visualize import HTMLTweetsVisualizer
11
+ from waybacktweets.config import FIELD_OPTIONS, config
12
+
13
# ------ Initial Settings ------ #

# Static assets: browser tab icon, title banner and the download glyph.
PAGE_ICON = "assets/parthenon.png"
TITLE = "assets/waybacktweets.png"
DOWNLOAD = "assets/download.svg"

# Both request options default to None; the results section overrides them
# when the "unique" checkbox is ticked.
collapse = matchtype = None

# Default date filter: the last 180 days (6 x 30) up to the current moment.
start_date = datetime.now() - timedelta(days=180)
end_date = datetime.now()
# Earliest date the date picker accepts.
min_date = datetime(2006, 1, 1)

# ------ Verbose Mode Configuration ------ #

config.verbose = False

# ------ Page Configuration ------ #

# Markdown shown in the app menu's "About" entry; the copyright range ends
# at the current year.
ABOUT_MARKDOWN = f"""
[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![License](https://img.shields.io/github/license/claromes/waybacktweets)](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)

The application is a prototype hosted on Streamlit Cloud, serving as an alternative to the command line tool.

© 2023 - {end_date.year}, [Claromes](https://claromes.com)

---
"""  # noqa: E501

st.set_page_config(
    page_title="Wayback Tweets",
    page_icon=PAGE_ICON,
    layout="centered",
    menu_items={
        "About": ABOUT_MARKDOWN,
        "Report a bug": "https://github.com/claromes/waybacktweets/issues",
    },
)
48
+
49
# ------ Set States and Params ------ #

# Seed every session-state key exactly once; keys that already exist keep
# the value stored by an earlier run of the script.
for state_key, initial_value in (
    ("current_username", ""),
    ("count", False),
    ("archived_timestamp_filter", (start_date, end_date)),
    ("username_value", ""),
    ("expanded_value", False),
    ("query", False),
    ("update_component", 0),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value

# Make sure the `username` query parameter exists so later code can read it
# as an attribute without a lookup error.
if "username" not in st.query_params:
    st.query_params["username"] = ""
74
+
75
# ------ Add Custom CSS Style ------ #

# Stylesheet overrides: dim the header, frame iframes, hide the input
# instructions and the full-screen button, and cap one container's width.
custom_css = """
    <style>
        header[data-testid="stHeader"] {
            opacity: 0.5;
        }
        iframe {
            border: 1px solid #dddddd;
            border-radius: 0.5rem;
        }
        div[data-testid="InputInstructions"] {
            visibility: hidden;
        }
        button[data-testid="StyledFullScreenButton"] {
            display: none;
        }
        div[class="st-emotion-cache-1v0mbdj e115fcil1"] {
            max-width: 100%;
        }
    </style>
    """

st.html(custom_css)
99
+
100
+ # ------ Requestings ------ #
101
+
102
+
103
@st.cache_data(ttl=600, show_spinner=False)
def wayback_tweets(
    username,
    collapse,
    timestamp_from,
    timestamp_to,
    limit,
    offset,
    matchtype,
):
    """Request the archived tweets for *username*.

    All arguments are forwarded positionally, in the same order, to
    ``WaybackTweets``. The result is cached by Streamlit for 600 seconds
    and Streamlit's own spinner is disabled.

    Returns:
        Whatever ``WaybackTweets(...).get()`` returns.
    """
    return WaybackTweets(
        username,
        collapse,
        timestamp_from,
        timestamp_to,
        limit,
        offset,
        matchtype,
    ).get()
125
+
126
+
127
@st.cache_data(ttl=600, show_spinner=False)
def tweets_parser(archived_tweets, username, field_options):
    """Parse *archived_tweets* with ``TweetsParser`` and return the result.

    The three arguments are handed to ``TweetsParser`` unchanged. The result
    of ``parse()`` is cached by Streamlit for 600 seconds.
    """
    return TweetsParser(archived_tweets, username, field_options).parse()
133
+
134
+
135
@st.cache_data(ttl=600, show_spinner=False)
def tweets_exporter(parsed_tweets, username, field_options):
    """Build a ``TweetsExporter`` and return its dataframe and file name.

    Returns:
        A ``(dataframe, filename)`` tuple read from the exporter's
        ``dataframe`` and ``filename`` attributes. Cached by Streamlit for
        600 seconds.
    """
    tweets_export = TweetsExporter(parsed_tweets, username, field_options)

    return tweets_export.dataframe, tweets_export.filename
143
+
144
+
145
+ # ------ Custom JavaScript ------ #
146
+
147
+
148
def scroll_page():
    """Scroll the parent page's ``section.main`` element to (700, 700).

    The script is injected through a zero-sized HTML component. The current
    ``update_component`` counter from the session state is embedded in the
    markup so the component content differs between calls, which forces the
    component to update and the scroll to run again.
    """
    tick = st.session_state.update_component
    script = f"""
    <script>
        window.parent.document.querySelector('section.main').scrollTo(700, 700);
        let update_component = {tick} // Force component update to generate scroll
    </script>
    """  # noqa: E501

    components.html(script, width=0, height=0)
157
+
158
+
159
# ------ Query Param ------ #

# A non-empty `username` query parameter pre-fills the username field, opens
# the filter expander and flags the query to run without a button click.
username_param = st.query_params.username

if username_param != "":
    st.session_state.username_value = username_param
    st.session_state.expanded_value = True
    st.session_state.query = True

    # Bump the counter before scrolling so the injected script changes.
    st.session_state.update_component += 1
    scroll_page()
168
+
169
# ------ User Interface Settings ------ #

st.info(
    "🥳 [**Pre-release 1.0x: Python module, CLI, and new Streamlit app**](https://github.com/claromes/waybacktweets/releases)"  # noqa: E501
)

st.image(TITLE, use_column_width="never")
st.caption(
    "[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases) [![Star](https://img.shields.io/github/stars/claromes/waybacktweets?style=social)](https://github.com/claromes/waybacktweets)"  # noqa: E501
)

# Introductory paragraphs, written one after another in this order.
for intro_paragraph in (
    "Retrieves archived tweets CDX data in HTML (for easy viewing of the tweets using the `iframe` tag), CSV, and JSON formats.",  # noqa: E501
    "This application uses the Wayback Tweets Python package, which can be used either as a module or as a standalone command-line tool. [Read the documentation](https://claromes.github.io/waybacktweets) for more information.",  # noqa: E501
    "To access the legacy version of Wayback Tweets, [click here](https://waybacktweets-legacy.streamlit.app).",  # noqa: E501
):
    st.write(intro_paragraph)

st.divider()

# -- Filters -- #

username = st.text_input(
    "Username *",
    value=st.session_state.username_value,
    key="username",
    placeholder="Without @",
)

with st.expander("Filtering", expanded=st.session_state.expanded_value):

    # The picker's current selection is stored in the session state; the
    # results section reads the range from there.
    st.session_state.archived_timestamp_filter = st.date_input(
        "Tweets saved between",
        value=(start_date, end_date),
        min_value=min_date,
        max_value=end_date,
        format="YYYY/MM/DD",
        help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
    )
    st.caption(
        ":orange[note: large date range takes a long time to process, and the app's resources may not be sufficient. Try to perform searches with smaller ranges to get faster results.]"  # noqa: E501
    )

    col1, col2 = st.columns(2)

    limit = col1.text_input(
        "Limit",
        key="limit",
        help="Query result limits",
    )

    offset = col2.text_input(
        "Offset",
        key="offset",
        help="Allows for a simple way to scroll through the results",
    )

    unique = st.checkbox(
        "Only unique Wayback Machine URLs",
        key="unique",
        help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`",  # noqa: E501
    )
    st.caption(
        ":orange[note: according to the official documentation of the Wayback CDX Server API, the query to retrieve unique URLs may be slow at the moment.]"  # noqa: E501
    )


query = st.button("Query", type="primary", use_container_width=True)

# With no `username` query parameter, drop the query parameters and let the
# button state alone decide whether the query runs.
if st.query_params.username == "":
    st.query_params.clear()
    st.session_state.query = query
247
+
248
# ------ Results ------ #


def _render_download_link(payload, mime_type, download_name):
    """Render the download icon beside a base64 data-URI download link.

    Args:
        payload: Text content embedded (base64-encoded) in the link.
        mime_type: Media type written into the ``data:`` URI.
        download_name: File name used for the ``download`` attribute, the
            tooltip and the visible link text.
    """
    icon_col, link_col = st.columns([1, 18])
    icon_col.image(DOWNLOAD, width=22)

    encoded = base64.b64encode(payload.encode()).decode()
    href = f"data:{mime_type};base64,{encoded}"

    link_col.markdown(
        f'<a href="{href}" download="{download_name}" title="Download {download_name}">{download_name}</a>',  # noqa: E501
        unsafe_allow_html=True,
    )


if username != st.session_state.current_username:
    st.session_state.current_username = username

# Run when a query was requested, or when an earlier run already produced
# results (`count` then holds the number of captured URLs).
if st.session_state.query or st.session_state.count:
    current_username = st.session_state.current_username

    if unique:
        collapse = "urlkey"
        matchtype = "prefix"

    # While the user is still picking a range, the date picker can hold
    # fewer than two dates; indexing it would raise IndexError.
    date_range = st.session_state.archived_timestamp_filter
    if len(date_range) != 2:
        st.warning("Select both a start date and an end date to run the query.")
        st.stop()
    timestamp_from, timestamp_to = date_range

    try:
        with st.spinner(f"Waybacking @{current_username}'s archived tweets"):
            # Bound to a new name so the cached `wayback_tweets` function
            # is not overwritten by its own result.
            archived_tweets = wayback_tweets(
                current_username,
                collapse,
                timestamp_from,
                timestamp_to,
                limit,
                offset,
                matchtype,
            )

        if not archived_tweets:
            st.error("No data was saved due to an empty response.")
            st.stop()

        with st.spinner(f"Parsing @{current_username}'s archived tweets"):
            parsed_tweets = tweets_parser(
                archived_tweets, current_username, FIELD_OPTIONS
            )

            df, file_name = tweets_exporter(
                parsed_tweets, current_username, FIELD_OPTIONS
            )

            csv_data = df.to_csv(index=False)
            json_data = df.to_json(orient="records", lines=False)
            visualizer = HTMLTweetsVisualizer(current_username, json_data)
            html_content = visualizer.generate()

        # -- Rendering -- #

        if csv_data and json_data and html_content:
            st.session_state.count = len(df)
            st.write(f"**{st.session_state.count} URLs have been captured**")

            # -- HTML -- #

            st.header("HTML", divider="gray", anchor=False)
            st.write(
                f"Visualize tweets more efficiently through `iframes`. Download the @{current_username}'s archived tweets in HTML."  # noqa: E501
            )

            _render_download_link(html_content, "text/html", f"{file_name}.html")

            # -- CSV -- #

            st.header("CSV", divider="gray", anchor=False)
            st.write(
                "Check the data returned in the dataframe below and download the file."
            )

            _render_download_link(csv_data, "file/csv", f"{file_name}.csv")

            st.dataframe(df, use_container_width=True)

            # -- JSON -- #

            st.header("JSON", divider="gray", anchor=False)
            st.write(
                "Check the data returned in JSON format below and download the file."
            )

            _render_download_link(json_data, "file/json", f"{file_name}.json")

            st.json(json_data, expanded=False)
    except TypeError as e:
        st.error(
            f"\n{e}. Refresh this page and try again.\n\n"
            "If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues)."  # noqa: E501
        )
        st.stop()
    except Exception as e:
        # Last-resort boundary: show the message in the page and end the run.
        st.error(str(e))
        st.stop()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ streamlit==1.36.0
2
+ waybacktweets==1.0a5