Spaces:
Sleeping
Sleeping
Upload 43 files
Browse files- .github/FUNDING.yml +1 -0
- .github/workflows/docs.yml +34 -0
- .streamlit/config.toml +13 -0
- app/app.py +374 -0
- app/requirements.txt +2 -0
- assets/download.svg +9 -0
- assets/parthenon.png +0 -0
- assets/waybacktweets.png +0 -0
- assets/waybacktweets_title.png +0 -0
- docs/Makefile +20 -0
- docs/_static/card.png +0 -0
- docs/_static/css/custom.css +14 -0
- docs/_templates/page.html +14 -0
- docs/api.rst +72 -0
- docs/cli.rst +77 -0
- docs/conf.py +52 -0
- docs/contribute.rst +38 -0
- docs/exceptions.rst +44 -0
- docs/field_options.rst +42 -0
- docs/index.rst +83 -0
- docs/installation.rst +64 -0
- docs/make.bat +35 -0
- docs/outputs.rst +29 -0
- docs/quickstart.rst +48 -0
- docs/streamlit.rst +63 -0
- docs/todo.rst +18 -0
- docs/workflow.rst +25 -0
- legacy_app/legacy_app.py +525 -0
- legacy_app/requirements.txt +2 -0
- waybacktweets/__init__.py +6 -0
- waybacktweets/_cli.py +158 -0
- waybacktweets/api/__init__.py +0 -0
- waybacktweets/api/export.py +121 -0
- waybacktweets/api/parse.py +295 -0
- waybacktweets/api/request.py +109 -0
- waybacktweets/api/visualize.py +221 -0
- waybacktweets/config/__init__.py +4 -0
- waybacktweets/config/config.py +28 -0
- waybacktweets/config/field_options.py +20 -0
- waybacktweets/exceptions/__init__.py +9 -0
- waybacktweets/exceptions/exceptions.py +33 -0
- waybacktweets/utils/__init__.py +14 -0
- waybacktweets/utils/utils.py +279 -0
.github/FUNDING.yml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
github: [claromes]
|
.github/workflows/docs.yml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: docs
|
| 2 |
+
|
| 3 |
+
on: [push, pull_request, workflow_dispatch]
|
| 4 |
+
|
| 5 |
+
permissions:
|
| 6 |
+
contents: write
|
| 7 |
+
|
| 8 |
+
jobs:
|
| 9 |
+
docs:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
steps:
|
| 12 |
+
- uses: actions/checkout@v4
|
| 13 |
+
- uses: actions/setup-python@v5
|
| 14 |
+
with:
|
| 15 |
+
python-version: 3.11
|
| 16 |
+
- name: Install Poetry
|
| 17 |
+
run: |
|
| 18 |
+
curl -sSL https://install.python-poetry.org | python3 -
|
| 19 |
+
- name: Install dependencies
|
| 20 |
+
run: |
|
| 21 |
+
poetry install
|
| 22 |
+
- name: Sphinx build
|
| 23 |
+
run: |
|
| 24 |
+
mkdir gh-pages
|
| 25 |
+
touch gh-pages/.nojekyll
|
| 26 |
+
cd docs/
|
| 27 |
+
poetry run sphinx-build -b html . _build
|
| 28 |
+
cp -r _build/* ../gh-pages/
|
| 29 |
+
- name: Deploy documentation
|
| 30 |
+
if: ${{ github.event_name == 'push' }}
|
| 31 |
+
uses: JamesIves/github-pages-deploy-action@4.1.4
|
| 32 |
+
with:
|
| 33 |
+
branch: gh-pages
|
| 34 |
+
folder: gh-pages
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
base = "light"
|
| 3 |
+
primaryColor = "black"
|
| 4 |
+
secondaryBackgroundColor = "gainsboro"
|
| 5 |
+
textColor = "black"
|
| 6 |
+
backgroundColor = "whitesmoke"
|
| 7 |
+
font = "serif"
|
| 8 |
+
|
| 9 |
+
[client]
|
| 10 |
+
toolbarMode = "minimal"
|
| 11 |
+
|
| 12 |
+
[server]
|
| 13 |
+
port = 8501
|
app/app.py
ADDED
|
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
from datetime import datetime, timedelta
|
| 3 |
+
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import streamlit.components.v1 as components
|
| 6 |
+
|
| 7 |
+
from waybacktweets.api.export import TweetsExporter
|
| 8 |
+
from waybacktweets.api.parse import TweetsParser
|
| 9 |
+
from waybacktweets.api.request import WaybackTweets
|
| 10 |
+
from waybacktweets.api.visualize import HTMLTweetsVisualizer
|
| 11 |
+
from waybacktweets.config import FIELD_OPTIONS, config
|
| 12 |
+
|
| 13 |
+
# ------ Initial Settings ------ #
|
| 14 |
+
|
| 15 |
+
PAGE_ICON = "assets/parthenon.png"
|
| 16 |
+
TITLE = "assets/waybacktweets.png"
|
| 17 |
+
DOWNLOAD = "assets/download.svg"
|
| 18 |
+
|
| 19 |
+
collapse = None
|
| 20 |
+
matchtype = None
|
| 21 |
+
start_date = datetime.now() - timedelta(days=30 * 6)
|
| 22 |
+
end_date = datetime.now()
|
| 23 |
+
min_date = datetime(2006, 1, 1)
|
| 24 |
+
|
| 25 |
+
# ------ Verbose Mode Configuration ------ #
|
| 26 |
+
|
| 27 |
+
config.verbose = False
|
| 28 |
+
|
| 29 |
+
# ------ Page Configuration ------ #
|
| 30 |
+
|
| 31 |
+
st.set_page_config(
|
| 32 |
+
page_title="Wayback Tweets",
|
| 33 |
+
page_icon=PAGE_ICON,
|
| 34 |
+
layout="centered",
|
| 35 |
+
menu_items={
|
| 36 |
+
"About": f"""
|
| 37 |
+
[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md) [](https://github.com/claromes/waybacktweets)
|
| 38 |
+
|
| 39 |
+
The application is a prototype hosted on Streamlit Cloud, serving as an alternative to the command line tool.
|
| 40 |
+
|
| 41 |
+
© 2023 - {end_date.year}, [Claromes](https://claromes.com)
|
| 42 |
+
|
| 43 |
+
---
|
| 44 |
+
""", # noqa: E501
|
| 45 |
+
"Report a bug": "https://github.com/claromes/waybacktweets/issues",
|
| 46 |
+
},
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
# ------ Set States and Params ------ #
|
| 50 |
+
|
| 51 |
+
if "current_username" not in st.session_state:
|
| 52 |
+
st.session_state.current_username = ""
|
| 53 |
+
|
| 54 |
+
if "count" not in st.session_state:
|
| 55 |
+
st.session_state.count = False
|
| 56 |
+
|
| 57 |
+
if "archived_timestamp_filter" not in st.session_state:
|
| 58 |
+
st.session_state.archived_timestamp_filter = (start_date, end_date)
|
| 59 |
+
|
| 60 |
+
if "username_value" not in st.session_state:
|
| 61 |
+
st.session_state.username_value = ""
|
| 62 |
+
|
| 63 |
+
if "expanded_value" not in st.session_state:
|
| 64 |
+
st.session_state.expanded_value = False
|
| 65 |
+
|
| 66 |
+
if "query" not in st.session_state:
|
| 67 |
+
st.session_state.query = False
|
| 68 |
+
|
| 69 |
+
if "update_component" not in st.session_state:
|
| 70 |
+
st.session_state.update_component = 0
|
| 71 |
+
|
| 72 |
+
if "username" not in st.query_params:
|
| 73 |
+
st.query_params["username"] = ""
|
| 74 |
+
|
| 75 |
+
# ------ Add Custom CSS Style ------ #
|
| 76 |
+
|
| 77 |
+
st.html(
|
| 78 |
+
"""
|
| 79 |
+
<style>
|
| 80 |
+
header[data-testid="stHeader"] {
|
| 81 |
+
opacity: 0.5;
|
| 82 |
+
}
|
| 83 |
+
iframe {
|
| 84 |
+
border: 1px solid #dddddd;
|
| 85 |
+
border-radius: 0.5rem;
|
| 86 |
+
}
|
| 87 |
+
div[data-testid="InputInstructions"] {
|
| 88 |
+
visibility: hidden;
|
| 89 |
+
}
|
| 90 |
+
button[data-testid="StyledFullScreenButton"] {
|
| 91 |
+
display: none;
|
| 92 |
+
}
|
| 93 |
+
div[class="st-emotion-cache-1v0mbdj e115fcil1"] {
|
| 94 |
+
max-width: 100%;
|
| 95 |
+
}
|
| 96 |
+
</style>
|
| 97 |
+
"""
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
# ------ Requestings ------ #
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
@st.cache_data(ttl=600, show_spinner=False)
|
| 104 |
+
def wayback_tweets(
|
| 105 |
+
username,
|
| 106 |
+
collapse,
|
| 107 |
+
timestamp_from,
|
| 108 |
+
timestamp_to,
|
| 109 |
+
limit,
|
| 110 |
+
offset,
|
| 111 |
+
matchtype,
|
| 112 |
+
):
|
| 113 |
+
response = WaybackTweets(
|
| 114 |
+
username,
|
| 115 |
+
collapse,
|
| 116 |
+
timestamp_from,
|
| 117 |
+
timestamp_to,
|
| 118 |
+
limit,
|
| 119 |
+
offset,
|
| 120 |
+
matchtype,
|
| 121 |
+
)
|
| 122 |
+
archived_tweets = response.get()
|
| 123 |
+
|
| 124 |
+
return archived_tweets
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
@st.cache_data(ttl=600, show_spinner=False)
|
| 128 |
+
def tweets_parser(archived_tweets, username, field_options):
|
| 129 |
+
parser = TweetsParser(archived_tweets, username, field_options)
|
| 130 |
+
parsed_tweets = parser.parse()
|
| 131 |
+
|
| 132 |
+
return parsed_tweets
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
@st.cache_data(ttl=600, show_spinner=False)
|
| 136 |
+
def tweets_exporter(parsed_tweets, username, field_options):
|
| 137 |
+
exporter = TweetsExporter(parsed_tweets, username, field_options)
|
| 138 |
+
|
| 139 |
+
df = exporter.dataframe
|
| 140 |
+
file_name = exporter.filename
|
| 141 |
+
|
| 142 |
+
return df, file_name
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# ------ Custom JavaScript ------ #
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def scroll_page():
|
| 149 |
+
js = f"""
|
| 150 |
+
<script>
|
| 151 |
+
window.parent.document.querySelector('section.main').scrollTo(700, 700);
|
| 152 |
+
let update_component = {st.session_state.update_component} // Force component update to generate scroll
|
| 153 |
+
</script>
|
| 154 |
+
""" # noqa: E501
|
| 155 |
+
|
| 156 |
+
components.html(js, width=0, height=0)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ------ Query Param ------ #
|
| 160 |
+
|
| 161 |
+
if st.query_params.username != "":
|
| 162 |
+
st.session_state.username_value = st.query_params.username
|
| 163 |
+
st.session_state.expanded_value = True
|
| 164 |
+
st.session_state.query = True
|
| 165 |
+
|
| 166 |
+
st.session_state.update_component += 1
|
| 167 |
+
scroll_page()
|
| 168 |
+
|
| 169 |
+
# ------ User Interface Settings ------ #
|
| 170 |
+
|
| 171 |
+
st.info(
|
| 172 |
+
"🥳 [**Pre-release 1.0x: Python module, CLI, and new Streamlit app**](https://github.com/claromes/waybacktweets/releases)" # noqa: E501
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
st.image(TITLE, use_column_width="never")
|
| 176 |
+
st.caption(
|
| 177 |
+
"[](https://github.com/claromes/waybacktweets/releases) [](https://github.com/claromes/waybacktweets)" # noqa: E501
|
| 178 |
+
)
|
| 179 |
+
st.write(
|
| 180 |
+
"Retrieves archived tweets CDX data in HTML (for easy viewing of the tweets using the `iframe` tag), CSV, and JSON formats." # noqa: E501
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
st.write(
|
| 184 |
+
"This application uses the Wayback Tweets Python package, which can be used either as a module or as a standalone command-line tool. [Read the documentation](https://claromes.github.io/waybacktweets) for more information." # noqa: E501
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
st.write(
|
| 188 |
+
"To access the legacy version of Wayback Tweets, [click here](https://waybacktweets-legacy.streamlit.app)." # noqa: E501
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
+
st.divider()
|
| 192 |
+
|
| 193 |
+
# -- Filters -- #
|
| 194 |
+
|
| 195 |
+
username = st.text_input(
|
| 196 |
+
"Username *",
|
| 197 |
+
value=st.session_state.username_value,
|
| 198 |
+
key="username",
|
| 199 |
+
placeholder="Without @",
|
| 200 |
+
)
|
| 201 |
+
|
| 202 |
+
with st.expander("Filtering", expanded=st.session_state.expanded_value):
|
| 203 |
+
|
| 204 |
+
st.session_state.archived_timestamp_filter = st.date_input(
|
| 205 |
+
"Tweets saved between",
|
| 206 |
+
(start_date, end_date),
|
| 207 |
+
min_date,
|
| 208 |
+
end_date,
|
| 209 |
+
format="YYYY/MM/DD",
|
| 210 |
+
help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
|
| 211 |
+
)
|
| 212 |
+
st.caption(
|
| 213 |
+
":orange[note: large date range takes a long time to process, and the app's resources may not be sufficient. Try to perform searches with smaller ranges to get faster results.]" # noqa: E501
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
col1, col2 = st.columns(2)
|
| 217 |
+
|
| 218 |
+
with col1:
|
| 219 |
+
limit = st.text_input(
|
| 220 |
+
"Limit",
|
| 221 |
+
key="limit",
|
| 222 |
+
help="Query result limits",
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
with col2:
|
| 226 |
+
offset = st.text_input(
|
| 227 |
+
"Offset",
|
| 228 |
+
key="offset",
|
| 229 |
+
help="Allows for a simple way to scroll through the results",
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
unique = st.checkbox(
|
| 233 |
+
"Only unique Wayback Machine URLs",
|
| 234 |
+
key="unique",
|
| 235 |
+
help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
|
| 236 |
+
)
|
| 237 |
+
st.caption(
|
| 238 |
+
":orange[note: according to the official documentation of the Wayback CDX Server API, the query to retrieve unique URLs may be slow at the moment.]" # noqa: E501
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
query = st.button("Query", type="primary", use_container_width=True)
|
| 243 |
+
|
| 244 |
+
if st.query_params.username == "":
|
| 245 |
+
st.query_params.clear()
|
| 246 |
+
st.session_state.query = query
|
| 247 |
+
|
| 248 |
+
# ------ Results ------ #
|
| 249 |
+
|
| 250 |
+
if username != st.session_state.current_username:
|
| 251 |
+
st.session_state.current_username = username
|
| 252 |
+
|
| 253 |
+
if st.session_state.query or st.session_state.count:
|
| 254 |
+
if unique:
|
| 255 |
+
collapse = "urlkey"
|
| 256 |
+
matchtype = "prefix"
|
| 257 |
+
|
| 258 |
+
try:
|
| 259 |
+
with st.spinner(
|
| 260 |
+
f"Waybacking @{st.session_state.current_username}'s archived tweets"
|
| 261 |
+
):
|
| 262 |
+
wayback_tweets = wayback_tweets(
|
| 263 |
+
st.session_state.current_username,
|
| 264 |
+
collapse,
|
| 265 |
+
st.session_state.archived_timestamp_filter[0],
|
| 266 |
+
st.session_state.archived_timestamp_filter[1],
|
| 267 |
+
limit,
|
| 268 |
+
offset,
|
| 269 |
+
matchtype,
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
if not wayback_tweets:
|
| 273 |
+
st.error("No data was saved due to an empty response.")
|
| 274 |
+
st.stop()
|
| 275 |
+
|
| 276 |
+
with st.spinner(
|
| 277 |
+
f"Parsing @{st.session_state.current_username}'s archived tweets"
|
| 278 |
+
):
|
| 279 |
+
parsed_tweets = tweets_parser(
|
| 280 |
+
wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
df, file_name = tweets_exporter(
|
| 284 |
+
parsed_tweets, st.session_state.current_username, FIELD_OPTIONS
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
csv_data = df.to_csv(index=False)
|
| 288 |
+
json_data = df.to_json(orient="records", lines=False)
|
| 289 |
+
html = HTMLTweetsVisualizer(username, json_data)
|
| 290 |
+
html_content = html.generate()
|
| 291 |
+
|
| 292 |
+
# -- Rendering -- #
|
| 293 |
+
|
| 294 |
+
if csv_data and json_data and html_content:
|
| 295 |
+
st.session_state.count = len(df)
|
| 296 |
+
st.write(f"**{st.session_state.count} URLs have been captured**")
|
| 297 |
+
|
| 298 |
+
# -- HTML -- #
|
| 299 |
+
|
| 300 |
+
st.header("HTML", divider="gray", anchor=False)
|
| 301 |
+
st.write(
|
| 302 |
+
f"Visualize tweets more efficiently through `iframes`. Download the @{st.session_state.current_username}'s archived tweets in HTML." # noqa: E501
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
col5, col6 = st.columns([1, 18])
|
| 306 |
+
|
| 307 |
+
with col5:
|
| 308 |
+
st.image(DOWNLOAD, width=22)
|
| 309 |
+
|
| 310 |
+
with col6:
|
| 311 |
+
b64_html = base64.b64encode(html_content.encode()).decode()
|
| 312 |
+
href_html = f"data:text/html;base64,{b64_html}"
|
| 313 |
+
|
| 314 |
+
st.markdown(
|
| 315 |
+
f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>', # noqa: E501
|
| 316 |
+
unsafe_allow_html=True,
|
| 317 |
+
)
|
| 318 |
+
|
| 319 |
+
# -- CSV -- #
|
| 320 |
+
|
| 321 |
+
st.header("CSV", divider="gray", anchor=False)
|
| 322 |
+
st.write(
|
| 323 |
+
"Check the data returned in the dataframe below and download the file."
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
col7, col8 = st.columns([1, 18])
|
| 327 |
+
|
| 328 |
+
with col7:
|
| 329 |
+
st.image(DOWNLOAD, width=22)
|
| 330 |
+
|
| 331 |
+
with col8:
|
| 332 |
+
b64_csv = base64.b64encode(csv_data.encode()).decode()
|
| 333 |
+
href_csv = f"data:file/csv;base64,{b64_csv}"
|
| 334 |
+
|
| 335 |
+
st.markdown(
|
| 336 |
+
f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>', # noqa: E501
|
| 337 |
+
unsafe_allow_html=True,
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
st.dataframe(df, use_container_width=True)
|
| 341 |
+
|
| 342 |
+
# -- JSON -- #
|
| 343 |
+
|
| 344 |
+
st.header("JSON", divider="gray", anchor=False)
|
| 345 |
+
st.write(
|
| 346 |
+
"Check the data returned in JSON format below and download the file."
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
col9, col10 = st.columns([1, 18])
|
| 350 |
+
|
| 351 |
+
with col9:
|
| 352 |
+
st.image(DOWNLOAD, width=22)
|
| 353 |
+
|
| 354 |
+
with col10:
|
| 355 |
+
b64_json = base64.b64encode(json_data.encode()).decode()
|
| 356 |
+
href_json = f"data:file/json;base64,{b64_json}"
|
| 357 |
+
|
| 358 |
+
st.markdown(
|
| 359 |
+
f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>', # noqa: E501
|
| 360 |
+
unsafe_allow_html=True,
|
| 361 |
+
)
|
| 362 |
+
|
| 363 |
+
st.json(json_data, expanded=False)
|
| 364 |
+
except TypeError as e:
|
| 365 |
+
st.error(
|
| 366 |
+
f"""
|
| 367 |
+
{e}. Refresh this page and try again.
|
| 368 |
+
|
| 369 |
+
If the problem persists [open an issue](https://github.com/claromes/waybacktweets/issues).""" # noqa: E501
|
| 370 |
+
)
|
| 371 |
+
st.stop()
|
| 372 |
+
except Exception as e:
|
| 373 |
+
st.error(str(e))
|
| 374 |
+
st.stop()
|
app/requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.36.0
|
| 2 |
+
waybacktweets==1.0a5
|
assets/download.svg
ADDED
|
|
assets/parthenon.png
ADDED
|
assets/waybacktweets.png
ADDED
|
assets/waybacktweets_title.png
ADDED
|
docs/Makefile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal makefile for Sphinx documentation
|
| 2 |
+
#
|
| 3 |
+
|
| 4 |
+
# You can set these variables from the command line, and also
|
| 5 |
+
# from the environment for the first two.
|
| 6 |
+
SPHINXOPTS ?=
|
| 7 |
+
SPHINXBUILD ?= sphinx-build
|
| 8 |
+
SOURCEDIR = .
|
| 9 |
+
BUILDDIR = _build
|
| 10 |
+
|
| 11 |
+
# Put it first so that "make" without argument is like "make help".
|
| 12 |
+
help:
|
| 13 |
+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
| 14 |
+
|
| 15 |
+
.PHONY: help Makefile
|
| 16 |
+
|
| 17 |
+
# Catch-all target: route all unknown targets to Sphinx using the new
|
| 18 |
+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
| 19 |
+
%: Makefile
|
| 20 |
+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
docs/_static/card.png
ADDED
|
docs/_static/css/custom.css
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
body {
|
| 2 |
+
font-family: Georgia, 'Times New Roman', Times, serif;
|
| 3 |
+
background-color: whitesmoke;
|
| 4 |
+
}
|
| 5 |
+
|
| 6 |
+
a:hover {
|
| 7 |
+
background-color: whitesmoke !important;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
#cli #usage #waybacktweets h3,
|
| 11 |
+
#cli .admonition-title,
|
| 12 |
+
.sphinxsidebarwrapper li ul li ul:has(a[href="#waybacktweets"]):last-child {
|
| 13 |
+
display: none;
|
| 14 |
+
}
|
docs/_templates/page.html
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "!page.html" %}
|
| 2 |
+
|
| 3 |
+
{% block extrahead %}
|
| 4 |
+
{{ super() }}
|
| 5 |
+
<meta name="description" content="Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data">
|
| 6 |
+
|
| 7 |
+
<meta property="og:title" content="{{ title|e }}" />
|
| 8 |
+
<meta property="og:description" content="Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data">
|
| 9 |
+
<meta property="og:image" content="https://claromes.github.io/waybacktweets/_static/card.png" />
|
| 10 |
+
|
| 11 |
+
<meta name="twitter:title" content="{{ title|e }}">
|
| 12 |
+
<meta name="twitter:description" content="Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data">
|
| 13 |
+
<meta property="twitter:image" content="https://claromes.github.io/waybacktweets/_static/card.png" />
|
| 14 |
+
{% endblock %}
|
docs/api.rst
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
API
|
| 2 |
+
====
|
| 3 |
+
|
| 4 |
+
Request
|
| 5 |
+
---------
|
| 6 |
+
|
| 7 |
+
.. automodule:: waybacktweets.api.request
|
| 8 |
+
|
| 9 |
+
.. autoclass:: WaybackTweets
|
| 10 |
+
:members:
|
| 11 |
+
|
| 12 |
+
.. _parser:
|
| 13 |
+
|
| 14 |
+
Parse
|
| 15 |
+
---------
|
| 16 |
+
|
| 17 |
+
.. automodule:: waybacktweets.api.parse
|
| 18 |
+
|
| 19 |
+
.. autoclass:: TweetsParser
|
| 20 |
+
:members:
|
| 21 |
+
:private-members:
|
| 22 |
+
|
| 23 |
+
.. autoclass:: TwitterEmbed
|
| 24 |
+
:members:
|
| 25 |
+
|
| 26 |
+
.. autoclass:: JsonParser
|
| 27 |
+
:members:
|
| 28 |
+
|
| 29 |
+
.. _exporter:
|
| 30 |
+
|
| 31 |
+
Export
|
| 32 |
+
---------
|
| 33 |
+
|
| 34 |
+
.. automodule:: waybacktweets.api.export
|
| 35 |
+
|
| 36 |
+
.. autoclass:: TweetsExporter
|
| 37 |
+
:members:
|
| 38 |
+
:private-members:
|
| 39 |
+
|
| 40 |
+
Visualize
|
| 41 |
+
-----------
|
| 42 |
+
|
| 43 |
+
.. automodule:: waybacktweets.api.visualize
|
| 44 |
+
|
| 45 |
+
.. autoclass:: HTMLTweetsVisualizer
|
| 46 |
+
:members:
|
| 47 |
+
:private-members:
|
| 48 |
+
|
| 49 |
+
.. _utils:
|
| 50 |
+
|
| 51 |
+
Utils
|
| 52 |
+
-------
|
| 53 |
+
|
| 54 |
+
.. automodule:: waybacktweets.utils.utils
|
| 55 |
+
|
| 56 |
+
.. autofunction:: check_double_status
|
| 57 |
+
.. autofunction:: check_pattern_tweet
|
| 58 |
+
.. autofunction:: check_url_scheme
|
| 59 |
+
.. autofunction:: clean_tweet_url
|
| 60 |
+
.. autofunction:: clean_wayback_machine_url
|
| 61 |
+
.. autofunction:: delete_tweet_pathnames
|
| 62 |
+
.. autofunction:: get_response
|
| 63 |
+
.. autofunction:: is_tweet_url
|
| 64 |
+
.. autofunction:: semicolon_parser
|
| 65 |
+
.. autofunction:: timestamp_parser
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
Config
|
| 69 |
+
------------
|
| 70 |
+
|
| 71 |
+
.. automodule:: waybacktweets.config.config
|
| 72 |
+
:members:
|
docs/cli.rst
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CLI
|
| 2 |
+
================
|
| 3 |
+
|
| 4 |
+
Usage
|
| 5 |
+
---------
|
| 6 |
+
|
| 7 |
+
.. click:: waybacktweets._cli:main
|
| 8 |
+
:prog: waybacktweets
|
| 9 |
+
:nested: full
|
| 10 |
+
|
| 11 |
+
Collapsing
|
| 12 |
+
------------
|
| 13 |
+
|
| 14 |
+
The Wayback Tweets command line tool recommends the use of three types of "collapse": ``urlkey``, ``digest``, and ``timestamp`` field.
|
| 15 |
+
|
| 16 |
+
- ``urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing.
|
| 17 |
+
|
| 18 |
+
- ``digest``: (`str`) The ``SHA1`` hash digest of the content, excluding the headers. It's usually a base-32-encoded string.
|
| 19 |
+
|
| 20 |
+
- ``timestamp``: (`datetime`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format. We recommend ``YYYYMMDD``.
|
| 21 |
+
|
| 22 |
+
However, it is possible to use it with other options. Read below text extracted from the official Wayback CDX Server API (Beta) documentation.
|
| 23 |
+
|
| 24 |
+
.. note::
|
| 25 |
+
|
| 26 |
+
A new form of filtering is the option to "collapse" results based on a field, or a substring of a field. Collapsing is done on adjacent CDX lines where all captures after the first one that are duplicate are filtered out. This is useful for filtering out captures that are "too dense" or when looking for unique captures.
|
| 27 |
+
|
| 28 |
+
To use collapsing, add one or more ``collapse=field`` or ``collapse=field:N`` where ``N`` is the first ``N`` characters of field to test.
|
| 29 |
+
|
| 30 |
+
- Ex: Only show at most 1 capture per hour (compare the first 10 digits of the ``timestamp`` field). Given 2 captures ``20130226010000`` and ``20130226010800``, since first 10 digits ``2013022601`` match, the 2nd capture will be filtered out:
|
| 31 |
+
|
| 32 |
+
http://web.archive.org/cdx/search/cdx?url=google.com&collapse=timestamp:10
|
| 33 |
+
|
| 34 |
+
The calendar page at `web.archive.org` uses this filter by default: `http://web.archive.org/web/*/archive.org`
|
| 35 |
+
|
| 36 |
+
- Ex: Only show unique captures by ``digest`` (note that only adjacent digest are collapsed, duplicates elsewhere in the cdx are not affected):
|
| 37 |
+
|
| 38 |
+
http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=digest
|
| 39 |
+
|
| 40 |
+
- Ex: Only show unique urls in a prefix query (filtering out captures except first capture of a given url). This is similar to the old prefix query in wayback (note: this query may be slow at the moment):
|
| 41 |
+
|
| 42 |
+
http://web.archive.org/cdx/search/cdx?url=archive.org&collapse=urlkey&matchType=prefix
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
URL Match Scope
|
| 46 |
+
-----------------
|
| 47 |
+
|
| 48 |
+
The CDX Server can return results matching a certain prefix, a certain host or all subdomains by using the ``matchType`` param.
|
| 49 |
+
|
| 50 |
+
The package ``waybacktweets`` uses the pathname ``/status`` followed by the wildcard '*' at the end of the URL to retrieve only tweets. However, if a value is provided for this parameter, the search will be made from the URL `twitter.com/<USERNAME>`.
|
| 51 |
+
|
| 52 |
+
Read below text extracted from the official Wayback CDX Server API (Beta) documentation.
|
| 53 |
+
|
| 54 |
+
.. note::
|
| 55 |
+
|
| 56 |
+
For example, if given the url: archive.org/about/ and:
|
| 57 |
+
|
| 58 |
+
- ``matchType=exact`` (default if omitted) will return results matching exactly archive.org/about/
|
| 59 |
+
|
| 60 |
+
- ``matchType=prefix`` will return results for all results under the path archive.org/about/
|
| 61 |
+
|
| 62 |
+
http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=prefix&limit=1000
|
| 63 |
+
|
| 64 |
+
- ``matchType=host`` will return results from host archive.org
|
| 65 |
+
|
| 66 |
+
http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=host&limit=1000
|
| 67 |
+
|
| 68 |
+
- ``matchType=domain`` will return results from host archive.org and all subhosts \*.archive.org
|
| 69 |
+
|
| 70 |
+
http://web.archive.org/cdx/search/cdx?url=archive.org/about/&matchType=domain&limit=1000
|
| 71 |
+
|
| 72 |
+
The matchType may also be set implicitly by using wildcard '*' at end or beginning of the url:
|
| 73 |
+
|
| 74 |
+
- If url is ends in '/\*', eg url=archive.org/\* the query is equivalent to url=archive.org/&matchType=prefix
|
| 75 |
+
- If url starts with '\*.', eg url=\*.archive.org/ the query is equivalent to url=archive.org/&matchType=domain
|
| 76 |
+
|
| 77 |
+
(Note: The domain mode is only available if the CDX is in `SURT <http://crawler.archive.org/articles/user_manual/glossary.html#surt>`_-order format.)
|
docs/conf.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
|
| 3 |
+
from pallets_sphinx_themes import ProjectLink, get_version
|
| 4 |
+
|
| 5 |
+
project = "Wayback Tweets"
|
| 6 |
+
release, version = get_version("waybacktweets")
|
| 7 |
+
rst_epilog = f".. |release| replace:: v{release}"
|
| 8 |
+
copyright = f"2023 - {datetime.datetime.now().year}, Claromes · Icon by The Doodle Library · Title font by Google, licensed under the Open Font License · Pre-release: v{release}" # noqa: E501
|
| 9 |
+
author = "Claromes"
|
| 10 |
+
|
| 11 |
+
# -- General configuration ---------------------------------------------------
|
| 12 |
+
|
| 13 |
+
extensions = [
|
| 14 |
+
"sphinx.ext.autodoc",
|
| 15 |
+
"sphinx.ext.napoleon",
|
| 16 |
+
"sphinx.ext.extlinks",
|
| 17 |
+
"sphinx.ext.intersphinx",
|
| 18 |
+
"pallets_sphinx_themes",
|
| 19 |
+
"sphinxcontrib.mermaid",
|
| 20 |
+
"sphinx_new_tab_link",
|
| 21 |
+
"sphinx_click.ext",
|
| 22 |
+
"sphinx_autodoc_typehints",
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
templates_path = ["_templates"]
|
| 26 |
+
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
| 27 |
+
autodoc_typehints = "description"
|
| 28 |
+
|
| 29 |
+
# -- Options for HTML output -------------------------------------------------
|
| 30 |
+
|
| 31 |
+
html_theme = "flask"
|
| 32 |
+
html_static_path = ["_static"]
|
| 33 |
+
html_css_files = ["css/custom.css"]
|
| 34 |
+
html_context = {
|
| 35 |
+
"project_links": [
|
| 36 |
+
ProjectLink("PyPI Releases", "https://pypi.org/project/waybacktweets/"),
|
| 37 |
+
ProjectLink("Source Code", "https://github.com/claromes/waybacktweets/"),
|
| 38 |
+
ProjectLink(
|
| 39 |
+
"Issue Tracker", "https://github.com/claromes/waybacktweets/issues/"
|
| 40 |
+
),
|
| 41 |
+
ProjectLink("Mastodon", "https://ruby.social/@claromes"),
|
| 42 |
+
ProjectLink("Bluesky", "https://bsky.app/profile/claromes.com"),
|
| 43 |
+
]
|
| 44 |
+
}
|
| 45 |
+
html_sidebars = {
|
| 46 |
+
"index": ["project.html", "localtoc.html", "searchbox.html"],
|
| 47 |
+
"**": ["localtoc.html", "relations.html", "searchbox.html"],
|
| 48 |
+
}
|
| 49 |
+
html_favicon = "../assets/parthenon.png"
|
| 50 |
+
html_logo = "../assets/parthenon.png"
|
| 51 |
+
html_title = f"Wayback Tweets Documentation ({version})"
|
| 52 |
+
html_show_sourcelink = False
|
docs/contribute.rst
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Contribute
|
| 2 |
+
================
|
| 3 |
+
|
| 4 |
+
Here are all the ways you can contribute to this project.
|
| 5 |
+
|
| 6 |
+
Testing
|
| 7 |
+
---------
|
| 8 |
+
|
| 9 |
+
The best way to help is by using the package, either on the command line or as a module, suggesting improvements and reporting bugs. You're very welcome to `open an issue <https://github.com/claromes/waybacktweets/issues/>`_.
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
Hacking
|
| 13 |
+
---------
|
| 14 |
+
|
| 15 |
+
If you have Python skills, contribute to the `code <https://github.com/claromes/waybacktweets/>`_.
|
| 16 |
+
|
| 17 |
+
These are the prerequisites:
|
| 18 |
+
|
| 19 |
+
- Python 3.10+
|
| 20 |
+
- Poetry
|
| 21 |
+
|
| 22 |
+
Install from the source, following the :ref:`installation` instructions.
|
| 23 |
+
|
| 24 |
+
Brief explanation about the code under the Wayback Tweets directory:
|
| 25 |
+
|
| 26 |
+
- ``app``: Streamlit application code
|
| 27 |
+
- ``assets``: Title and logo images
|
| 28 |
+
- ``docs``: Documentation generated with Sphinx
|
| 29 |
+
- ``legacy_app``: Legacy Streamlit application code
|
| 30 |
+
- ``waybacktweets/api``: Main package modules
|
| 31 |
+
- ``waybacktweets/config``: Global configuration module
|
| 32 |
+
- ``waybacktweets/exceptions``: Wayback Tweets Exceptions
|
| 33 |
+
- ``waybacktweets/utils``: Helper functions used in the package
|
| 34 |
+
|
| 35 |
+
Sponsoring
|
| 36 |
+
------------
|
| 37 |
+
|
| 38 |
+
You can also donate to the project's developer and maintainer, `Claromes <https://claromes.com>`_, via `GitHub Sponsor <https://github.com/sponsors/claromes>`_ or if you are interested in sponsoring the project you can contact via email at support at claromes dot com.
|
docs/exceptions.rst
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Exceptions
|
| 2 |
+
================
|
| 3 |
+
|
| 4 |
+
These are the most common errors and are handled by the ``waybacktweets`` package.
|
| 5 |
+
|
| 6 |
+
ReadTimeoutError
|
| 7 |
+
------------------
|
| 8 |
+
|
| 9 |
+
This error occurs when a request to the web.archive.org server takes too long to respond. The server could be overloaded or there could be network issues.
|
| 10 |
+
|
| 11 |
+
The output message from the package would be: ``Connection to web.archive.org timed out.``
|
| 12 |
+
|
| 13 |
+
ConnectionError
|
| 14 |
+
------------------
|
| 15 |
+
|
| 16 |
+
This error is raised when the package fails to establish a new connection with web.archive.org. This could be due to network issues or the server being down.
|
| 17 |
+
|
| 18 |
+
The output message from the package would be: ``Failed to establish a new connection with web.archive.org. Max retries exceeded.``
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
This is the error often returned when performing experimental parsing of URLs with the mimetype ``application/json``.
|
| 22 |
+
|
| 23 |
+
The warning output message from the package would be: ``Connection error with https://web.archive.org/web/<TIMESTAMP>/https://twitter.com/<USERNAME>/status/<TWEET_ID>. Max retries exceeded. Error parsing the JSON, but the CDX data was saved.``
|
| 24 |
+
|
| 25 |
+
HTTPError
|
| 26 |
+
------------------
|
| 27 |
+
|
| 28 |
+
This error occurs when the Internet Archive services are temporarily offline. This could be due to maintenance or server issues.
|
| 29 |
+
|
| 30 |
+
The output message from the package would be: ``Temporarily Offline: Internet Archive services are temporarily offline. Please check Internet Archive Twitter feed (https://twitter.com/internetarchive) for the latest information.``
|
| 31 |
+
|
| 32 |
+
EmptyResponseError
|
| 33 |
+
---------------------
|
| 34 |
+
|
| 35 |
+
This exception raised for empty responses.
|
| 36 |
+
|
| 37 |
+
The output message from the package would be: ``No data was saved due to an empty response.``
|
| 38 |
+
|
| 39 |
+
Warning
|
| 40 |
+
------------------
|
| 41 |
+
|
| 42 |
+
It is possible to encounter the following warning when running the ``TweetsParser`` class (:ref:`parser`): ``<TWEET_URL> not available on the user's Twitter account, but the CDX data was saved.``
|
| 43 |
+
|
| 44 |
+
This occurs when the original tweet is no longer available on Twitter and has possibly been deleted.
|
docs/field_options.rst
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.. _field_options:
|
| 2 |
+
|
| 3 |
+
Field Options
|
| 4 |
+
================
|
| 5 |
+
|
| 6 |
+
The package performs several parses to facilitate the analysis of archived tweets and types of tweets. The fields below are available, which can be passed to the :ref:`parser` and :ref:`exporter`, in addition, the command line tool returns all these fields.
|
| 7 |
+
|
| 8 |
+
- ``archived_urlkey``: (`str`) A canonical transformation of the URL you supplied, for example, ``org,eserver,tc)/``. Such keys are useful for indexing.
|
| 9 |
+
|
| 10 |
+
- ``archived_timestamp``: (`str`) A 14 digit date-time representation in the ``YYYYMMDDhhmmss`` format.
|
| 11 |
+
|
| 12 |
+
- ``parsed_archived_timestamp``: (`str`) The ``archived_timestamp`` in human-readable format.
|
| 13 |
+
|
| 14 |
+
- ``archived_tweet_url``: (`str`) The archived URL.
|
| 15 |
+
|
| 16 |
+
- ``parsed_archived_tweet_url``: (`str`) The archived URL after parsing. It is not guaranteed that this option will be archived, it is just a facilitator, as the originally archived URL does not always exist, due to changes in URLs and web services of the social network Twitter. Check the :ref:`utils`.
|
| 17 |
+
|
| 18 |
+
- ``original_tweet_url``: (`str`) The original tweet URL.
|
| 19 |
+
|
| 20 |
+
- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary. Check the :ref:`utils`.
|
| 21 |
+
|
| 22 |
+
- ``available_tweet_text``: (`str`) The tweet text extracted from the URL that is still available on the Twitter account.
|
| 23 |
+
|
| 24 |
+
- ``available_tweet_is_RT``: (`bool`) Whether the tweet from the ``available_tweet_text`` field is a retweet or not.
|
| 25 |
+
|
| 26 |
+
- ``available_tweet_info``: (`str`) Name and date of the tweet from the ``available_tweet_text`` field.
|
| 27 |
+
|
| 28 |
+
- ``archived_mimetype``: (`str`) The mimetype of the archived content, which can be one of these:
|
| 29 |
+
|
| 30 |
+
- ``text/html``
|
| 31 |
+
|
| 32 |
+
- ``warc/revisit``
|
| 33 |
+
|
| 34 |
+
- ``application/json``
|
| 35 |
+
|
| 36 |
+
- ``unk``
|
| 37 |
+
|
| 38 |
+
- ``archived_statuscode``: (`str`) The HTTP status code of the snapshot. If the mimetype is ``warc/revisit``, the value returned for the ``statuscode`` key can be blank, but the actual value is the same as that of any other entry that has the same ``digest`` as this entry. If the mimetype is ``application/json``, the value is usually empty or ``-``.
|
| 39 |
+
|
| 40 |
+
- ``archived_digest``: (`str`) The ``SHA1`` hash digest of the content, excluding the headers. It's usually a base-32-encoded string.
|
| 41 |
+
|
| 42 |
+
- ``archived_length``: (`int`) The compressed byte size of the corresponding WARC record, which includes WARC headers, HTTP headers, and content payload.
|
docs/index.rst
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.. rst-class:: hide-header
|
| 2 |
+
|
| 3 |
+
Wayback Tweets
|
| 4 |
+
================
|
| 5 |
+
|
| 6 |
+
.. image:: ../assets/waybacktweets_title.png
|
| 7 |
+
:alt: Wayback Tweets
|
| 8 |
+
:align: center
|
| 9 |
+
|
| 10 |
+
Pre-release: |release|
|
| 11 |
+
|
| 12 |
+
Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing (see :ref:`field_options`), and saves the data in HTML (for easy viewing of the tweets using the ``iframe`` tag), CSV, and JSON formats.
|
| 13 |
+
|
| 14 |
+
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.12528448.svg
|
| 15 |
+
:target: https://doi.org/10.5281/zenodo.12528448
|
| 16 |
+
|
| 17 |
+
.. note::
|
| 18 |
+
Intensive queries can lead to rate limiting, resulting in a temporary ban of a few minutes from web.archive.org.
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
User Guide
|
| 22 |
+
------------
|
| 23 |
+
|
| 24 |
+
.. toctree::
|
| 25 |
+
:maxdepth: 2
|
| 26 |
+
|
| 27 |
+
installation
|
| 28 |
+
quickstart
|
| 29 |
+
workflow
|
| 30 |
+
field_options
|
| 31 |
+
outputs
|
| 32 |
+
exceptions
|
| 33 |
+
contribute
|
| 34 |
+
todo
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
Command-Line Interface
|
| 38 |
+
------------------------
|
| 39 |
+
.. toctree::
|
| 40 |
+
:maxdepth: 2
|
| 41 |
+
|
| 42 |
+
cli
|
| 43 |
+
|
| 44 |
+
Streamlit Web App
|
| 45 |
+
-------------------
|
| 46 |
+
|
| 47 |
+
.. toctree::
|
| 48 |
+
:maxdepth: 2
|
| 49 |
+
|
| 50 |
+
streamlit
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
API Reference
|
| 54 |
+
---------------
|
| 55 |
+
|
| 56 |
+
.. toctree::
|
| 57 |
+
:maxdepth: 2
|
| 58 |
+
|
| 59 |
+
api
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
Additional Information
|
| 63 |
+
-----------------------
|
| 64 |
+
|
| 65 |
+
.. toctree::
|
| 66 |
+
:maxdepth: 1
|
| 67 |
+
|
| 68 |
+
.. raw:: html
|
| 69 |
+
|
| 70 |
+
<ul>
|
| 71 |
+
<li><a href="https://github.com/claromes/waybacktweets/blob/main/LICENSE.md" target="_blank">GPL-3.0 license</a></li>
|
| 72 |
+
<li><a href="https://github.com/claromes/waybacktweets/releases" target="_blank">Changes</a></li>
|
| 73 |
+
</ul>
|
| 74 |
+
|
| 75 |
+
Indices and tables
|
| 76 |
+
----------------------
|
| 77 |
+
|
| 78 |
+
.. toctree::
|
| 79 |
+
:maxdepth: 2
|
| 80 |
+
|
| 81 |
+
genindex
|
| 82 |
+
modindex
|
| 83 |
+
search
|
docs/installation.rst
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.. _installation:
|
| 2 |
+
|
| 3 |
+
Installation
|
| 4 |
+
================
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
Using pip
|
| 8 |
+
------------
|
| 9 |
+
|
| 10 |
+
.. code-block:: shell
|
| 11 |
+
|
| 12 |
+
pip install waybacktweets
|
| 13 |
+
|
| 14 |
+
From source
|
| 15 |
+
-------------
|
| 16 |
+
|
| 17 |
+
Clone the repository:
|
| 18 |
+
|
| 19 |
+
.. code-block:: shell
|
| 20 |
+
|
| 21 |
+
git clone git@github.com:claromes/waybacktweets.git
|
| 22 |
+
|
| 23 |
+
Change directory:
|
| 24 |
+
|
| 25 |
+
.. code-block:: shell
|
| 26 |
+
|
| 27 |
+
cd waybacktweets
|
| 28 |
+
|
| 29 |
+
Install poetry, if you haven't already:
|
| 30 |
+
|
| 31 |
+
.. code-block:: shell
|
| 32 |
+
|
| 33 |
+
pip install poetry
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
Install the dependencies:
|
| 37 |
+
|
| 38 |
+
.. code-block:: shell
|
| 39 |
+
|
| 40 |
+
poetry install
|
| 41 |
+
|
| 42 |
+
Run the CLI:
|
| 43 |
+
|
| 44 |
+
.. code-block:: shell
|
| 45 |
+
|
| 46 |
+
poetry run waybacktweets [SUBCOMMANDS]
|
| 47 |
+
|
| 48 |
+
Run the Streamlit App:
|
| 49 |
+
|
| 50 |
+
.. code-block:: shell
|
| 51 |
+
|
| 52 |
+
streamlit run app/app.py
|
| 53 |
+
|
| 54 |
+
Build the docs:
|
| 55 |
+
|
| 56 |
+
.. code-block:: shell
|
| 57 |
+
|
| 58 |
+
cd docs
|
| 59 |
+
|
| 60 |
+
.. code-block:: shell
|
| 61 |
+
|
| 62 |
+
make clean html
|
| 63 |
+
|
| 64 |
+
`Read the Poetry CLI documentation <https://python-poetry.org/docs/cli/>`_.
|
docs/make.bat
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@ECHO OFF
|
| 2 |
+
|
| 3 |
+
pushd %~dp0
|
| 4 |
+
|
| 5 |
+
REM Command file for Sphinx documentation
|
| 6 |
+
|
| 7 |
+
if "%SPHINXBUILD%" == "" (
|
| 8 |
+
set SPHINXBUILD=sphinx-build
|
| 9 |
+
)
|
| 10 |
+
set SOURCEDIR=.
|
| 11 |
+
set BUILDDIR=_build
|
| 12 |
+
|
| 13 |
+
%SPHINXBUILD% >NUL 2>NUL
|
| 14 |
+
if errorlevel 9009 (
|
| 15 |
+
echo.
|
| 16 |
+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
| 17 |
+
echo.installed, then set the SPHINXBUILD environment variable to point
|
| 18 |
+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
| 19 |
+
echo.may add the Sphinx directory to PATH.
|
| 20 |
+
echo.
|
| 21 |
+
echo.If you don't have Sphinx installed, grab it from
|
| 22 |
+
echo.https://www.sphinx-doc.org/
|
| 23 |
+
exit /b 1
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
if "%1" == "" goto help
|
| 27 |
+
|
| 28 |
+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
| 29 |
+
goto end
|
| 30 |
+
|
| 31 |
+
:help
|
| 32 |
+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
| 33 |
+
|
| 34 |
+
:end
|
| 35 |
+
popd
|
docs/outputs.rst
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Outputs
|
| 2 |
+
==========
|
| 3 |
+
|
| 4 |
+
It is possible to save the CDX data in three formats. In the command line tool, these three formats are saved automatically.
|
| 5 |
+
|
| 6 |
+
HTML
|
| 7 |
+
--------
|
| 8 |
+
|
| 9 |
+
This format allows for easy viewing of the archived tweets, through the use of the ``iframe`` tag. Each tweet contains four viewing options, which render when clicking on the accordion:
|
| 10 |
+
|
| 11 |
+
- ``archived_tweet_url``: (`str`) The archived URL.
|
| 12 |
+
|
| 13 |
+
- ``parsed_archived_tweet_url``: (`str`) The archived URL after parsing. It is not guaranteed that this option will be archived, it is just a facilitator, as the originally archived URL does not always exist, due to changes in URLs and web services of the social network Twitter. Check the :ref:`utils`.
|
| 14 |
+
|
| 15 |
+
- ``original_tweet_url``: (`str`) The original tweet URL.
|
| 16 |
+
|
| 17 |
+
- ``parsed_tweet_url``: (`str`) The original tweet URL after parsing. Old URLs were archived in a nested manner. The parsing applied here unnests these URLs, when necessary. Check the :ref:`utils`.
|
| 18 |
+
|
| 19 |
+
Additionally, other fields are displayed.
|
| 20 |
+
|
| 21 |
+
CSV
|
| 22 |
+
--------
|
| 23 |
+
|
| 24 |
+
Option to analyze the CDX data in comma-separated values.
|
| 25 |
+
|
| 26 |
+
JSON
|
| 27 |
+
--------
|
| 28 |
+
|
| 29 |
+
Option to analyze the data in JavaScript Object Notation.
|
docs/quickstart.rst
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Quickstart
|
| 2 |
+
================
|
| 3 |
+
|
| 4 |
+
CLI
|
| 5 |
+
-------------
|
| 6 |
+
|
| 7 |
+
Using Wayback Tweets as a standalone command line tool.
|
| 8 |
+
|
| 9 |
+
waybacktweets [OPTIONS] USERNAME
|
| 10 |
+
|
| 11 |
+
.. code-block:: shell
|
| 12 |
+
|
| 13 |
+
waybacktweets --from 20150101 --to 20191231 --limit 250 jack
|
| 14 |
+
|
| 15 |
+
Web App
|
| 16 |
+
-------------
|
| 17 |
+
|
| 18 |
+
Using Wayback Tweets as a Streamlit Web App.
|
| 19 |
+
|
| 20 |
+
`Open the application <https://waybacktweets.streamlit.app>`_, a prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud.
|
| 21 |
+
|
| 22 |
+
Module
|
| 23 |
+
-------------
|
| 24 |
+
|
| 25 |
+
Using Wayback Tweets as a Python Module.
|
| 26 |
+
|
| 27 |
+
.. code-block:: python
|
| 28 |
+
|
| 29 |
+
from waybacktweets import WaybackTweets, TweetsParser, TweetsExporter
|
| 30 |
+
|
| 31 |
+
USERNAME = "jack"
|
| 32 |
+
|
| 33 |
+
api = WaybackTweets(USERNAME)
|
| 34 |
+
archived_tweets = api.get()
|
| 35 |
+
|
| 36 |
+
if archived_tweets:
|
| 37 |
+
field_options = [
|
| 38 |
+
"archived_timestamp",
|
| 39 |
+
"original_tweet_url",
|
| 40 |
+
"archived_tweet_url",
|
| 41 |
+
"archived_statuscode",
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
parser = TweetsParser(archived_tweets, USERNAME, field_options)
|
| 45 |
+
parsed_tweets = parser.parse()
|
| 46 |
+
|
| 47 |
+
exporter = TweetsExporter(parsed_tweets, USERNAME, field_options)
|
| 48 |
+
exporter.save_to_csv()
|
docs/streamlit.rst
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Web App
|
| 2 |
+
=========
|
| 3 |
+
|
| 4 |
+
The application is a prototype hosted on Streamlit Cloud, serving as an alternative to the command line tool.
|
| 5 |
+
|
| 6 |
+
`Open the application <https://waybacktweets.streamlit.app>`_.
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
Filters
|
| 10 |
+
----------
|
| 11 |
+
|
| 12 |
+
- Filtering by date range: Using the ``from`` and ``to`` filters
|
| 13 |
+
|
| 14 |
+
- Limit: Query result limits.
|
| 15 |
+
|
| 16 |
+
- Offset: Allows for a simple way to scroll through the results.
|
| 17 |
+
|
| 18 |
+
- Only unique Wayback Machine URLs: Filtering by the collapse option using the ``urlkey`` field and the URL Match Scope ``prefix``
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
Username Query Parameter
|
| 22 |
+
--------------------------
|
| 23 |
+
|
| 24 |
+
An alternative way to access the application is by using the ``username`` query parameter. This allows for automatic configuration of the Username input and automatically searches. Additionally, when the ``username`` parameter is sent, the accordion with the filters will already be open.
|
| 25 |
+
|
| 26 |
+
Example URL format:
|
| 27 |
+
|
| 28 |
+
``https://waybacktweets.streamlit.app?username=<USERNAME>``
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
Community Comments
|
| 32 |
+
--------------------
|
| 33 |
+
|
| 34 |
+
.. raw:: html
|
| 35 |
+
|
| 36 |
+
<ul>
|
| 37 |
+
<li>"We're always delighted when we see our community members create tools for open source research." <a href="https://twitter.com/bellingcat/status/1728085974138122604" target="_blank">Bellingcat</a></li>
|
| 38 |
+
<br>
|
| 39 |
+
<li>"#myOSINTtip Clarissa Mendes launched a new tool for accessing old tweets via archive.org called the Wayback Tweets app. For those who love to look deeper at #osint tools, it is available on GitHub and uses the Wayback CDX Server API server (which is a hidden gem for accessing archive.org data!)" <a href="https://www.linkedin.com/posts/my-osint-training_myosinttip-osint-activity-7148425933324963841-0Q2n/" target="_blank">My OSINT Training</a></li>
|
| 40 |
+
<br>
|
| 41 |
+
<li>"Original way to find deleted tweets." <a href="https://twitter.com/henkvaness/status/1693298101765701676" target="_blank">Henk Van Ess</a></li>
|
| 42 |
+
<br>
|
| 43 |
+
<li>"This is an excellent tool to use now that most Twitter API-based tools have gone down with changes to the pricing structure over at X." <a href="https://osintnewsletter.com/p/22#%C2%A7osint-community" target="_blank">The OSINT Newsletter - Issue #22</a></li>
|
| 44 |
+
<br>
|
| 45 |
+
<li>"One of the keys to using the Wayback Machine effectively is knowing what it can and can't archive. It can, and has, archived many, many Twitter accounts... Utilize fun tools such as Wayback Tweets to do so more effectively." <a href="https://memeticwarfareweekly.substack.com/p/mww-paradise-by-the-telegram-dashboard" target="_blank">Ari Ben Am</a></li>
|
| 46 |
+
<br>
|
| 47 |
+
<li>"Want to see archived tweets on Wayback Machine in bulk? You can use Wayback Tweets." <a href="https://twitter.com/DailyOsint/status/1695065018662855102" target="_blank">Daily OSINT</a></li>
|
| 48 |
+
<br>
|
| 49 |
+
<li>"Untuk mempermudah penelusuran arsip, gunakan Wayback Tweets." <a href="https://twitter.com/gijnIndonesia/status/1685912219408805888" target="_blank">GIJN Indonesia</a></li>
|
| 50 |
+
<br>
|
| 51 |
+
<li>"A tool to quickly view tweets saved on archive.org." <a href="https://irinatechtips.substack.com/p/irina_tech_tips-newsletter-3-2023#%C2%A7wayback-tweets" target="_blank">Irina_Tech_Tips Newsletter #3</a></li>
|
| 52 |
+
<br>
|
| 53 |
+
</ul>
|
| 54 |
+
|
| 55 |
+
Legacy App
|
| 56 |
+
-------------
|
| 57 |
+
|
| 58 |
+
To access the legacy version of Wayback Tweets `click here <https://waybacktweets-legacy.streamlit.app>`_.
|
| 59 |
+
|
| 60 |
+
.. note::
|
| 61 |
+
|
| 62 |
+
If the application is down, please check the `Streamlit Cloud Status <https://www.streamlitstatus.com/>`_.
|
| 63 |
+
|
docs/todo.rst
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
TODO
|
| 2 |
+
================
|
| 3 |
+
|
| 4 |
+
.. |uncheck| raw:: html
|
| 5 |
+
|
| 6 |
+
<input type="checkbox">
|
| 7 |
+
|
| 8 |
+
|uncheck| Unit Tests
|
| 9 |
+
|
| 10 |
+
|uncheck| JSON Parser: Create a separate function to handle JSON return, apply JsonParser (``waybacktweets/api/parse.py:110``), and avoid rate limiting
|
| 11 |
+
|
| 12 |
+
|uncheck| Download images when tweet URL has extensions like JPG or PNG
|
| 13 |
+
|
| 14 |
+
|uncheck| Implement logging system (remove print statements)
|
| 15 |
+
|
| 16 |
+
|uncheck| Mapping and parsing of other Twitter-related URLs
|
| 17 |
+
|
| 18 |
+
|uncheck| Develop a scraper to download snapshots from https://archive.today
|
docs/workflow.rst
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.. _flowchart:
|
| 2 |
+
|
| 3 |
+
Workflow
|
| 4 |
+
================
|
| 5 |
+
|
| 6 |
+
The tool was written following a proposal not only to retrieve data from archived tweets, but also to facilitate the reading of these tweets. Therefore, a flow is defined to obtain these results in the best possible way.
|
| 7 |
+
|
| 8 |
+
Due to limitations of the Wayback CDX Server API, it is not always possible to parse the results with the mimetype ``application/json``, regardless, the data in CDX format are saved.
|
| 9 |
+
|
| 10 |
+
Use the mouse to zoom in and out the flowchart.
|
| 11 |
+
|
| 12 |
+
.. mermaid::
|
| 13 |
+
:zoom:
|
| 14 |
+
:align: center
|
| 15 |
+
|
| 16 |
+
flowchart TB
|
| 17 |
+
A[input Username]--> B[(Wayback Machine)]
|
| 18 |
+
B--> B1[save Archived Tweets CDX data]
|
| 19 |
+
B1--> |parsing| C{embed Tweet URL\nvia Twitter Publisher}
|
| 20 |
+
C--> |2xx/3xx| D[return Tweet text]
|
| 21 |
+
C--> |4xx| E[return None]
|
| 22 |
+
E--> F{request Archived\nTweet URL}
|
| 23 |
+
F--> |4xx| G[return Only CDX data]
|
| 24 |
+
F--> |TODO: 2xx/3xx: application/json| J[return JSON text]
|
| 25 |
+
F--> |2xx/3xx: text/html, warc/revisit, unk| K[return HTML iframe tag]
|
legacy_app/legacy_app.py
ADDED
|
@@ -0,0 +1,525 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import re
|
| 3 |
+
from urllib.parse import unquote
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import streamlit.components.v1 as components
|
| 8 |
+
|
| 9 |
+
year = datetime.datetime.now().year
|
| 10 |
+
|
| 11 |
+
st.set_page_config(
|
| 12 |
+
page_title="Wayback Tweets",
|
| 13 |
+
page_icon="🏛️",
|
| 14 |
+
layout="centered",
|
| 15 |
+
menu_items={
|
| 16 |
+
"About": """
|
| 17 |
+
## 🏛️ Wayback Tweets
|
| 18 |
+
|
| 19 |
+
Tool that displays, via Wayback CDX Server API, multiple archived tweets on Wayback Machine to avoid opening each link manually. Users can apply filters based on specific years and view tweets that do not have the original URL available.
|
| 20 |
+
|
| 21 |
+
This tool is a prototype, please feel free to send your [feedbacks](https://github.com/claromes/waybacktweets/issues). Created by [@claromes](https://claromes.com).
|
| 22 |
+
|
| 23 |
+
-------
|
| 24 |
+
""", # noqa: E501
|
| 25 |
+
},
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
# https://discuss.streamlit.io/t/remove-hide-running-man-animation-on-top-of-page/21773/3
|
| 29 |
+
hide_streamlit_style = """
|
| 30 |
+
<style>
|
| 31 |
+
header[data-testid="stHeader"] {
|
| 32 |
+
opacity: 0.5;
|
| 33 |
+
}
|
| 34 |
+
iframe {
|
| 35 |
+
border: 1px solid #dddddd;
|
| 36 |
+
border-radius: 0.5rem;
|
| 37 |
+
}
|
| 38 |
+
div[data-testid="InputInstructions"] {
|
| 39 |
+
visibility: hidden;
|
| 40 |
+
}
|
| 41 |
+
</style>
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
| 45 |
+
|
| 46 |
+
if "current_handle" not in st.session_state:
|
| 47 |
+
st.session_state.current_handle = ""
|
| 48 |
+
|
| 49 |
+
if "prev_disabled" not in st.session_state:
|
| 50 |
+
st.session_state.prev_disabled = False
|
| 51 |
+
|
| 52 |
+
if "next_disabled" not in st.session_state:
|
| 53 |
+
st.session_state.next_disabled = False
|
| 54 |
+
|
| 55 |
+
if "next_button" not in st.session_state:
|
| 56 |
+
st.session_state.next_button = False
|
| 57 |
+
|
| 58 |
+
if "prev_button" not in st.session_state:
|
| 59 |
+
st.session_state.prev_button = False
|
| 60 |
+
|
| 61 |
+
if "update_component" not in st.session_state:
|
| 62 |
+
st.session_state.update_component = 0
|
| 63 |
+
|
| 64 |
+
if "offset" not in st.session_state:
|
| 65 |
+
st.session_state.offset = 0
|
| 66 |
+
|
| 67 |
+
if "saved_at" not in st.session_state:
|
| 68 |
+
st.session_state.saved_at = (2006, year)
|
| 69 |
+
|
| 70 |
+
if "count" not in st.session_state:
|
| 71 |
+
st.session_state.count = False
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def scroll_into_view():
|
| 75 |
+
js = f"""
|
| 76 |
+
<script>
|
| 77 |
+
window.parent.document.querySelector('section.main').scrollTo(0, 0);
|
| 78 |
+
let update_component = {st.session_state.update_component} // Force component update to generate scroll
|
| 79 |
+
</script>
|
| 80 |
+
""" # noqa: E501
|
| 81 |
+
|
| 82 |
+
components.html(js, width=0, height=0)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def clean_tweet(tweet):
|
| 86 |
+
handle = st.session_state.current_handle.lower()
|
| 87 |
+
tweet_lower = tweet.lower()
|
| 88 |
+
|
| 89 |
+
pattern = re.compile(r"/status/(\d+)")
|
| 90 |
+
match_lower_case = pattern.search(tweet_lower)
|
| 91 |
+
match_original_case = pattern.search(tweet)
|
| 92 |
+
|
| 93 |
+
if match_lower_case and handle in tweet_lower:
|
| 94 |
+
return f"https://twitter.com/{st.session_state.current_handle}/status/{match_original_case.group(1)}" # noqa: E501
|
| 95 |
+
else:
|
| 96 |
+
return tweet
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def clean_link(link):
|
| 100 |
+
handle = st.session_state.current_handle.lower()
|
| 101 |
+
link = link.lower()
|
| 102 |
+
|
| 103 |
+
pattern = re.compile(r"/status/(\d+)")
|
| 104 |
+
match = pattern.search(link)
|
| 105 |
+
|
| 106 |
+
if match and handle in link:
|
| 107 |
+
return f"https://web.archive.org/web/{timestamp[i]}/https://twitter.com/{st.session_state.current_handle}/status/{match.group(1)}" # noqa: E501
|
| 108 |
+
else:
|
| 109 |
+
return link
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def pattern_tweet(tweet):
|
| 113 |
+
# Reply: /status//
|
| 114 |
+
# Link: /status///
|
| 115 |
+
# Twimg: /status/https://pbs
|
| 116 |
+
|
| 117 |
+
pattern = re.compile(r'/status/"([^"]+)"')
|
| 118 |
+
|
| 119 |
+
match = pattern.search(tweet)
|
| 120 |
+
if match:
|
| 121 |
+
return match.group(1).lstrip("/")
|
| 122 |
+
else:
|
| 123 |
+
return tweet
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def pattern_tweet_id(tweet):
    """Normalize a tweet URL to ``https://twitter.com/<user>/status/<id>``.

    Drops any trailing sub-endpoint (``/photo``, ``/likes``,
    ``/retweet``, ...). Inputs that do not look like a status URL are
    returned unchanged.
    """
    # Delete sub-endpoint (/photos, /likes, /retweet...)
    username_match = re.match(r"https://twitter\.com/([^/]+)/status/\d+", tweet)
    id_match = re.search(r"https://twitter.com/\w+/status/(\d+)", tweet)

    if not (id_match and username_match):
        return tweet

    return f"https://twitter.com/{username_match.group(1)}/status/{id_match.group(1)}"
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def check_double_status(url_wb, url_tweet):
    """Detect the malformed "double /status/" capture shape.

    Returns True when the archived URL embeds two ``/status/`` segments
    while the original tweet field does not reference twitter.com — the
    signature of a reply capture that needs special handling downstream.
    """
    return url_wb.count("/status/") == 2 and "twitter.com" not in url_tweet
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def embed(tweet):
    # Asks the Twitter Publish (oEmbed) service whether the tweet is still
    # available and, if so, extracts its text, author info and whether it
    # is a retweet.
    #
    # Returns:
    #     (status_code, tweet_content, user_info, is_RT) on a 200/302
    #     response, False on any other HTTP status, and implicitly None
    #     after a handled network error.
    try:
        url = f"https://publish.twitter.com/oembed?url={clean_tweet(tweet)}"
        response = requests.get(url)

        regex = r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>'  # noqa: E501
        regex_author = r"^(.*?)\s*\("

        if response.status_code == 200 or response.status_code == 302:
            status_code = response.status_code
            html = response.json()["html"]
            author_name = response.json()["author_name"]

            matches_html = re.findall(regex, html, re.DOTALL)

            tweet_content = []
            user_info = []
            is_RT = []

            for match in matches_html:
                # Strip anchor tags from the tweet body; keep line breaks.
                tweet_content_match = re.sub(r"<a[^>]*>|<\/a>", "", match[0].strip())
                tweet_content_match = tweet_content_match.replace("<br>", "\n")

                user_info_match = re.sub(r"<a[^>]*>|<\/a>", "", match[1].strip())
                user_info_match = user_info_match.replace(")", "), ")

                match_author = re.search(regex_author, user_info_match)
                # Fix: re.search can return None when the "Name (@handle)"
                # shape is absent; guard instead of crashing with
                # AttributeError (same fix as the packaged TwitterEmbed).
                author_tweet = match_author.group(1) if match_author else ""

                if tweet_content_match:
                    tweet_content.append(tweet_content_match)
                if user_info_match:
                    user_info.append(user_info_match)

                # A differing author marks the capture as a retweet.
                is_RT_match = False
                if author_name != author_tweet:
                    is_RT_match = True

                is_RT.append(is_RT_match)

            return status_code, tweet_content, user_info, is_RT
        else:
            return False
    except requests.exceptions.Timeout:
        st.error("Connection to web.archive.org timed out.")
    except requests.exceptions.ConnectionError:
        st.error("Failed to establish a new connection with web.archive.org.")
    except UnboundLocalError:
        st.empty()
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
@st.cache_data(ttl=1800, show_spinner=False)
def tweets_count(handle, saved_at):
    # Counts archived captures for @handle via the Wayback CDX API,
    # collapsed to one capture per day (timestamp:8). Cached for 30 min.
    #
    # Returns the number of data rows (header row excluded), 0 when the
    # payload has no rows, and implicitly None on a non-200 status or
    # after a handled network error.
    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&from={saved_at[0]}&to={saved_at[1]}"  # noqa: E501
    try:
        response = requests.get(url)

        if response.status_code == 200:
            data = response.json()
            # The first row of the CDX JSON output is the column header.
            if data and len(data) > 1:
                total_tweets = len(data) - 1
                return total_tweets
            else:
                return 0
    except requests.exceptions.Timeout:
        st.error("Connection to web.archive.org timed out.")
        st.stop()
    except requests.exceptions.ConnectionError:
        st.error("Failed to establish a new connection with web.archive.org.")
    except UnboundLocalError:
        st.empty()
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
@st.cache_data(ttl=1800, show_spinner=False)
def query_api(handle, limit, offset, saved_at):
    # Fetches one page of archived capture rows for @handle from the
    # Wayback CDX API (JSON output, one capture per day). Cached 30 min.
    #
    # Halts the Streamlit script when no username was given; on a 4xx/5xx
    # response shows a "Temporarily Offline" message and stops.
    if not handle:
        st.warning("username, please!")
        st.stop()

    url = f"https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{handle}/status/*&collapse=timestamp:8&output=json&limit={limit}&offset={offset}&from={saved_at[0]}&to={saved_at[1]}"  # noqa: E501
    try:
        response = requests.get(url)
        response.raise_for_status()  # raises HTTPError for 4xx/5xx statuses

        if response.status_code == 200 or response.status_code == 304:
            return response.json()
    except requests.exceptions.Timeout:
        st.error("Connection to web.archive.org timed out.")
    except requests.exceptions.ConnectionError:
        st.error("Failed to establish a new connection with web.archive.org.")
    except UnboundLocalError:
        st.empty()
    except requests.exceptions.HTTPError:
        st.error(
            """
            **Temporarily Offline**

            Internet Archive services are temporarily offline. Please check Internet Archive [Twitter feed](https://twitter.com/internetarchive/) for the latest information.
            """  # noqa: E501
        )
        st.stop()
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
@st.cache_data(ttl=1800, show_spinner=False)
def parse_links(links):
    """Split raw CDX rows into parallel lists of per-capture fields.

    Skips the header row, URL-decodes each original URL, strips the
    curly-apostrophe character and the Wayback quoting wrapper, and
    returns (archived urls, original tweet urls, MIME types, timestamps).
    Cached for 30 minutes.
    """
    archived_urls = []
    original_tweets = []
    mimetypes = []
    capture_timestamps = []

    # links[0] is the CDX column-header row; data rows follow.
    for row in links[1:]:
        decoded = unquote(row[2]).replace("’", "")
        tweet_url = pattern_tweet(decoded).strip('"')

        archived_urls.append(f"https://web.archive.org/web/{row[1]}/{decoded}")
        capture_timestamps.append(row[1])
        original_tweets.append(tweet_url)
        mimetypes.append(row[3])

    return archived_urls, original_tweets, mimetypes, capture_timestamps
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def attr(i):
    # Renders the metadata line (archived url, original url, MIME type,
    # capture datetime) for capture number `i`.
    #
    # NOTE(review): reads the module-level globals `tweet_links`, `status`,
    # `link`, `mimetype` and `timestamp` populated by the main query loop —
    # confirm before calling from anywhere else.
    original_tweet = pattern_tweet_id(clean_tweet(tweet_links[i]))

    # Double-/status/ captures and scheme-less entries need rebuilding
    # before they can be linked as the "original url".
    if status:
        original_tweet = pattern_tweet_id(f"https://twitter.com/{tweet_links[i]}")
    elif "://" not in tweet_links[i]:
        original_tweet = pattern_tweet_id(f"https://{tweet_links[i]}")

    st.markdown(
        f'{i+1 + st.session_state.offset}. [**archived url**]({link}) · [**original url**]({original_tweet}) · **MIME Type:** {mimetype[i]} · **Saved at:** {datetime.datetime.strptime(timestamp[i], "%Y%m%d%H%M%S")}'  # noqa: E501
    )
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def display_tweet():
    # Renders a tweet that is still available on twitter.com, using the
    # content previously extracted by embed().
    #
    # NOTE(review): reads the module-level globals `mimetype`, `i`,
    # `is_RT`, `tweet_content` and `user_info` set by the main query
    # loop — confirm before reusing elsewhere.
    if (
        mimetype[i] == "application/json"
        or mimetype[i] == "text/html"
        or mimetype[i] == "unk"
        or mimetype[i] == "warc/revisit"
    ):
        if is_RT[0] is True:
            st.info("*Retweet*")
        st.write(tweet_content[0])
        st.write(f"**{user_info[0]}**")

        st.divider()
    else:
        st.warning("MIME Type was not parsed.")

        st.divider()
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def display_not_tweet():
    # Renders a capture whose tweet is no longer available on twitter.com:
    # shows the archived page in an iframe, or the archived JSON payload,
    # depending on the capture's MIME type.
    #
    # NOTE(review): reads the module-level globals `tweet_links`, `status`,
    # `link`, `mimetype` and `i` set by the main query loop — confirm
    # before reusing elsewhere.
    original_link = pattern_tweet_id(clean_tweet(tweet_links[i]))

    # Rebuild double-/status/ captures and scheme-less entries first.
    if status:
        original_link = pattern_tweet_id(f"https://twitter.com/{tweet_links[i]}")
    elif "://" not in tweet_links[i]:
        original_link = pattern_tweet_id(f"https://{tweet_links[i]}")

    response_html = requests.get(original_link)

    if (
        mimetype[i] == "text/html"
        or mimetype[i] == "warc/revisit"
        or mimetype[i] == "unk"
    ):
        # Direct image captures can be iframed as-is when still reachable.
        if (
            ".jpg" in tweet_links[i] or ".png" in tweet_links[i]
        ) and response_html.status_code == 200:
            components.iframe(tweet_links[i], height=500, scrolling=True)
        elif "/status/" not in original_link:
            st.info("This isn't a status or is not available")
        elif status or f"{st.session_state.current_handle}" not in original_link:
            st.info(f"Replying to {st.session_state.current_handle}")
        else:
            components.iframe(clean_link(link), height=500, scrolling=True)

        st.divider()
    elif mimetype[i] == "application/json":
        try:
            response_json = requests.get(link)

            if response_json.status_code == 200:
                json_data = response_json.json()

                # The archived JSON layout varies: the tweet text may sit
                # under data.text, data, text, or be the whole payload.
                if "data" in json_data:
                    if "text" in json_data["data"]:
                        json_text = json_data["data"]["text"]
                    else:
                        json_text = json_data["data"]
                else:
                    if "text" in json_data:
                        json_text = json_data["text"]
                    else:
                        json_text = json_data

                st.code(json_text)
                st.json(json_data, expanded=False)

                st.divider()
            else:
                st.error(response_json.status_code)

                st.divider()
        except requests.exceptions.Timeout:
            st.error("Connection to web.archive.org timed out.")
            st.divider()
        except requests.exceptions.ConnectionError:
            st.error("Failed to establish a new connection with web.archive.org.")
            st.divider()
        except UnboundLocalError:
            st.empty()
    else:
        st.warning("MIME Type was not parsed.")
        st.divider()
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def prev_page():
    # Pagination callback: step the offset back one page and scroll to top.
    # NOTE(review): `tweets_per_page` is a module-level name set in the
    # query branch — confirm it is defined before this callback fires.
    st.session_state.offset -= tweets_per_page

    # scroll to top config
    st.session_state.update_component += 1
    scroll_into_view()
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
def next_page():
    # Pagination callback: step the offset forward one page and scroll to top.
    # NOTE(review): `tweets_per_page` is a module-level name set in the
    # query branch — confirm it is defined before this callback fires.
    st.session_state.offset += tweets_per_page

    # scroll to top config
    st.session_state.update_component += 1
    scroll_into_view()
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
# UI
st.title(
    "Wayback Tweets [](https://github.com/claromes/waybacktweets)",  # noqa: E501
    anchor=False,
    help="v0.4.3",
)
st.write(
    "Display multiple archived tweets on Wayback Machine and avoid opening each link manually"  # noqa: E501
)

# --- Query form -------------------------------------------------------------
handle = st.text_input("Username", placeholder="jack")

st.session_state.saved_at = st.slider("Tweets saved between", 2006, year, (2006, year))

not_available = st.checkbox(
    "Original URLs not available",
    help="Due to changes in X, it is possible to find available tweets if you are logged into X",  # noqa: E501
)

query = st.button("Query", type="primary", use_container_width=True)

# Switching to a different username resets the pagination offset.
if handle != st.session_state.current_handle:
    st.session_state.current_handle = handle
    st.session_state.offset = 0

# Rerun the results section both on an explicit query and on reruns
# triggered by pagination (st.session_state.count stays truthy).
if query or st.session_state.count:
    tweets_per_page = 25

    st.session_state.count = tweets_count(handle, st.session_state.saved_at)

    st.caption(
        "The search optimization uses an 8-digit [collapsing strategy](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md?ref=hackernoon.com#collapsing), refining the captures to one per day. The number of tweets per page is set to 25, and this is a fixed value due to the API rate limit."  # noqa: E501
    )
    st.write(f"**{st.session_state.count} URLs have been captured**")

    if st.session_state.count:
        # Never request more rows than exist.
        if tweets_per_page > st.session_state.count:
            tweets_per_page = st.session_state.count

        try:
            progress = st.empty()
            links = query_api(
                handle, tweets_per_page, st.session_state.offset, st.session_state.saved_at
            )

            # Parallel lists: archived url, original url, MIME type, timestamp.
            parse = parse_links(links)
            parsed_links = parse[0]
            tweet_links = parse[1]
            mimetype = parse[2]
            timestamp = parse[3]

            if links:
                st.divider()

                st.session_state.current_handle = handle

                return_none_count = 0

                start_index = st.session_state.offset
                end_index = min(st.session_state.count, start_index + tweets_per_page)

                with st.spinner("Fetching..."):
                    # Render each capture on the current page; globals set
                    # here (link, tweet, status, i, ...) feed attr(),
                    # display_tweet() and display_not_tweet().
                    for i in range(tweets_per_page):
                        try:
                            if tweet_links[i]:
                                link = parsed_links[i]
                                tweet = embed(tweet_links[i])

                                status = check_double_status(link, tweet_links[i])

                                if not not_available:
                                    attr(i)

                                    if tweet:
                                        status_code = tweet[0]
                                        tweet_content = tweet[1]
                                        user_info = tweet[2]
                                        is_RT = tweet[3]

                                        display_tweet()
                                    elif not tweet:
                                        display_not_tweet()

                                # "Original URLs not available" mode lists
                                # only captures whose tweet was deleted.
                                if not_available:
                                    if not tweet:
                                        return_none_count += 1
                                        attr(i)

                                        display_not_tweet()

                                    progress.write(
                                        f"{return_none_count} URLs have been captured in the range {start_index}-{end_index}"  # noqa: E501
                                    )

                            # Keep prev/next button enabled state in sync.
                            if start_index <= 0:
                                st.session_state.prev_disabled = True
                            else:
                                st.session_state.prev_disabled = False

                            if i + 1 == st.session_state.count:
                                st.session_state.next_disabled = True
                            else:
                                st.session_state.next_disabled = False
                        except IndexError:
                            # Ran past the last capture: disable "Next".
                            if start_index <= 0:
                                st.session_state.prev_disabled = True
                            else:
                                st.session_state.prev_disabled = False

                            st.session_state.next_disabled = True

                prev, _, next = st.columns([3, 4, 3])

                prev.button(
                    "Previous",
                    disabled=st.session_state.prev_disabled,
                    key="prev_button_key",
                    on_click=prev_page,
                    type="primary",
                    use_container_width=True,
                )
                next.button(
                    "Next",
                    disabled=st.session_state.next_disabled,
                    key="next_button_key",
                    on_click=next_page,
                    type="primary",
                    use_container_width=True,
                )

            if not links:
                st.error("Unable to query the Wayback Machine API.")
        except TypeError as e:
            st.error(
                f"""
                {e}. Refresh this page and try again.
                """  # noqa: E501
            )
            st.session_state.offset = 0
|
legacy_app/requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
requests==2.30.0
|
| 2 |
+
streamlit==1.27.0
|
waybacktweets/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# flake8: noqa: F401
|
| 2 |
+
|
| 3 |
+
from waybacktweets.api.export import TweetsExporter
|
| 4 |
+
from waybacktweets.api.parse import JsonParser, TweetsParser, TwitterEmbed
|
| 5 |
+
from waybacktweets.api.request import WaybackTweets
|
| 6 |
+
from waybacktweets.api.visualize import HTMLTweetsVisualizer
|
waybacktweets/_cli.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CLI functions for retrieving archived tweets.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from typing import Any, Optional
|
| 7 |
+
|
| 8 |
+
import click
|
| 9 |
+
from rich import print as rprint
|
| 10 |
+
|
| 11 |
+
from waybacktweets.api.export import TweetsExporter
|
| 12 |
+
from waybacktweets.api.parse import TweetsParser
|
| 13 |
+
from waybacktweets.api.request import WaybackTweets
|
| 14 |
+
from waybacktweets.config.config import config
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _parse_date(
|
| 18 |
+
ctx: Optional[Any] = None, param: Optional[Any] = None, value: Optional[str] = None
|
| 19 |
+
) -> Optional[str]:
|
| 20 |
+
"""
|
| 21 |
+
Parses a date string and returns it in the format "YYYYMMDD".
|
| 22 |
+
|
| 23 |
+
Args:
|
| 24 |
+
ctx: Necessary when used with the click package. Defaults to None.
|
| 25 |
+
param: Necessary when used with the click package. Defaults to None.
|
| 26 |
+
value: A date string in the "YYYYMMDD" format. Defaults to None.
|
| 27 |
+
|
| 28 |
+
Returns:
|
| 29 |
+
The input date string formatted in the "YYYYMMDD" format, or None if no date string was provided.
|
| 30 |
+
""" # noqa: E501
|
| 31 |
+
try:
|
| 32 |
+
if value is None:
|
| 33 |
+
return None
|
| 34 |
+
|
| 35 |
+
date = datetime.strptime(value, "%Y%m%d")
|
| 36 |
+
|
| 37 |
+
return date.strftime("%Y%m%d")
|
| 38 |
+
except ValueError:
|
| 39 |
+
raise click.BadParameter("Date must be in format YYYYmmdd")
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@click.command()
@click.argument("username", type=str)
@click.option(
    "-c",
    "--collapse",
    type=click.Choice(["urlkey", "digest", "timestamp:XX"], case_sensitive=False),
    default=None,
    help="Collapse results based on a field, or a substring of a field. XX in the timestamp value ranges from 1 to 14, comparing the first XX digits of the timestamp field. It is recommended to use from 4 onwards, to compare at least by years.",  # noqa: E501
)
@click.option(
    "-f",
    "--from",
    "timestamp_from",
    type=click.UNPROCESSED,
    metavar="DATE",
    callback=_parse_date,
    default=None,
    help="Filtering by date range from this date. Format: YYYYmmdd",
)
@click.option(
    "-t",
    "--to",
    "timestamp_to",
    type=click.UNPROCESSED,
    metavar="DATE",
    callback=_parse_date,
    default=None,
    help="Filtering by date range up to this date. Format: YYYYmmdd",
)
@click.option(
    "-l",
    "--limit",
    type=int,
    metavar="INTEGER",
    default=None,
    help="Query result limits.",
)
@click.option(
    "-o",
    "--offset",
    type=int,
    metavar="INTEGER",
    default=None,
    help="Allows for a simple way to scroll through the results.",
)
@click.option(
    "-mt",
    "--matchtype",
    type=click.Choice(["exact", "prefix", "host", "domain"], case_sensitive=False),
    default=None,
    help="Results matching a certain prefix, a certain host or all subdomains.",  # noqa: E501
)
@click.option(
    "-v",
    "--verbose",
    "verbose",
    is_flag=True,
    default=False,
    help="Shows the error log.",
)
def main(
    username: str,
    collapse: Optional[str],
    timestamp_from: Optional[str],
    timestamp_to: Optional[str],
    limit: Optional[int],
    offset: Optional[int],
    matchtype: Optional[str],
    verbose: Optional[bool],
) -> None:
    """
    Retrieves archived tweets CDX data from the Wayback Machine, performs necessary parsing, and saves the data.

    USERNAME: The Twitter username without @.
    """  # noqa: E501
    try:
        # Propagate the CLI verbosity flag to the shared runtime config.
        config.verbose = verbose

        api = WaybackTweets(
            username, collapse, timestamp_from, timestamp_to, limit, offset, matchtype
        )

        print(f"Waybacking @{username}'s archived tweets...")
        archived_tweets = api.get()

        if archived_tweets:
            # Columns exported to CSV/JSON/HTML, in output order.
            field_options = [
                "archived_urlkey",
                "archived_timestamp",
                "parsed_archived_timestamp",
                "archived_tweet_url",
                "parsed_archived_tweet_url",
                "original_tweet_url",
                "parsed_tweet_url",
                "available_tweet_text",
                "available_tweet_is_RT",
                "available_tweet_info",
                "archived_mimetype",
                "archived_statuscode",
                "archived_digest",
                "archived_length",
            ]

            parser = TweetsParser(archived_tweets, username, field_options)
            parsed_tweets = parser.parse(print_progress=True)

            exporter = TweetsExporter(parsed_tweets, username, field_options)

            exporter.save_to_csv()
            exporter.save_to_json()
            exporter.save_to_html()
    except Exception as e:
        # Top-level CLI boundary: report the error in red and keep going
        # to the finally-footer.
        rprint(f"[red]{e}")
    finally:
        rprint(
            "[yellow]\nNeed help? Read the docs: https://claromes.github.io/waybacktweets"  # noqa: E501
        )
|
waybacktweets/api/__init__.py
ADDED
|
File without changes
|
waybacktweets/api/export.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Exports the parsed archived tweets.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import datetime
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from typing import Any, Dict, List, Optional
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
from waybacktweets.api.visualize import HTMLTweetsVisualizer
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TweetsExporter:
    """
    Class responsible for exporting parsed archived tweets.

    Args:
        data (Dict[str, List[Any]]): The parsed archived tweets data.
        username (str): The username associated with the tweets.
        field_options (List[str]): The fields to be included in the exported data. For more details on each option, visit :ref:`field_options`.
    """  # noqa: E501

    def __init__(
        self, data: Dict[str, List[Any]], username: str, field_options: List[str]
    ):
        self.data = data
        self.username = username
        self.field_options = field_options
        self.formatted_datetime = self._datetime_now()
        # All export formats share one basename: <username>_tweets_<timestamp>
        self.filename = f"{self.username}_tweets_{self.formatted_datetime}"
        self.dataframe = self._create_dataframe()

    @staticmethod
    def _datetime_now() -> str:
        """
        Returns the current datetime, formatted as a string.

        Returns:
            The current datetime in "YYYYmmddHHMMSS" form, stripped of any
            non-alphanumeric characters.
        """
        now = datetime.datetime.now()
        formatted_now = now.strftime("%Y%m%d%H%M%S")
        formatted_now = re.sub(r"\W+", "", formatted_now)

        return formatted_now

    @staticmethod
    def _transpose_matrix(
        data: Dict[str, List[Any]], fill_value: Optional[Any] = None
    ) -> List[List[Any]]:
        """
        Transposes a matrix, filling in missing values with a specified fill value if needed.

        Args:
            data (Dict[str, List[Any]]): The matrix to be transposed.
            fill_value (Optional[Any]): The value to fill in missing values with.

        Returns:
            The transposed matrix. An empty ``data`` dict yields an empty list.
        """  # noqa: E501
        # default=0 keeps an empty dict from raising ValueError in max().
        max_length = max((len(sublist) for sublist in data.values()), default=0)

        # Pad shorter columns so every transposed row is complete.
        filled_data = {
            key: value + [fill_value] * (max_length - len(value))
            for key, value in data.items()
        }

        data_transposed = [list(row) for row in zip(*filled_data.values())]

        return data_transposed

    def _create_dataframe(self) -> pd.DataFrame:
        """
        Creates a DataFrame from the transposed data.

        Returns:
            The DataFrame representation of the data.
        """
        data_transposed = self._transpose_matrix(self.data)

        df = pd.DataFrame(data_transposed, columns=self.field_options)

        return df

    def save_to_csv(self) -> None:
        """
        Saves the DataFrame to a CSV file.
        """
        csv_file_path = f"{self.filename}.csv"
        self.dataframe.to_csv(csv_file_path, index=False)

        print(f"Saved to {csv_file_path}")

    def save_to_json(self) -> None:
        """
        Saves the DataFrame to a JSON file.
        """
        json_path = f"{self.filename}.json"
        self.dataframe.to_json(json_path, orient="records", lines=False)

        print(f"Saved to {json_path}")

    def save_to_html(self) -> None:
        """
        Saves the DataFrame to an HTML file.

        The HTML visualizer reads the JSON export, so it is generated
        first when missing.
        """
        json_path = f"{self.filename}.json"

        if not os.path.exists(json_path):
            self.save_to_json()

        html_file_path = f"{self.filename}.html"

        html = HTMLTweetsVisualizer(self.username, json_path, html_file_path)

        html_content = html.generate()
        html.save(html_content)

        print(f"Saved to {html_file_path}")
|
waybacktweets/api/parse.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Parses the returned data from the Wayback CDX Server API.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 7 |
+
from contextlib import nullcontext
|
| 8 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
+
from urllib.parse import unquote
|
| 10 |
+
|
| 11 |
+
from rich import print as rprint
|
| 12 |
+
from rich.progress import Progress
|
| 13 |
+
|
| 14 |
+
from waybacktweets.config.config import config
|
| 15 |
+
from waybacktweets.config.field_options import FIELD_OPTIONS
|
| 16 |
+
from waybacktweets.exceptions.exceptions import (
|
| 17 |
+
ConnectionError,
|
| 18 |
+
GetResponseError,
|
| 19 |
+
HTTPError,
|
| 20 |
+
)
|
| 21 |
+
from waybacktweets.utils.utils import (
|
| 22 |
+
check_double_status,
|
| 23 |
+
check_pattern_tweet,
|
| 24 |
+
check_url_scheme,
|
| 25 |
+
clean_tweet_url,
|
| 26 |
+
delete_tweet_pathnames,
|
| 27 |
+
get_response,
|
| 28 |
+
is_tweet_url,
|
| 29 |
+
semicolon_parser,
|
| 30 |
+
timestamp_parser,
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class TwitterEmbed:
    """
    Parses tweets through the Twitter Publish (oEmbed) service.

    Args:
        tweet_url (str): The URL of the tweet to be parsed.
    """

    def __init__(self, tweet_url: str):
        self.tweet_url = tweet_url

    def embed(self) -> Optional[Tuple[List[str], List[bool], List[str]]]:
        """
        Parses the archived tweets when they are still available.

        Each tweet returned by the oEmbed endpoint is inspected and, when present, its text, retweet flag and author info are collected into three parallel lists:

        - The first list contains the tweet texts.
        - The second list contains boolean values indicating whether each tweet is a retweet.
        - The third list contains the user info of the tweets.

        Returns:
            A tuple of three lists containing the tweet texts, retweet flags, and user info, respectively. If the request fails, returns None.
        """  # noqa: E501
        try:
            oembed_url = f"https://publish.twitter.com/oembed?url={self.tweet_url}"
            response = get_response(url=oembed_url)
            if response:
                payload = response.json()
                markup = payload["html"]
                account_name = payload["author_name"]

                blockquote_re = re.compile(
                    r'<blockquote class="twitter-tweet"(?: [^>]+)?><p[^>]*>(.*?)<\/p>.*?— (.*?)<\/a>',  # noqa
                    re.DOTALL,
                )
                # Captures the author display name up to the opening "(".
                author_re = re.compile(r"^(.*?)\s*\(")
                # Strips <a ...> and </a> tags; hoisted so it is compiled once.
                anchor_re = re.compile(r"<a[^>]*>|<\/a>")

                texts: List[str] = []
                infos: List[str] = []
                retweet_flags: List[bool] = []

                for text_html, info_html in blockquote_re.findall(markup):
                    text = anchor_re.sub("", text_html.strip()).replace("<br>", "\n")
                    info = anchor_re.sub("", info_html.strip()).replace(")", "), ")

                    author_match = author_re.search(info)
                    tweet_author = author_match.group(1) if author_match else ""

                    if text:
                        texts.append(text)
                    if info:
                        infos.append(info)
                        # A differing author means the embedded tweet is a retweet.
                        retweet_flags.append(account_name != tweet_author)

                return texts, retweet_flags, infos
        except ConnectionError:
            if config.verbose:
                rprint("[yellow]Error parsing the tweet, but the CDX data was saved.")
        except HTTPError:
            if config.verbose:
                rprint(
                    f"[yellow]{self.tweet_url} not available on the user's Twitter account, but the CDX data was saved."  # noqa: E501
                )
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {str(e)}")

        return None
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class JsonParser:
    """
    This class is responsible for parsing tweets when the mimetype is application/json.

    Note: This class is in an experimental phase.

    Args:
        archived_tweet_url (str): The URL of the archived tweet to be parsed.
    """  # noqa: E501

    def __init__(self, archived_tweet_url: str):
        self.archived_tweet_url = archived_tweet_url

    def parse(self) -> Optional[str]:
        """
        Parses the archived tweets in JSON format.

        Returns:
            The parsed tweet text, or None when the request fails.
            (Annotation fixed: the original declared ``-> str`` but returns
            ``None`` on every error path.)
        """
        try:
            response = get_response(url=self.archived_tweet_url)

            if response:
                json_data = response.json()

                # Newer API payloads nest the tweet under "data".
                if "data" in json_data:
                    return json_data["data"].get("text", json_data["data"])

                # Legacy retweet payloads carry the original under
                # "retweeted_status".
                if "retweeted_status" in json_data:
                    return json_data["retweeted_status"].get(
                        "text", json_data["retweeted_status"]
                    )

                # Fallback: flat payload, or the raw payload when "text" is absent.
                return json_data.get("text", json_data)
        except ConnectionError:
            if config.verbose:
                rprint(
                    f"[yellow]Connection error with {self.archived_tweet_url}. Max retries exceeded. Error parsing the JSON, but the CDX data was saved."  # noqa: E501
                )
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {str(e)}")

        return None
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
class TweetsParser:
    """
    This class is responsible for the overall parsing of archived tweets.

    Args:
        archived_tweets_response (List[str]): The response from the archived tweets.
        username (str): The username associated with the tweets.
        field_options (List[str]): The fields to be included in the parsed data. For more details on each option, visit :ref:`field_options`.
    """  # noqa: E501

    def __init__(
        self,
        archived_tweets_response: List[str],
        username: str,
        field_options: List[str],
    ):
        # Fail fast on unknown field names instead of silently dropping them.
        if not all(option in FIELD_OPTIONS for option in field_options):
            raise ValueError("Some field options are not valid.")

        self.archived_tweets_response = archived_tweets_response
        self.username = username
        self.field_options = field_options
        # Column-oriented result: one list per requested field, filled in
        # lockstep by _add_field.
        self.parsed_tweets = {option: [] for option in self.field_options}

    def _add_field(self, key: str, value: Any) -> None:
        """
        Appends a value to a list in the parsed data structure.

        Args:
            key (str): The key in the parsed data structure.
            value (Any): The value to be appended.
        """
        # Keys the caller did not request are silently ignored.
        if key in self.parsed_tweets:
            self.parsed_tweets[key].append(value)

    def _process_response(self, response: List[str]) -> None:
        """
        Processes the archived tweet's response and adds the relevant CDX data.

        Args:
            response (List[str]): One CDX row. Positional layout (as consumed
                below): 0=urlkey, 1=timestamp, 2=original URL, 3=mimetype,
                4=statuscode, 5=digest, 6=length.
        """
        # Decode percent-escapes and drop typographic apostrophes, which break
        # the downstream URL handling.
        tweet_remove_char = unquote(response[2]).replace("’", "")
        cleaned_tweet = check_pattern_tweet(tweet_remove_char).strip('"')

        wayback_machine_url = (
            f"https://web.archive.org/web/{response[1]}/{tweet_remove_char}"
        )
        original_tweet = delete_tweet_pathnames(
            clean_tweet_url(cleaned_tweet, self.username)
        )

        # Archived URLs can embed a second "/status/" segment; detect that case.
        double_status = check_double_status(wayback_machine_url, original_tweet)

        if double_status:
            original_tweet = delete_tweet_pathnames(
                f"https://twitter.com{original_tweet}"
            )
        elif "://" not in original_tweet:
            # Bare host/path with no scheme: default to https.
            original_tweet = delete_tweet_pathnames(f"https://{original_tweet}")

        parsed_wayback_machine_url = (
            f"https://web.archive.org/web/{response[1]}/{original_tweet}"
        )

        # Percent-encode semicolons and normalize the scheme on every variant.
        encoded_archived_tweet = check_url_scheme(semicolon_parser(wayback_machine_url))
        encoded_parsed_archived_tweet = check_url_scheme(
            semicolon_parser(parsed_wayback_machine_url)
        )
        encoded_tweet = check_url_scheme(semicolon_parser(response[2]))
        encoded_parsed_tweet = check_url_scheme(semicolon_parser(original_tweet))

        available_tweet_text = None
        available_tweet_is_RT = None
        available_tweet_info = None

        is_tweet = is_tweet_url(encoded_tweet)

        if is_tweet:
            # Try the live oEmbed service: the tweet may still be online.
            embed_parser = TwitterEmbed(encoded_tweet)
            content = embed_parser.embed()

            if content:
                # embed() returns ([texts], [is_RT flags], [user infos]);
                # only the first entry of each list is kept here.
                available_tweet_text = semicolon_parser(content[0][0])
                available_tweet_is_RT = content[1][0]
                available_tweet_info = semicolon_parser(content[2][0])

        self._add_field("available_tweet_text", available_tweet_text)
        self._add_field("available_tweet_is_RT", available_tweet_is_RT)
        self._add_field("available_tweet_info", available_tweet_info)

        self._add_field("archived_urlkey", response[0])
        self._add_field("archived_timestamp", response[1])
        self._add_field("parsed_archived_timestamp", timestamp_parser(response[1]))
        self._add_field("archived_tweet_url", encoded_archived_tweet)
        self._add_field("parsed_archived_tweet_url", encoded_parsed_archived_tweet)
        self._add_field("original_tweet_url", encoded_tweet)
        self._add_field("parsed_tweet_url", encoded_parsed_tweet)
        self._add_field("archived_mimetype", response[3])
        self._add_field("archived_statuscode", response[4])
        self._add_field("archived_digest", response[5])
        self._add_field("archived_length", response[6])

    def parse(self, print_progress: bool = False) -> Dict[str, List[Any]]:
        """
        Parses the archived tweets CDX data and structures it.

        Args:
            print_progress (bool): A boolean indicating whether to print progress or not.

        Returns:
            The parsed tweets data.
        """  # noqa: E501
        with ThreadPoolExecutor(max_workers=10) as executor:

            # [1:] skips the first CDX row — presumably the JSON output's
            # column-header row; verify against the request module.
            futures = {
                executor.submit(self._process_response, response): response
                for response in self.archived_tweets_response[1:]
            }

            # nullcontext keeps the with-block shape when no bar is wanted.
            progress_context = Progress() if print_progress else nullcontext()
            with progress_context as progress:
                task = None
                if print_progress:
                    task = progress.add_task(
                        f"Parsing @{self.username}'s archived tweets\n",
                        total=len(futures),
                    )

                for future in as_completed(futures):
                    try:
                        future.result()
                    except Exception as e:
                        # A failed row is reported but does not abort the batch.
                        rprint(f"[red]{e}")

                    if print_progress:
                        progress.update(task, advance=1)

        return self.parsed_tweets
|
waybacktweets/api/request.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Requests data from the Wayback Machine API.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, Optional
|
| 6 |
+
|
| 7 |
+
from rich import print as rprint
|
| 8 |
+
|
| 9 |
+
from waybacktweets.config.config import config
|
| 10 |
+
from waybacktweets.exceptions.exceptions import (
|
| 11 |
+
ConnectionError,
|
| 12 |
+
EmptyResponseError,
|
| 13 |
+
GetResponseError,
|
| 14 |
+
HTTPError,
|
| 15 |
+
ReadTimeoutError,
|
| 16 |
+
)
|
| 17 |
+
from waybacktweets.utils.utils import get_response
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class WaybackTweets:
    """
    Class responsible for requesting data from the Wayback CDX Server API.

    Args:
        username (str): The username associated with the tweets.
        collapse (str, optional): The field to collapse duplicate lines on.
        timestamp_from (str, optional): The timestamp to start retrieving tweets from.
        timestamp_to (str, optional): The timestamp to stop retrieving tweets at.
        limit (int, optional): The maximum number of results to return.
        offset (int, optional): The number of lines to skip in the results.
        matchtype (str, optional): Results matching a certain prefix, a certain host or all subdomains.
    """  # noqa: E501

    def __init__(
        self,
        username: str,
        collapse: str = None,
        timestamp_from: str = None,
        timestamp_to: str = None,
        limit: int = None,
        offset: int = None,
        matchtype: str = None,
    ):
        self.username = username
        self.collapse = collapse
        self.timestamp_from = timestamp_from
        self.timestamp_to = timestamp_to
        self.limit = limit
        self.offset = offset
        self.matchtype = matchtype

    def get(self) -> Optional[Dict[str, Any]]:
        """
        Sends a GET request to the Internet Archive's CDX API to retrieve archived tweets.

        Returns:
            The response from the CDX API in JSON format, if successful. Otherwise, None.
        """  # noqa: E501
        cdx_endpoint = "https://web.archive.org/cdx/search/cdx"

        # When a matchType is given, the CDX API expands the URL itself,
        # so the explicit /* wildcard suffix is dropped.
        wildcard_pathname = "" if self.matchtype else "/*"

        params: Dict[str, Any] = {
            "url": f"https://twitter.com/{self.username}/status{wildcard_pathname}",
            "output": "json",
        }

        # Only forward the optional filters the caller actually set
        # (truthy values, matching the original per-attribute checks).
        optional_params = {
            "collapse": self.collapse,
            "from": self.timestamp_from,
            "to": self.timestamp_to,
            "limit": self.limit,
            "offset": self.offset,
            "matchType": self.matchtype,
        }
        params.update(
            {name: value for name, value in optional_params.items() if value}
        )

        try:
            response = get_response(url=cdx_endpoint, params=params)
            return response.json()
        except ReadTimeoutError:
            if config.verbose:
                rprint("[red]Connection to web.archive.org timed out.")
        except ConnectionError:
            if config.verbose:
                rprint(
                    "[red]Failed to establish a new connection with web.archive.org. Max retries exceeded. Please wait a few minutes and try again."  # noqa: E501
                )
        except HTTPError as e:
            if config.verbose:
                rprint(f"[red]HTTP error occurred: {str(e)}")
        except EmptyResponseError:
            if config.verbose:
                rprint("[red]No data was saved due to an empty response.")
        except GetResponseError as e:
            if config.verbose:
                rprint(f"[red]An error occurred: {str(e)}")

        return None
|
waybacktweets/api/visualize.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# flake8: noqa: E501
|
| 2 |
+
"""
|
| 3 |
+
Generates an HTML file to visualize the parsed data.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from typing import Any, Dict, List, Union
|
| 9 |
+
|
| 10 |
+
from waybacktweets.utils import timestamp_parser
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class HTMLTweetsVisualizer:
    """
    Class responsible for generating an HTML file to visualize the parsed data.

    Args:
        username (str): The username associated with the tweets.
        json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.
        html_file_path (str, optional): The path where the HTML file will be saved.
    """

    def __init__(
        self,
        username: str,
        json_path: Union[str, List[str]],
        html_file_path: str = None,
    ):
        self.username = username
        # Despite the name, this attribute holds the *loaded* tweet dicts,
        # not a path (kept for backward compatibility).
        self.json_path = self._json_loader(json_path)
        self.html_file_path = html_file_path

    @staticmethod
    def _json_loader(json_path: Union[str, List[str]]) -> List[Dict[str, Any]]:
        """
        Reads and loads JSON data from a specified file path or JSON string.

        Args:
            json_path (Union[str, List[str]]): The path of the JSON file or the JSON data itself.

        Returns:
            The content of the JSON file or data.
        """
        if os.path.isfile(json_path):
            with open(json_path, "r", encoding="utf-8") as f:
                return json.load(f)

        # Not an existing file: treat the argument as raw JSON text.
        return json.loads(json_path)

    def generate(self) -> str:
        """
        Generates an HTML string that represents the parsed data.

        Returns:
            The generated HTML string.
        """
        tweets_per_page = 24
        # Ceiling division: the last page may hold fewer than tweets_per_page.
        total_pages = (len(self.json_path) + tweets_per_page - 1) // tweets_per_page

        html = "<!DOCTYPE html>\n"
        html += '<html lang="en">\n'
        html += "<!-- This document was generated by Wayback Tweets. Visit: https://claromes.github.io/waybacktweets -->\n"

        html += "<head>"
        html += '<meta charset="UTF-8">\n'
        html += (
            '<meta name="viewport" content="width=device-width, initial-scale=1.0">\n'
        )
        html += f"<title>@{self.username}'s archived tweets</title>\n"

        # Adds styling
        html += "<style>\n"
        html += "body { font-family: monospace; background-color: whitesmoke; color: #1c1e21; margin: 0; padding: 20px; }\n"
        html += ".container { display: flex; flex-wrap: wrap; gap: 20px; }\n"
        html += ".tweet { flex: 0 1 calc(33.33% - 20px); background-color: #ffffff; border: 1px solid #e2e2e2; border-radius: 10px; padding: 15px; overflow-wrap: break-word; margin: auto; width: 600px; }\n"
        html += ".tweet strong { font-weight: bold; }\n"
        html += ".tweet a { color: #000000; text-decoration: none; }\n"
        html += ".content { color: #000000; }\n"
        html += ".source { font-size: 12px; text-align: center; }\n"
        html += ".tweet a:hover { text-decoration: underline; }\n"
        html += "h1, h3 { text-align: center; }\n"
        html += "iframe { width: 600px; height: 600px; }\n"
        html += "input { position: absolute; opacity: 0; z-index: -1; }\n"
        html += ".accordion { margin: 10px; border-radius: 5px; overflow: hidden; box-shadow: 0 4px 4px -2px rgba(0, 0, 0, 0.4); }\n"
        html += ".accordion-label { display: flex; justify-content: space-between; padding: 1em; font-weight: bold; cursor: pointer; background: #000000; color: #ffffff; }\n"
        html += ".accordion-content { max-height: 0; padding: 0 1em; background: white; transition: all 0.35s; }\n"
        html += (
            "input:checked ~ .accordion-content { max-height: 100vh; padding: 1em; }\n"
        )
        html += ".pagination { text-align: center; margin-top: 20px; }\n"
        html += ".pagination a { margin: 0 5px; text-decoration: none; color: #000000; padding: 1px 2px; border-radius: 5px; }\n"
        html += ".pagination a:hover { background-color: #e2e2e2; }\n"
        html += ".pagination a.selected { background-color: #e2e2e2; color: #000000; font-weight: bold; }\n"
        html += "</style>\n"

        html += "</head>\n<body>\n"

        html += f"<h1>@{self.username}'s archived tweets</h1>\n"

        html += (
            '<p id="loading_first_page">Building pagination with JavaScript...</p>\n'
        )

        for page in range(1, total_pages + 1):
            html += (
                f'<div id="page_{page}" style="display:none;">\n'  # Starts a new page
            )
            html += '<div class="container">\n'

            start_index = (page - 1) * tweets_per_page
            end_index = min(start_index + tweets_per_page, len(self.json_path))

            for index in range(start_index, end_index):
                tweet = self.json_path[index]
                html += '<div class="tweet">\n'

                # No live text available: offer the four URL variants as
                # lazily-loaded iframes behind accordions.
                if not tweet["available_tweet_text"]:
                    iframe_src = {
                        "Archived Tweet": tweet["archived_tweet_url"],
                        "Parsed Archived Tweet": tweet["parsed_archived_tweet_url"],
                        "Original Tweet": tweet["original_tweet_url"],
                        "Parsed Tweet": tweet["parsed_tweet_url"],
                    }

                    for key, value in iframe_src.items():
                        key_cleaned = key.replace(" ", "_")

                        html += '<div class="accordion">\n'
                        html += f'<input type="checkbox" id="tab_{index}_{key_cleaned}" />\n'
                        html += f'<label class="accordion-label" for="tab_{index}_{key_cleaned}">Click to load the iframe from {key}</label>\n'
                        html += '<div class="accordion-content">\n'

                        html += f'<div id="loading_{index}_{key_cleaned}" class="loading">Loading...</div>\n'
                        html += f'<iframe id="iframe_{index}_{key_cleaned}" frameborder="0" scrolling="auto" loading="lazy" style="display: none;" onload="document.getElementById(\'loading_{index}_{key_cleaned}\').style.display=\'none\'; this.style.display=\'block\';"></iframe>\n'
                        html += "</div>\n"
                        html += "</div>\n"

                        html += """
                        <script>
                        // Loads the src attribute of the iframe tag
                        document.getElementById('tab_{index}_{key_cleaned}').addEventListener('change', function() {{
                            if (this.checked) {{
                                document.getElementById('loading_{index}_{key_cleaned}').style.display = 'block';
                                document.getElementById('iframe_{index}_{key_cleaned}').src = '{url}';
                            }}
                        }});
                        </script>
                        """.format(
                            index=index, url=value, key_cleaned=key_cleaned
                        )

                if tweet["available_tweet_text"]:
                    html += "<br>\n"
                    html += f'<p><strong class="content">Available Tweet Content:</strong> {tweet["available_tweet_text"]}</p>\n'
                    html += f'<p><strong class="content">Available Tweet Is Retweet:</strong> {tweet["available_tweet_is_RT"]}</p>\n'
                    html += f'<p><strong class="content">Available Tweet Username:</strong> {tweet["available_tweet_info"]}</p>\n'

                html += "<br>\n"
                html += f'<p><strong>Archived Tweet:</strong> <a href="{tweet["archived_tweet_url"]}" target="_blank">{tweet["archived_tweet_url"]}</a></p>\n'
                html += f'<p><strong>Parsed Archived Tweet:</strong> <a href="{tweet["parsed_archived_tweet_url"]}" target="_blank">{tweet["parsed_archived_tweet_url"]}</a></p>\n'
                html += f'<p><strong>Original Tweet:</strong> <a href="{tweet["original_tweet_url"]}" target="_blank">{tweet["original_tweet_url"]}</a></p>\n'
                html += f'<p><strong>Parsed Tweet:</strong> <a href="{tweet["parsed_tweet_url"]}" target="_blank">{tweet["parsed_tweet_url"]}</a></p>\n'
                html += f'<p><strong>Archived URL Key:</strong> {tweet["archived_urlkey"]}</p>\n'
                html += f'<p><strong>Archived Timestamp:</strong> {timestamp_parser(tweet["archived_timestamp"])} ({tweet["archived_timestamp"]})</p>\n'
                html += f'<p><strong>Archived mimetype:</strong> {tweet["archived_mimetype"]}</p>\n'
                html += f'<p><strong>Archived Statuscode:</strong> {tweet["archived_statuscode"]}</p>\n'
                # Fix: the original omitted the closing </p>, producing
                # malformed HTML for this paragraph.
                html += f'<p><strong>Archived Digest:</strong> {tweet["archived_digest"]}</p>\n'
                html += f'<p><strong>Archived Length:</strong> {tweet["archived_length"]}</p>\n'
                html += "</div>\n"

            html += "</div>\n</div>\n"  # Closes the page div and the container

        html += "<br>\n"

        # Adds navigation for the pages
        html += '<div class="pagination">\n'
        for page in range(1, total_pages + 1):
            html += f'<a href="#" id="page_link_{page}" onclick="showPage({page})">{page}</a>\n'
        html += "</div>\n"

        html += '<br><p class="source">generated by <a href="https://claromes.github.io/waybacktweets/" target="_blank">Wayback Tweets↗</a></p>\n'

        html += """
        <script>
        // Function to show the selected page and hide the others
        function showPage(page) {{
            for (let i = 1; i <= {total_pages}; i++) {{
                document.getElementById('page_' + i).style.display = 'none';
                document.getElementById('page_link_' + i).classList.remove('selected');
            }}

            document.getElementById('page_' + page).style.display = 'block';
            document.getElementById('page_link_' + page).classList.add('selected');
        }}

        // Initializes the page to show only the first page
        document.addEventListener('DOMContentLoaded', (event) => {{
            showPage(1); // Shows only the first page on load
            document.getElementById('loading_first_page').style.display = 'none';
        }});
        </script>
        """.format(
            total_pages=total_pages
        )

        html += "</body>\n"
        html += "</html>"

        return html

    def save(self, html_content: str) -> None:
        """
        Saves the generated HTML string to a file.

        Args:
            html_content (str): The HTML string to be saved.
        """
        with open(self.html_file_path, "w", encoding="utf-8") as f:
            f.write(html_content)
|
waybacktweets/config/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# flake8: noqa: F401
|
| 2 |
+
|
| 3 |
+
from waybacktweets.config.config import config
|
| 4 |
+
from waybacktweets.config.field_options import FIELD_OPTIONS
|
waybacktweets/config/config.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration module.
|
| 3 |
+
|
| 4 |
+
Manages global configuration settings throughout the application.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class _Config:
    """
    A class used to represent the configuration settings.

    Attributes:
        verbose (bool): Determines if verbose logging should be enabled.
    """

    # Controls whether the library prints rich-formatted status and error
    # messages (checked as ``config.verbose`` throughout the package).
    verbose: bool = True


config = _Config()
"""
Global configuration instance.

Attributes:
    verbose (bool): Determines if verbose logging should be enabled.
"""
|
waybacktweets/config/field_options.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
List of valid field options that can be used for parsing tweets.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
FIELD_OPTIONS = [
    # CDX metadata for the archived snapshot.
    "archived_urlkey",
    "archived_timestamp",
    "parsed_archived_timestamp",
    # URL variants: raw and normalized, for both the archive and the original.
    "archived_tweet_url",
    "parsed_archived_tweet_url",
    "original_tweet_url",
    "parsed_tweet_url",
    # Live-tweet data fetched via the oEmbed service, when still available.
    "available_tweet_text",
    "available_tweet_is_RT",
    "available_tweet_info",
    # Remaining CDX columns.
    "archived_mimetype",
    "archived_statuscode",
    "archived_digest",
    "archived_length",
]
|
waybacktweets/exceptions/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# flake8: noqa: F401
|
| 2 |
+
|
| 3 |
+
from waybacktweets.exceptions.exceptions import (
|
| 4 |
+
ConnectionError,
|
| 5 |
+
EmptyResponseError,
|
| 6 |
+
GetResponseError,
|
| 7 |
+
HTTPError,
|
| 8 |
+
ReadTimeoutError,
|
| 9 |
+
)
|
waybacktweets/exceptions/exceptions.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Wayback Tweets Exceptions
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class GetResponseError(Exception):
    """Root of the exception hierarchy raised by ``get_response``."""


class ReadTimeoutError(GetResponseError):
    """Raised when the remote server does not answer before the read timeout."""


class ConnectionError(GetResponseError):
    """Raised when a network connection to the server cannot be established.

    NOTE(review): this name shadows the builtin ``ConnectionError``; modules
    importing it hide the builtin in their namespace.
    """


class HTTPError(GetResponseError):
    """Raised when the server answers with an HTTP error status."""


class EmptyResponseError(GetResponseError):
    """Raised when the server answers successfully but with no content."""
|
waybacktweets/utils/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# flake8: noqa: F401
|
| 2 |
+
|
| 3 |
+
from waybacktweets.utils.utils import (
|
| 4 |
+
check_double_status,
|
| 5 |
+
check_pattern_tweet,
|
| 6 |
+
check_url_scheme,
|
| 7 |
+
clean_tweet_url,
|
| 8 |
+
clean_wayback_machine_url,
|
| 9 |
+
delete_tweet_pathnames,
|
| 10 |
+
get_response,
|
| 11 |
+
is_tweet_url,
|
| 12 |
+
semicolon_parser,
|
| 13 |
+
timestamp_parser,
|
| 14 |
+
)
|
waybacktweets/utils/utils.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility functions for handling HTTP requests and manipulating URLs.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import html
|
| 6 |
+
import re
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from typing import Optional, Tuple
|
| 9 |
+
|
| 10 |
+
import requests
|
| 11 |
+
from requests.adapters import HTTPAdapter
|
| 12 |
+
from urllib3.util.retry import Retry
|
| 13 |
+
|
| 14 |
+
from waybacktweets.exceptions.exceptions import (
|
| 15 |
+
ConnectionError,
|
| 16 |
+
EmptyResponseError,
|
| 17 |
+
GetResponseError,
|
| 18 |
+
HTTPError,
|
| 19 |
+
ReadTimeoutError,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def get_response(
    url: str, params: Optional[dict] = None
) -> Tuple[Optional[requests.Response], Optional[str], Optional[str]]:
    """
    Sends a GET request to the specified URL and returns the response.

    Args:
        url (str): The URL to send the GET request to.
        params (dict, optional): The parameters to include in the GET request.

    Returns:
        The response from the server.

    Raises:
        ReadTimeoutError: If a read timeout occurs.
        ConnectionError: If a connection error occurs.
        HTTPError: If an HTTP error occurs.
        EmptyResponseError: If the response is empty.
        GetResponseError: For any other request failure.
    """
    session = requests.Session()
    # Retry transient connection failures with exponential backoff.
    retry = Retry(connect=3, backoff_factor=0.3)
    adapter = HTTPAdapter(max_retries=retry)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"  # noqa: E501
    }

    session.mount("http://", adapter)
    session.mount("https://", adapter)

    try:
        # Without an explicit timeout the request could hang forever and
        # requests would never raise ReadTimeout. (connect, read) seconds.
        response = session.get(
            url, params=params, headers=headers, timeout=(10, 30)
        )
        response.raise_for_status()

        if not response or response.json() == []:
            raise EmptyResponseError("No data was saved due to an empty response.")
        return response
    except requests.exceptions.ReadTimeout as e:
        raise ReadTimeoutError("The server took too long to send data.") from e
    except requests.exceptions.ConnectionError as e:
        raise ConnectionError("Failed to establish a connection.") from e
    except requests.exceptions.HTTPError as e:
        raise HTTPError(str(e)) from e
    except requests.exceptions.RequestException as e:
        raise GetResponseError(str(e)) from e
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def clean_tweet_url(tweet_url: str, username: str) -> str:
    """
    Cleans a tweet URL by ensuring it is associated with the correct username.

    Args:
        tweet_url (str): The tweet URL to clean.
        username (str): The username to associate with the tweet URL.

    Returns:
        The cleaned tweet URL.
    """
    tweet_lower = tweet_url.lower()

    pattern = re.compile(r"/status/(\d+)")
    match_lower_case = pattern.search(tweet_lower)
    match_original_case = pattern.search(tweet_url)

    # Compare case-insensitively: the URL was lowercased above, so a
    # mixed-case username would otherwise never match.
    if match_lower_case and username.lower() in tweet_lower:
        return f"https://twitter.com/{username}/status/{match_original_case.group(1)}"
    else:
        return tweet_url
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def clean_wayback_machine_url(
    wayback_machine_url: str, archived_timestamp: str, username: str
) -> str:
    """
    Cleans a Wayback Machine URL by ensuring it is associated with the correct username and timestamp.

    Args:
        wayback_machine_url (str): The Wayback Machine URL to clean.
        archived_timestamp (str): The timestamp to associate with the Wayback Machine URL.
        username (str): The username to associate with the Wayback Machine URL.

    Returns:
        The cleaned Wayback Machine URL.
    """  # noqa: E501
    wayback_machine_url = wayback_machine_url.lower()

    pattern = re.compile(r"/status/(\d+)")
    match = pattern.search(wayback_machine_url)

    # Compare case-insensitively: the URL was lowercased above, so a
    # mixed-case username would otherwise never match.
    if match and username.lower() in wayback_machine_url:
        return f"https://web.archive.org/web/{archived_timestamp}/https://twitter.com/{username}/status/{match.group(1)}"  # noqa: E501
    else:
        return wayback_machine_url
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def check_pattern_tweet(tweet_url: str) -> str:
    """
    Extracts the embedded URL from tweet URLs that follow special patterns:

    - Reply: /status//
    - Link: /status///
    - Twimg: /status/https://pbs

    Args:
        tweet_url (str): The tweet URL to extract the URL from.

    Returns:
        Only the extracted URL from a tweet, or the input unchanged when no
        pattern matches.
    """
    pattern = r'/status/((?:"(.*?)"|"(.*?)(?=&|$)|"%3B(.*?)(?=&|$)))'
    match = re.search(pattern, tweet_url)

    if not match:
        return tweet_url

    # The first truthy capture among groups 2-4 holds the embedded URL.
    extracted = next(
        (group for group in (match.group(2), match.group(3), match.group(4)) if group),
        "",
    )

    return html.unescape(extracted)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def delete_tweet_pathnames(tweet_url: str) -> str:
    """
    Removes any pathnames from a tweet URL.

    Args:
        tweet_url (str): The tweet URL to remove pathnames from.

    Returns:
        The tweet URL truncated to ``.../status/<id>``, or the input
        unchanged when it does not look like a tweet URL.
    """
    match_username = re.compile(r"https://twitter\.com/([^/]+)/status/\d+").match(
        tweet_url
    )
    match_id = re.search(r"https://twitter.com/\w+/status/(\d+)", tweet_url)

    if not (match_id and match_username):
        return tweet_url

    return f"https://twitter.com/{match_username.group(1)}/status/{match_id.group(1)}"
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def check_double_status(wayback_machine_url: str, original_tweet_url: str) -> bool:
    """
    Checks if a Wayback Machine URL contains two occurrences of "/status/" and if the original tweet does not contain "twitter.com".

    Args:
        wayback_machine_url (str): The Wayback Machine URL to check.
        original_tweet_url (str): The original tweet URL to check.

    Returns:
        True if the conditions are met, False otherwise.
    """  # noqa: E501
    has_double_status = wayback_machine_url.count("/status/") == 2
    lacks_twitter_domain = "twitter.com" not in original_tweet_url

    return has_double_status and lacks_twitter_domain
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def semicolon_parser(string: str) -> str:
    """
    Replaces semicolons in a string with %3B.

    Args:
        string (str): The string to replace semicolons in.

    Returns:
        The string with semicolons replaced by %3B.
    """
    # str.replace performs the substitution in a single C-level pass
    # instead of rebuilding the string character by character.
    return string.replace(";", "%3B")
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def is_tweet_url(twitter_url: str) -> bool:
    """
    Checks if the provided URL is a Twitter status URL.

    A URL qualifies when it contains the "/status/" segment exactly once,
    which is a common pattern in Twitter status URLs.

    Args:
        twitter_url (str): The URL to check.

    Returns:
        True if the URL is a Twitter status URL, False otherwise.
    """
    return twitter_url.count("/status/") == 1
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def timestamp_parser(timestamp):
    """
    Parses a timestamp into a formatted string.

    Args:
        timestamp (str): The timestamp string to parse.

    Returns:
        The parsed timestamp in the format "%Y/%m/%d %H:%M:%S", or None if
        the timestamp could not be parsed.
    """
    # Candidate formats from coarsest to finest granularity; the first one
    # that parses the whole string wins.
    known_formats = (
        "%Y",
        "%Y%m",
        "%Y%m%d",
        "%Y%m%d%H",
        "%Y%m%d%H%M",
        "%Y%m%d%H%M%S",
    )

    for candidate in known_formats:
        try:
            parsed = datetime.strptime(timestamp, candidate)
        except ValueError:
            continue
        return parsed.strftime("%Y/%m/%d %H:%M:%S")

    return None
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def check_url_scheme(url):
    """
    Corrects the URL scheme if it contains more than two slashes following the scheme.

    Finds 'http:' or 'https:' followed by two or more slashes and rewrites
    it with the scheme followed by exactly two slashes.

    Args:
        url (str): The URL to be corrected.

    Returns:
        The corrected URL.
    """  # noqa: E501
    pattern = r"(http:|https:)(/{2,})"

    # Keep the captured scheme and collapse the slash run to exactly two.
    return re.sub(pattern, lambda m: f"{m.group(1)}//", url)
|