# wayback-fast / app.py
# tx3bas's picture
# Update app.py
# 5b53dd5 verified
import html
import json
import re
from datetime import datetime, timedelta

import gradio as gr
import requests
# Request headers that make outgoing HTTP calls look like they come from
# Firefox; passed as ``headers=headers`` to the requests.get calls below.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) "
        "Gecko/20100101 Firefox/71.0"
    ),
}
def get_google_cache_date(url):
    """Report how long ago Google cached ``url``, as a string for HTML output.

    Requests Google's web-cache page for ``url`` and searches the response
    body for the first date written like ``Jan 5, 2024``.

    Args:
        url: Address of the page to look up.

    Returns:
        One of:
        * a styled ``<p>`` element with the age in days, the date as
          dd/mm/YYYY and an emoji grading the age, when a date is found;
        * ``"Url no cacheada por Google ⛔"`` when the page loads but no date
          is found in it;
        * ``"Error al acceder a la caché de Google ⛔"`` when the response
          status is not 200;
        * the HTML-escaped text of the exception if anything raises.
    """
    # Seconds to wait for Google; without a timeout requests.get may block
    # indefinitely and stall the whole lookup.
    lookup_timeout = 30
    try:
        resp = requests.get(
            "https://webcache.googleusercontent.com/search",
            # Let requests percent-encode the query so a URL containing
            # '&' or '#' stays inside the ``q`` value.
            params={"q": f"cache:{url}"},
            headers=headers,
            timeout=lookup_timeout,
        )
        if resp.status_code != 200:
            return "Error al acceder a la caché de Google ⛔"

        # Raw string and a real A-Z range: the previous ``A-z`` range also
        # matched the punctuation characters between 'Z' and 'a'.
        getcache = re.search(r"[a-zA-Z]{3}\s[0-9]{1,2},\s[0-9]{4}", resp.text)
        if not getcache:
            return "Url no cacheada por Google ⛔"

        # strptime raises ValueError when the three letters are not a month
        # abbreviation; that is reported through the except branch below.
        cache_date = datetime.strptime(getcache.group(0), '%b %d, %Y')
        days_ago = (datetime.now() - cache_date).days

        # Singular/plural form of "day" for the Spanish message.
        day_word = "día" if days_ago == 1 else "días"

        # Emoji grades how fresh the cached copy is.
        if days_ago <= 3:
            emoji = "😎"
        elif days_ago <= 30:
            emoji = "🙂"
        elif days_ago <= 90:
            emoji = "😐"
        else:
            emoji = "😭"

        formatted_date = cache_date.strftime('%d/%m/%Y')
        # The URL is user input placed inside a single-quoted attribute, so
        # it must be escaped (quotes included) before being embedded.
        # NOTE(review): ``cache:`` is kept as the link scheme exactly as
        # before; confirm it opens anything in the target browsers.
        safe_url = html.escape(url, quote=True)
        return (
            "<p style='margin-bottom: 10px; margin-top:10px;text-align: center; "
            "background: #ffffff; padding: 8px; border-radius: 8px; "
            "border-width: 1px; border: solid 1px #e5e7eb;'>"
            "<a style='color:#1f2937;text-decoration:none' "
            "title='Ver la url cacheada por Google' "
            f"href='cache:{safe_url}'>👁 </a> "
            f"Cacheada por Google hace {days_ago} {day_word} "
            f"({formatted_date}) {emoji}</p>"
        )
    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        # Escaped because the caller embeds this string in HTML.
        return html.escape(str(e))
def wayback(website):
    """Build an HTML report of Wayback Machine snapshots for ``website``.

    The report starts with the Google-cache summary returned by
    ``get_google_cache_date`` and is followed by a table of snapshots.
    Two sources are tried in order:

    1. The Wayback CDX Server API, limited to the last 365 days and at most
       3000 rows, listed newest first.
    2. The Wayback Availability JSON API, which yields a single closest
       snapshot. It is used when the CDX request fails, returns a non-200
       status, or returns no rows.

    Args:
        website: URL typed by the user.

    Returns:
        An HTML string: the snapshot table, or an error paragraph when the
        input is empty, the fallback request raises, or nothing is archived.
    """
    if not website:
        return '<p>😭 Error: introduce una url correcta</p>'

    # Seconds to wait for each archive.org call; without it a stalled
    # connection would hang the request forever.
    request_timeout = 30

    google_cache_info = get_google_cache_date(website)
    table_head = (
        f"<div>{google_cache_info}</div>"
        # The style attribute previously ended with a doubled quote (''),
        # which produced malformed markup.
        "<table style='width: -webkit-fill-available;'>"
        "<tr><th>Fecha</th><th>URL</th></tr>"
    )

    # First attempt: Wayback CDX Server API over the last year.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    try:
        response = requests.get(
            "http://web.archive.org/cdx/search/cdx",
            # ``params`` percent-encodes the user URL so '&' or '#' in it
            # cannot break or truncate the query string.
            params={
                "url": website,
                "output": "json",
                "from": start_date.strftime('%Y%m%d'),
                "to": end_date.strftime('%Y%m%d'),
                "limit": 3000,
            },
            headers=headers,
            timeout=request_timeout,
        )
        if response.status_code == 200:
            content = json.loads(response.text)
            # Row 0 is skipped as the header; the code reads index 1 as the
            # capture timestamp and index 2 as the captured URL.
            if len(content) > 1:
                newest_first = sorted(content[1:], key=lambda row: row[1], reverse=True)
                results = [table_head]
                for row in newest_first:
                    date, page = row[1], row[2]
                    formatted_date = datetime.strptime(date, '%Y%m%d%H%M%S').strftime('%d/%m/%Y')
                    # Escape once and reuse for both the attribute and the
                    # visible text; the captured URL is external data.
                    link = html.escape(f"https://web.archive.org/web/{date}/{page}", quote=True)
                    results.append(
                        f"<tr><td>{formatted_date}</td>"
                        f"<td><a href='{link}' target='_blank'>{link}</a></td></tr>"
                    )
                results.append("</table>")
                return "".join(results)
    except Exception:
        # Deliberate best-effort: any CDX failure (network, bad JSON,
        # unexpected row shape) falls through to the Availability API.
        pass

    # Second attempt: Wayback Availability JSON API (single closest snapshot).
    try:
        response = requests.get(
            "http://archive.org/wayback/available",
            params={"url": website},
            headers=headers,
            timeout=request_timeout,
        )
        if response.status_code == 200:
            data = json.loads(response.text)
            # ``.get`` so that a reply without "archived_snapshots" or
            # "closest" is reported as "nothing archived", not as a KeyError.
            closest_snapshot = (data.get("archived_snapshots") or {}).get("closest")
            if closest_snapshot and closest_snapshot.get("available"):
                snapshot_url = html.escape(closest_snapshot["url"], quote=True)
                timestamp = closest_snapshot["timestamp"]
                formatted_date = datetime.strptime(timestamp, '%Y%m%d%H%M%S').strftime('%d/%m/%Y')
                return (
                    f"{table_head}<tr><td>{formatted_date}</td>"
                    f"<td><a href='{snapshot_url}' target='_blank'>{snapshot_url}</a></td></tr></table>"
                )
    except Exception as e:
        # UI boundary: show the failure to the user. The message is escaped
        # because it can contain the user-supplied URL.
        return f"<div>{google_cache_info}</div><p>😭 Error: {html.escape(str(e))}</p>"

    return f"<div>{google_cache_info}</div><p>😭 Error: No se encontraron datos archivados para esta URL.</p>"
def archive_now(website):
    """Ask the Wayback Machine to save ``website`` and report the outcome.

    Sends a GET to ``https://web.archive.org/save/<website>`` with a
    60-second timeout.

    Args:
        website: URL typed by the user.

    Returns:
        An HTML ``<div>`` string: a validation error for empty input, a
        success message on HTTP 200, the result of ``check_last_snapshot``
        on any other status or on a timeout, or an error message carrying
        the exception text for any other failure.
    """
    if website:
        save_endpoint = f"https://web.archive.org/save/{website}"
        try:
            # 60-second cap on the save request.
            reply = requests.get(save_endpoint, headers=headers, timeout=60)
            if reply.status_code != 200:
                # Save was not confirmed: fall back to the latest snapshot.
                return check_last_snapshot(website)
            return "<div>👌 URL archivada con éxito.</div>"
        except requests.exceptions.Timeout:
            # Timed out: fall back to the latest snapshot.
            return check_last_snapshot(website)
        except Exception as err:
            return f"<div>Error al archivar la URL: {err}</div>"
    return "<div>😭 Error: Por favor, introduce una URL válida.</div>"
def check_last_snapshot(website):
    """Return an HTML ``<div>`` linking to the closest archived snapshot.

    Queries the Wayback Availability JSON API for ``website``.

    Args:
        website: URL to look up.

    Returns:
        An HTML string: a link to the snapshot labelled with its date
        (dd/mm/YYYY) when one is available, an error message carrying the
        exception text if the lookup raises, or a "no archived data"
        message otherwise (including non-200 responses).
    """
    # Seconds to wait for archive.org. This function runs as the fallback
    # after a save attempt fails or times out, so it must not block forever.
    lookup_timeout = 30
    try:
        response = requests.get(
            "http://archive.org/wayback/available",
            # ``params`` percent-encodes the URL so '&' or '#' in it cannot
            # break the query string.
            params={"url": website},
            headers=headers,
            timeout=lookup_timeout,
        )
        if response.status_code == 200:
            data = json.loads(response.text)
            # ``.get`` so that a reply without "archived_snapshots" or
            # "closest" is reported as "nothing archived", not as a KeyError.
            closest_snapshot = (data.get("archived_snapshots") or {}).get("closest")
            if closest_snapshot and closest_snapshot.get("available"):
                # Escaped because the value lands in a quoted HTML attribute.
                snapshot_url = html.escape(closest_snapshot["url"], quote=True)
                timestamp = closest_snapshot["timestamp"]
                formatted_date = datetime.strptime(timestamp, '%Y%m%d%H%M%S').strftime('%d/%m/%Y')
                return f"<div>Última instantánea disponible: <a href='{snapshot_url}' target='_blank'>{formatted_date}</a></div>"
    except Exception as e:
        # UI boundary: report the failure as HTML instead of raising. The
        # message is escaped because it can contain the user-supplied URL.
        return f"<div>Error al buscar la última instantánea: {html.escape(str(e))}</div>"
    return "<div>😭 No se encontraron datos archivados para esta URL.</div>"
# Gradio interface for the snapshot search: one text input (the URL) handled
# by wayback(), rendered as HTML. Fixes the "simlemente" typo in the
# user-facing description.
wayback_interface = gr.Interface(
    fn=wayback,
    inputs="text",
    outputs="html",
    title="<p style='margin-top:10px;margin-bottom:-10px;font-size: 22px;'>Wayback Machine</p>",
    description="<p style='margin-bottom: 10px; text-align: center; background: #ffffff; padding: 8px; border-radius: 8px; border-width: 1px; border: solid 1px #e5e7eb;'>Busca instantáneas de una página web en Wayback Machine y guarda la página actual simplemente introduciendo la url.</p>",
    article="<p style='text-align:center !important;'>Desarrollada por <a style='text-decoration:none !important;color:#e12a31 !important;' href='https://artxeweb.com'>© Artxe Web</a></p>"
)
# Gradio interface for saving a page: one text input (the URL) handled by
# archive_now(), rendered as HTML. The title now ends with a closing </p>;
# it previously ended with a second opening <p>, leaving the tag unclosed.
archive_interface = gr.Interface(
    fn=archive_now,
    inputs="text",
    outputs="html",
    title="<p style='margin-top:10px;margin-bottom:-10px;font-size: 22px;'>Guardar en Wayback Machine</p>",
    description="<p style='margin-bottom: 10px; text-align: center; background: #ffffff; padding: 8px; border-radius: 8px; border-width: 1px; border: solid 1px #e5e7eb;'>Guarda la página web actual en Wayback Machine.</p>",
    article="<p style='text-align:center !important;'>Desarrollada por <a style='text-decoration:none !important;color:#e12a31 !important;' href='https://artxeweb.com'>© Artxe Web</a></p>"
)
# Combine both interfaces into one tabbed app. The title markup opens two
# <div> elements, so it now closes both (the outer one was left unclosed).
iface = gr.TabbedInterface([wayback_interface, archive_interface], ["Buscar Instantáneas", "Archivar URL"], title="<div style='margin:0 auto;text-align:center;margin-bottom: -20px;'><div style='margin:0 auto;text-align:center'><img style='width:100px;display: inline-table;margin-bottom:-10px' src='https://artxeweb.com/media/files/waybackmachine.jpg'><p>Wayback Fast</p></div></div>")
# Start the application
iface.launch()