Spaces:
Build error
Build error
import html
import json
import re
from datetime import datetime, timedelta

import gradio as gr
import requests
# Request headers that make outgoing HTTP calls identify themselves as Firefox.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) "
        "Gecko/20100101 Firefox/71.0"
    ),
}
def get_google_cache_date(url):
    """Describe how long ago Google cached *url*.

    Requests the Google web-cache page for *url* and searches the response
    body for the first date shaped like ``Jan 5, 2024``.

    Args:
        url: Address of the page to look up.

    Returns:
        A string meant to be embedded in HTML output:

        * an HTML ``<p>`` banner with the age in days, the date formatted as
          ``dd/mm/YYYY`` and a mood emoji, when a date is found;
        * ``"Url no cacheada por Google ⛔"`` when the response holds no date;
        * ``"Error al acceder a la caché de Google ⛔"`` when the status code
          is not 200;
        * the (HTML-escaped) exception text when anything raises.

    The function never raises: it is a best-effort extra and must not break
    the caller's main output.
    """
    # NOTE(review): Google has been retiring its public cache; confirm this
    # endpoint still answers before relying on the result.
    try:
        resp = requests.get(
            "https://webcache.googleusercontent.com/search",
            # Let requests percent-encode the target so '&', '#', etc. inside
            # *url* cannot corrupt the query string.
            params={"q": f"cache:{url}"},
            headers=headers,
            timeout=30,  # never hang the UI on an unresponsive server
        )
        if resp.status_code != 200:
            return "Error al acceder a la caché de Google ⛔"

        # Raw string, and [a-zA-Z] rather than [a-zA-z]: the latter range
        # also matches the punctuation between 'Z' and 'a' in ASCII.
        found = re.search(r"[a-zA-Z]{3}\s[0-9]{1,2},\s[0-9]{4}", resp.text)
        if not found:
            return "Url no cacheada por Google ⛔"

        cache_date = datetime.strptime(found.group(0), '%b %d, %Y')
        # Clamp at zero so a date slightly ahead of the local clock does not
        # produce a negative age.
        days_ago = max((datetime.now() - cache_date).days, 0)

        # Singular/plural handling for "día/días".
        day_word = "día" if days_ago == 1 else "días"

        # Fresher caches get a happier emoji.
        if days_ago <= 3:
            emoji = "😎"
        elif days_ago <= 30:
            emoji = "🙂"
        elif days_ago <= 90:
            emoji = "😐"
        else:
            emoji = "😭"

        formatted_date = cache_date.strftime('%d/%m/%Y')
        # *url* is user input that lands inside an HTML attribute.
        safe_url = html.escape(str(url), quote=True)
        # NOTE(review): a ``cache:`` href is kept as in the original; confirm
        # browsers actually navigate it.
        return (
            "<p style='margin-bottom: 10px; margin-top:10px;text-align: center; "
            "background: #ffffff; padding: 8px; border-radius: 8px; "
            "border-width: 1px; border: solid 1px #e5e7eb;'>"
            "<a style='color:#1f2937;text-decoration:none' "
            "title='Ver la url cacheada por Google' "
            f"href='cache:{safe_url}'>👁 </a> "
            f"Cacheada por Google hace {days_ago} {day_word} "
            f"({formatted_date}) {emoji}</p>"
        )
    except Exception as e:
        # Deliberately broad: report the problem as text instead of raising.
        return html.escape(str(e))
def _render_snapshot_table(cache_info, snapshots):
    """Return the results HTML: the cache banner followed by a date/URL table.

    Args:
        cache_info: HTML/text snippet placed in a ``<div>`` above the table.
        snapshots: Iterable of ``(timestamp, link)`` pairs, where *timestamp*
            is a ``YYYYmmddHHMMSS`` string and *link* is the URL to show.

    Raises:
        ValueError: If a timestamp does not match ``YYYYmmddHHMMSS``.
    """
    parts = [
        f"<div>{cache_info}</div>"
        "<table style='width: -webkit-fill-available;'>"
        "<tr><th>Fecha</th><th>URL</th></tr>"
    ]
    for timestamp, link in snapshots:
        shown_date = datetime.strptime(timestamp, '%Y%m%d%H%M%S').strftime('%d/%m/%Y')
        # Links come from an external API / user input: escape before embedding.
        safe_link = html.escape(link, quote=True)
        parts.append(
            f"<tr><td>{shown_date}</td>"
            f"<td><a href='{safe_link}' target='_blank'>{safe_link}</a></td></tr>"
        )
    parts.append("</table>")
    return "".join(parts)


def wayback(website):
    """List Wayback Machine snapshots of *website* as an HTML fragment.

    First queries the Wayback CDX server for captures from the last 365 days
    (at most 3000 rows) and renders them newest first. If that yields nothing
    or fails for any reason, falls back to the Wayback Availability API and
    shows the single closest snapshot. The Google-cache banner returned by
    ``get_google_cache_date`` is placed above every non-empty-input result.

    Args:
        website: URL typed by the user; may be empty or ``None``.

    Returns:
        An HTML string: a table of snapshots, or a ``<p>`` error message when
        the input is empty, a request fails, or no snapshot is found.
    """
    if not website:
        return '<p>😭 Error: introduce una url correcta</p>'

    google_cache_info = get_google_cache_date(website)

    # Try the Wayback CDX Server API first.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    cdx_params = {
        "url": website,
        "output": "json",
        "from": start_date.strftime('%Y%m%d'),
        "to": end_date.strftime('%Y%m%d'),
        "limit": 3000,
    }
    try:
        response = requests.get(
            "http://web.archive.org/cdx/search/cdx",
            # params= percent-encodes *website*, so '&' or '#' inside it
            # cannot break the query string.
            params=cdx_params,
            headers=headers,
            timeout=30,  # do not block the UI forever on a stalled server
        )
        if response.status_code == 200:
            content = json.loads(response.text)
            # Row 0 is the column header; anything after it is a capture.
            if len(content) > 1:
                # Timestamps are fixed-width digit strings, so sorting them
                # as text in reverse gives newest first.
                captures = sorted(content[1:], key=lambda row: row[1], reverse=True)
                return _render_snapshot_table(
                    google_cache_info,
                    [
                        (row[1], f"https://web.archive.org/web/{row[1]}/{row[2]}")
                        for row in captures
                    ],
                )
    except Exception:
        # Deliberate best-effort: any CDX failure (network, bad JSON,
        # malformed row) just means we try the Availability API below.
        pass

    # Fall back to the Wayback Availability JSON API.
    try:
        response = requests.get(
            "http://archive.org/wayback/available",
            params={"url": website},
            headers=headers,
            timeout=30,
        )
        if response.status_code == 200:
            data = json.loads(response.text)
            # .get() so a response without these keys means "no data"
            # rather than surfacing a KeyError to the user.
            closest_snapshot = (data.get("archived_snapshots") or {}).get("closest")
            if closest_snapshot and closest_snapshot.get("available"):
                return _render_snapshot_table(
                    google_cache_info,
                    [(closest_snapshot["timestamp"], closest_snapshot["url"])],
                )
    except Exception as e:
        return f"<div>{google_cache_info}</div><p>😭 Error: {html.escape(str(e))}</p>"

    return f"<div>{google_cache_info}</div><p>😭 Error: No se encontraron datos archivados para esta URL.</p>"
def archive_now(website):
    """Ask the Wayback Machine to save *website* and report the outcome.

    Args:
        website: URL to archive; may be empty or ``None``.

    Returns:
        An HTML ``<div>``: a success message on HTTP 200; the result of
        ``check_last_snapshot(website)`` on any other status or on a request
        timeout; an error message for empty input or any other exception.
    """
    if website:
        save_endpoint = f"https://web.archive.org/save/{website}"
        try:
            # Give up after 60 seconds.
            reply = requests.get(save_endpoint, headers=headers, timeout=60)
            if reply.status_code != 200:
                # Save was not confirmed: show the latest existing snapshot.
                return check_last_snapshot(website)
            return "<div>👌 URL archivada con éxito.</div>"
        except requests.exceptions.Timeout:
            # Timed out: show the latest existing snapshot instead.
            return check_last_snapshot(website)
        except Exception as error:
            return f"<div>Error al archivar la URL: {error}</div>"

    return "<div>😭 Error: Por favor, introduce una URL válida.</div>"
def check_last_snapshot(website):
    """Return an HTML ``<div>`` linking to the closest archived snapshot.

    Queries the Wayback Availability JSON API for *website*.

    Args:
        website: URL whose most recent snapshot is wanted.

    Returns:
        A ``<div>`` with a link (labelled with the snapshot date as
        ``dd/mm/YYYY``) when an available snapshot is reported; a ``<div>``
        with the exception text when the lookup raises; otherwise a
        "no archived data" ``<div>``. Never raises.
    """
    try:
        response = requests.get(
            "http://archive.org/wayback/available",
            # params= percent-encodes *website* so '&'/'#' cannot break
            # the query string.
            params={"url": website},
            headers=headers,
            # This function is the fallback after a slow save request, so it
            # must not be able to hang indefinitely itself.
            timeout=30,
        )
        if response.status_code == 200:
            data = json.loads(response.text)
            # .get() so a response lacking these keys falls through to the
            # "no data" message instead of reporting a KeyError.
            closest_snapshot = (data.get("archived_snapshots") or {}).get("closest")
            if closest_snapshot and closest_snapshot.get("available"):
                # The URL comes from an external API: escape it for the href.
                snapshot_url = html.escape(closest_snapshot["url"], quote=True)
                formatted_date = datetime.strptime(
                    closest_snapshot["timestamp"], '%Y%m%d%H%M%S'
                ).strftime('%d/%m/%Y')
                return f"<div>Última instantánea disponible: <a href='{snapshot_url}' target='_blank'>{formatted_date}</a></div>"
    except Exception as e:
        # Deliberately broad: report the failure as HTML rather than raising.
        return f"<div>Error al buscar la última instantánea: {html.escape(str(e))}</div>"
    return "<div>😭 No se encontraron datos archivados para esta URL.</div>"
# Gradio interface for the snapshot search: text in, HTML out, served by wayback().
wayback_interface = gr.Interface(
    fn=wayback,
    inputs="text",
    outputs="html",
    title="<p style='margin-top:10px;margin-bottom:-10px;font-size: 22px;'>Wayback Machine</p>",
    # User-facing typo fixed: "simlemente" -> "simplemente".
    description=(
        "<p style='margin-bottom: 10px; text-align: center; background: #ffffff; "
        "padding: 8px; border-radius: 8px; border-width: 1px; "
        "border: solid 1px #e5e7eb;'>"
        "Busca instantáneas de una página web en Wayback Machine y guarda la "
        "página actual simplemente introduciendo la url.</p>"
    ),
    article=(
        "<p style='text-align:center !important;'>Desarrollada por "
        "<a style='text-decoration:none !important;color:#e12a31 !important;' "
        "href='https://artxeweb.com'>© Artxe Web</a></p>"
    ),
)
# Gradio interface for archiving a URL: text in, HTML out, served by archive_now().
archive_interface = gr.Interface(
    fn=archive_now,
    inputs="text",
    outputs="html",
    # The title paragraph is now closed with </p> (it ended with a second <p>).
    title="<p style='margin-top:10px;margin-bottom:-10px;font-size: 22px;'>Guardar en Wayback Machine</p>",
    description=(
        "<p style='margin-bottom: 10px; text-align: center; background: #ffffff; "
        "padding: 8px; border-radius: 8px; border-width: 1px; "
        "border: solid 1px #e5e7eb;'>"
        "Guarda la página web actual en Wayback Machine.</p>"
    ),
    article=(
        "<p style='text-align:center !important;'>Desarrollada por "
        "<a style='text-decoration:none !important;color:#e12a31 !important;' "
        "href='https://artxeweb.com'>© Artxe Web</a></p>"
    ),
)
# Combine both interfaces into a single tabbed app.
iface = gr.TabbedInterface(
    [wayback_interface, archive_interface],
    ["Buscar Instantáneas", "Archivar URL"],
    # The outer <div> is now closed; the original opened two and closed one.
    title=(
        "<div style='margin:0 auto;text-align:center;margin-bottom: -20px;'>"
        "<div style='margin:0 auto;text-align:center'>"
        "<img style='width:100px;display: inline-table;margin-bottom:-10px' "
        "src='https://artxeweb.com/media/files/waybackmachine.jpg'>"
        "<p>Wayback Fast</p></div></div>"
    ),
)

# Launch the application.
iface.launch()