tx3bas committed on
Commit
3c44349
·
verified ·
1 Parent(s): 61570d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -27
app.py CHANGED
@@ -1,48 +1,63 @@
1
  import gradio as gr
2
- from datetime import datetime
3
  import requests
4
  import json
5
 
6
  def wayback(website):
7
  if not website:
8
- return '😭 Error: introduce una url correcta'
9
 
10
- # URL de la Wayback CDX Server API
11
- api_url = f"http://web.archive.org/cdx/search/cdx?url={website}&output=json&limit=10&fl=timestamp,original"
 
 
 
 
12
 
13
  try:
14
- response = requests.get(api_url)
15
- if response.status_code != 200:
16
- return f'😭 Error: Respuesta no exitosa con código de estado {response.status_code}'
17
-
18
- content = json.loads(response.text)
19
-
20
- if not content or len(content) < 2:
21
- return '😭 Error: No hay instantáneas disponibles para esta URL'
22
-
23
- # Procesar los resultados y devolverlos en forma de tabla HTML
24
- results = ["<table style='width:100%; border-collapse: collapse;'><tr><th>Fecha</th><th>URL</th></tr>"]
25
- for row in content[1:]: # Ignorar la primera fila, que es el encabezado
26
- date, original_url = row
27
- formatted_date = datetime.strptime(date, '%Y%m%d%H%M%S').strftime('%d/%m/%Y')
28
- snapshot_url = f'https://web.archive.org/web/{date}/{original_url}'
29
- results.append(f"<tr><td>{formatted_date}</td><td><a href='{snapshot_url}' target='_blank'>{original_url}</a></td></tr>")
30
- results.append("</table>")
31
- return "".join(results)
32
 
33
- except json.JSONDecodeError as e:
34
- return f'😭 Error al analizar JSON: {e}'
 
 
 
 
 
 
 
 
 
 
 
35
  except Exception as e:
36
- return f"😭 Error: {e}"
 
 
37
 
38
  # Crear la interfaz de Gradio
39
  iface = gr.Interface(
40
  fn=wayback,
41
  inputs="text",
42
  outputs="html",
43
- title="Wayback Machine CDX Server Lookup",
44
  description="Busca instantáneas archivadas de una página web en la Wayback Machine. Introduce la URL."
45
  )
46
 
47
  # Lanzar la aplicación
48
- iface.launch()
 
1
  import gradio as gr
2
+ from datetime import datetime, timedelta
3
  import requests
4
  import json
5
 
6
  def wayback(website):
7
  if not website:
8
+ return '<p>😭 Error: introduce una url correcta</p>'
9
 
10
+ # Intentar primero con la Wayback CDX Server API
11
+ end_date = datetime.now()
12
+ start_date = end_date - timedelta(days=365)
13
+ datefrom = start_date.strftime('%Y%m%d')
14
+ dateto = end_date.strftime('%Y%m%d')
15
+ cdx_api_url = f"http://web.archive.org/cdx/search/cdx?url={website}&output=json&from={datefrom}&to={dateto}&limit=3000"
16
 
17
  try:
18
+ response = requests.get(cdx_api_url)
19
+ if response.status_code == 200:
20
+ content = json.loads(response.text)
21
+ if len(content) > 1:
22
+ # Procesar y mostrar los resultados
23
+ results = []
24
+ for row in content[1:]:
25
+ date, page, status = [row[i] for i in [1, 2, 4]]
26
+ formatted_date = datetime.strptime(date, '%Y%m%d%H%M%S').strftime('%d/%m/%Y')
27
+ formatted_wayback_url = f"https://web.archive.org/web/{date}/{page}"
28
+ results.append({'date': date, 'formatted_date': formatted_date, 'link': formatted_wayback_url})
29
+ results.sort(key=lambda x: x['date'], reverse=True)
30
+ formatted_results = [f"<tr><td>{item['formatted_date']}</td><td><a href='{item['link']}' target='_blank'>{item['link']}</a></td></tr>" for item in results]
31
+ return "<table><tr><th>Fecha</th><th>URL</th></tr>" + "".join(formatted_results) + "</table>"
32
+ except Exception as e:
33
+ pass # Falla silenciosa, intentar con el siguiente método
 
 
34
 
35
+ # Si falla, intentar con la Wayback Availability JSON API
36
+ availability_api_url = f"http://archive.org/wayback/available?url={website}"
37
+ try:
38
+ response = requests.get(availability_api_url)
39
+ if response.status_code == 200:
40
+ data = json.loads(response.text)
41
+ if data["archived_snapshots"]:
42
+ closest_snapshot = data["archived_snapshots"]["closest"]
43
+ if closest_snapshot and closest_snapshot["available"]:
44
+ snapshot_url = closest_snapshot["url"]
45
+ timestamp = closest_snapshot["timestamp"]
46
+ formatted_date = datetime.strptime(timestamp, '%Y%m%d%H%M%S').strftime('%d/%m/%Y')
47
+ return f'<table><tr><th>Fecha</th><th>URL</th></tr><tr><td>{formatted_date}</td><td><a href="{snapshot_url}" target="_blank">{snapshot_url}</a></td></tr></table>'
48
  except Exception as e:
49
+ return f"<p>😭 Error: {e}</p>"
50
+
51
+ return '<p>😭 Error: No se encontraron datos archivados para esta URL.</p>'
52
 
53
  # Crear la interfaz de Gradio
54
  iface = gr.Interface(
55
  fn=wayback,
56
  inputs="text",
57
  outputs="html",
58
+ title="Wayback Machine Combined Lookup",
59
  description="Busca instantáneas archivadas de una página web en la Wayback Machine. Introduce la URL."
60
  )
61
 
62
  # Lanzar la aplicación
63
+ iface.launch()