Spaces:
Sleeping
Sleeping
File size: 5,505 Bytes
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pypdf import PdfReader
import time
import re
import io
# Root of the CENDOJ site; relative result links are resolved against it.
BASE_URL = "https://www.poderjudicial.es"
# Search form page: requested once before searching and sent as the Referer.
SEARCH_URL = "https://www.poderjudicial.es/search/indexAN.jsp"
# Endpoint that receives the search form POST.
POST_URL = "https://www.poderjudicial.es/search/search.action"
# Pause, in seconds, applied between document downloads.
DELAY = 6

# Browser-like headers sent with every request made by the scraper.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "es-ES,es;q=0.9",
    # Advertise only encodings that requests/urllib3 can always decode.
    # "br" (Brotli) is decoded only when an optional Brotli package is
    # installed; without it a Brotli-compressed body would reach the HTML
    # parser still compressed and no result links would be found.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Referer": SEARCH_URL,
}
def build_pdf_url(open_doc_link):
    """Build the PDF download URL for a result's ``openDocument`` link.

    The last two path segments of ``open_doc_link`` (trailing slashes are
    ignored) become the ``reference`` and ``optimize`` query parameters,
    in that order.

    Raises:
        IndexError: if the link has fewer than two ``/``-separated segments.
    """
    # Only the last two segments matter, so split from the right.
    segmentos = open_doc_link.rstrip("/").rsplit("/", 2)
    optimize = segmentos[-1]
    reference = segmentos[-2]

    parametros = (
        "action=accessToPDF",
        "publicinterface=true",
        "tab=AN",
        f"reference={reference}",
        "encode=true",
        f"optimize={optimize}",
        "databasematch=AN",
    )
    return (
        "https://www.poderjudicial.es/search/contenidos.action?"
        + "&".join(parametros)
    )
def extraer_texto_pdf(contenido_bytes):
    """Return the text of every page of a PDF, joined with newlines.

    ``contenido_bytes`` is the raw PDF content. A page whose
    ``extract_text()`` result is empty or ``None`` contributes an empty
    string, so the number of joined pieces equals the number of pages.
    """
    documento = PdfReader(io.BytesIO(contenido_bytes))
    textos = []
    for pagina in documento.pages:
        textos.append(pagina.extract_text() or "")
    return "\n".join(textos)
def extraer_fundamentos(texto):
    """Return the "FUNDAMENTOS DE DERECHO" section of a ruling's text.

    The section starts right after the first "FUNDAMENTOS DE DERECHO"
    heading (matched case-insensitively, any whitespace between the words)
    and runs up to the ruling heading or, when there is none, to the end of
    the text.

    The ruling heading is recognised only in upper case and as a whole
    word: ``FALLO``, ``FALLAMOS``, or either one letter-spaced
    (``F A L L O``). Matching it case-sensitively matters because the
    reasoning itself frequently uses the ordinary word "fallo", which must
    not cut the section short.

    Args:
        texto: Full text of the ruling.

    Returns:
        The stripped section text, or the fixed message
        "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO" when the opening
        heading is not present.
    """
    patron = re.search(
        r'FUNDAMENTOS\s+DE\s+DERECHO'
        r'(.*?)'
        # (?-i:...) switches case-insensitivity off for the terminator only.
        r'(?=(?-i:\bF[ \t]*A[ \t]*L[ \t]*L[ \t]*(?:O|A[ \t]*M[ \t]*O[ \t]*S)\b)|\Z)',
        texto,
        re.IGNORECASE | re.DOTALL,
    )
    if patron:
        return patron.group(1).strip()
    return "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"
def extraer_metadatos(texto):
    """Extract the ECLI, reporting judge and date from a ruling's text.

    Returns a dict with the keys ``"ECLI"``, ``"PONENTE"`` and ``"FECHA"``.
    Each value is the first regex match found in ``texto``, or ``"N/D"``
    when the corresponding pattern does not match.
    """
    def primer_grupo(patron):
        # First capture group of the first match, or the "N/D" placeholder.
        hallado = re.search(patron, texto)
        if hallado is None:
            return "N/D"
        return hallado.group(1)

    identificador = primer_grupo(r'ECLI[:\s]+(ES\S+)')
    # The name pattern allows spaces, so surrounding blanks are trimmed.
    nombre_ponente = primer_grupo(r'Ponente[:\s]+([A-ZÁÉÍÓÚÑ ]+)').strip()
    fecha_resolucion = primer_grupo(r'Fecha[:\s]+(\d{2}/\d{2}/\d{4})')

    return {
        "ECLI": identificador,
        "PONENTE": nombre_ponente,
        "FECHA": fecha_resolucion,
    }
def scrape_cendoj(query, jurisdiccion, num_resultados):
    """Search CENDOJ and save the legal grounds of each ruling to a text file.

    Posts the search form, collects every ``openDocument`` link in the
    results page, downloads each ruling's PDF and writes its link,
    metadata and "FUNDAMENTOS DE DERECHO" section to
    ``/tmp/sentencias_cendoj.txt``. A failure on one document is logged
    and the remaining documents are still processed.

    Args:
        query: Free-text search sent in the ``TEXT`` form field.
        jurisdiccion: Jurisdiction name, sent wrapped in ``|`` characters.
        num_resultados: Number of results requested per page; coerced to
            ``int`` so a float such as ``10.0`` is sent as ``"10"``.

    Returns:
        A ``(log_text, output_path)`` tuple: the newline-joined execution
        log and the path of the generated file.

    Raises:
        requests.RequestException: if the initial page load or the search
            request fails, times out, or (for the search) returns an HTTP
            error status.
    """
    # requests waits indefinitely by default; bound every call so a stalled
    # server cannot hang the caller.
    timeout = 30

    log = []
    output_path = "/tmp/sentencias_cendoj.txt"
    payload = {
        "action": "query",
        "sort": "IN_FECHARESOLUCION:decreasing",
        "recordsPerPage": str(int(num_resultados)),
        "databasematch": "AN",
        "start": "1",
        "TEXT": query,
        "JURISDICCION": f"|{jurisdiccion}|",
        "TIPOORGANOPUB": "|11|12|13|14|15|16|",
        "field": "JURISDICCION",
        "idtab": "jurisprudencia",
        "org": "",
    }

    # The context manager releases the session's pooled connections.
    with requests.Session() as session:
        # Load the search page first so later requests reuse the session
        # state (e.g. cookies) it establishes.
        session.get(SEARCH_URL, headers=headers, timeout=timeout)

        log.append("Ejecutando búsqueda...")
        response = session.post(
            POST_URL, headers=headers, data=payload, timeout=timeout
        )
        response.raise_for_status()
        result_html = response.text
        log.append(f"Respuesta recibida: {len(result_html)} caracteres")

        soup_results = BeautifulSoup(result_html, "html.parser")
        # dict.fromkeys removes duplicates while keeping the order in which
        # the links appear in the results page; a set would scramble it.
        links = list(dict.fromkeys(
            urljoin(BASE_URL, a["href"])
            for a in soup_results.find_all("a", href=True)
            if "openDocument" in a["href"]
        ))
        log.append(f"Enlaces detectados: {len(links)}")

        with open(output_path, "w", encoding="utf-8") as f_out:
            for i, link in enumerate(links, start=1):
                # Pause between documents only, not after the last one.
                if i > 1:
                    time.sleep(DELAY)
                log.append(f"[{i}/{len(links)}] Procesando: {link}")
                try:
                    pdf_url = build_pdf_url(link)
                    r_pdf = session.get(pdf_url, headers=headers, timeout=timeout)
                    r_pdf.raise_for_status()
                    # An HTML response instead of a PDF is treated as a
                    # CAPTCHA page and the document is skipped.
                    if "text/html" in r_pdf.headers.get("Content-Type", ""):
                        log.append(" ⚠ CAPTCHA — no se obtuvo PDF")
                        continue
                    texto_completo = extraer_texto_pdf(r_pdf.content)
                    meta = extraer_metadatos(texto_completo)
                    fundamentos = extraer_fundamentos(texto_completo)
                    f_out.write(f"ENLACE: {link}\n")
                    f_out.write(f"ECLI: {meta['ECLI']} | PONENTE: {meta['PONENTE']} | FECHA: {meta['FECHA']}\n")
                    f_out.write("FUNDAMENTOS DE DERECHO:\n")
                    f_out.write(fundamentos)
                    f_out.write("\n" + "="*80 + "\n\n")
                    log.append(f" ✓ Guardada — ECLI: {meta['ECLI']}")
                except Exception as e:
                    # Best effort per document: record the failure and go on.
                    log.append(f" ✗ Error: {e}")

    log.append(f"\nFichero generado: {output_path}")
    return "\n".join(log), output_path
# Gradio front end: three inputs, a trigger button, and two outputs (the
# execution log and the generated text file) wired to scrape_cendoj.
# NOTE(review): indentation was lost in the source this was recovered from;
# the three input widgets are assumed to sit inside the Row — confirm.
with gr.Blocks(title="Scraper CENDOJ") as demo:
    gr.Markdown("# Scraper CENDOJ — Tribunal Supremo")
    gr.Markdown("Descarga y extrae fundamentos de derecho de sentencias del TS.")

    with gr.Row():
        query = gr.Textbox(
            label="Texto de búsqueda",
            value="responsabilidad patrimonial",
        )
        jurisdiccion = gr.Textbox(
            label="Jurisdicción",
            value="PENAL",
        )
        num = gr.Slider(
            minimum=1,
            maximum=20,
            value=10,
            step=1,
            label="Número de resultados",
        )

    btn = gr.Button("Buscar y descargar", variant="primary")
    log = gr.Textbox(label="Log de ejecución", lines=20)
    fichero = gr.File(label="Descargar TXT")

    # Run the scraper on click; its two return values fill the log box and
    # the file download component, in that order.
    btn.click(
        fn=scrape_cendoj,
        inputs=[query, jurisdiccion, num],
        outputs=[log, fichero],
    )

# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)
|