"""Gradio front end that searches poderjudicial.es and extracts rulings' text.

The app posts a search query, collects the ``openDocument`` links from the
result page, downloads each document as a PDF, extracts its text, and writes
the "FUNDAMENTOS DE DERECHO" section plus a few metadata fields to a TXT file
that the UI offers for download.
"""

import io
import re
import time
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
from pypdf import PdfReader

BASE_URL = "https://www.poderjudicial.es"
SEARCH_URL = "https://www.poderjudicial.es/search/indexAN.jsp"
POST_URL = "https://www.poderjudicial.es/search/search.action"

# Seconds to sleep after every document request.
DELAY = 6

# (connect, read) timeouts in seconds. requests waits forever when no timeout
# is given, which would block the Gradio handler if the server stalls.
REQUEST_TIMEOUT = (10, 60)

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "es-ES,es;q=0.9",
    # "br" is intentionally not advertised: requests/urllib3 can only decode
    # Brotli when an optional brotli package is installed, so asking for it
    # unconditionally can yield an undecodable response body.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Referer": SEARCH_URL,
}

# The section header is matched case-insensitively, but the terminating
# heading must be upper case and a whole word ("FALLO" or "F A L L O").
# Otherwise the ordinary word "fallo" inside the reasoning would cut the
# section short.
_FUNDAMENTOS_RE = re.compile(
    r"FUNDAMENTOS\s+DE\s+DERECHO(.*?)(?=\b(?-i:F\s*A\s*L\s*L\s*O)\b|$)",
    re.IGNORECASE | re.DOTALL,
)
_ECLI_RE = re.compile(r'ECLI[:\s]+(ES\S+)')
_PONENTE_RE = re.compile(r'Ponente[:\s]+([A-ZÁÉÍÓÚÑ ]+)')
_FECHA_RE = re.compile(r'Fecha[:\s]+(\d{2}/\d{2}/\d{4})')


def build_pdf_url(open_doc_link: str) -> str:
    """Build the PDF download URL for an ``openDocument`` result link.

    The last two ``/``-separated segments of the link are used as the
    ``reference`` and ``optimize`` query parameters, in that order.

    Args:
        open_doc_link: Absolute or relative ``openDocument`` link.

    Returns:
        The ``contenidos.action`` URL requesting the document as a PDF.

    Raises:
        ValueError: If the link has fewer than two path segments.
    """
    partes = open_doc_link.rstrip("/").split("/")
    if len(partes) < 2:
        raise ValueError(f"Enlace de documento no válido: {open_doc_link!r}")
    reference, optimize = partes[-2:]
    return (
        f"https://www.poderjudicial.es/search/contenidos.action"
        f"?action=accessToPDF&publicinterface=true&tab=AN"
        f"&reference={reference}&encode=true"
        f"&optimize={optimize}&databasematch=AN"
    )


def extraer_texto_pdf(contenido_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Pages for which no text can be extracted contribute an empty string.
    """
    reader = PdfReader(io.BytesIO(contenido_bytes))
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def extraer_fundamentos(texto: str) -> str:
    """Extract the "FUNDAMENTOS DE DERECHO" section from a ruling's text.

    The section runs from the header up to the next upper-case ``FALLO``
    heading (also accepted letter-spaced), or to the end of the text when no
    such heading exists.

    Returns:
        The stripped section body, or a fixed "not found" message when the
        header is absent.
    """
    coincidencia = _FUNDAMENTOS_RE.search(texto)
    if coincidencia is None:
        return "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"
    return coincidencia.group(1).strip()


def extraer_metadatos(texto: str) -> dict:
    """Extract ECLI, reporting judge and date from a ruling's text.

    Returns:
        A dict with the keys ``"ECLI"``, ``"PONENTE"`` and ``"FECHA"``; any
        field that cannot be found is set to ``"N/D"``.
    """
    ecli = _ECLI_RE.search(texto)
    ponente = _PONENTE_RE.search(texto)
    fecha = _FECHA_RE.search(texto)
    return {
        "ECLI": ecli.group(1) if ecli else "N/D",
        "PONENTE": ponente.group(1).strip() if ponente else "N/D",
        "FECHA": fecha.group(1) if fecha else "N/D",
    }


def _extraer_enlaces(result_html: str) -> list:
    """Return the unique ``openDocument`` links of a result page, in order."""
    soup = BeautifulSoup(result_html, "html.parser")
    enlaces = [
        urljoin(BASE_URL, a["href"])
        for a in soup.find_all("a", href=True)
        if "openDocument" in a["href"]
    ]
    # dict.fromkeys de-duplicates while keeping the order of the result page
    # (the search is requested sorted by decreasing date); a set would
    # shuffle it.
    return list(dict.fromkeys(enlaces))


def scrape_cendoj(query: str, jurisdiccion: str, num_resultados: int) -> tuple:
    """Run a search, download each result as PDF and save the extracted text.

    Args:
        query: Free-text search string.
        jurisdiccion: Jurisdiction filter, sent wrapped in ``|`` characters.
        num_resultados: Number of records requested per page; coerced to
            ``int`` because the UI slider may deliver it as a float.

    Returns:
        A ``(log, output_path)`` tuple: the newline-joined execution log and
        the path of the generated TXT file.

    Raises:
        requests.RequestException: If the initial page or the search request
            fails or times out. Errors on individual documents are logged
            and do not abort the run.
    """
    log = []
    output_path = "/tmp/sentencias_cendoj.txt"

    payload = {
        "action": "query",
        "sort": "IN_FECHARESOLUCION:decreasing",
        "recordsPerPage": str(int(num_resultados)),
        "databasematch": "AN",
        "start": "1",
        "TEXT": query,
        "JURISDICCION": f"|{jurisdiccion}|",
        "TIPOORGANOPUB": "|11|12|13|14|15|16|",
        "field": "JURISDICCION",
        "idtab": "jurisprudencia",
        "org": "",
    }

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        # Visit the search page first; presumably this obtains the session
        # cookies the search endpoint expects (verify against the site).
        session.get(SEARCH_URL, headers=headers, timeout=REQUEST_TIMEOUT)

        log.append("Ejecutando búsqueda...")
        response = session.post(
            POST_URL, headers=headers, data=payload, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        result_html = response.text
        log.append(f"Respuesta recibida: {len(result_html)} caracteres")

        links = _extraer_enlaces(result_html)
        total = len(links)
        log.append(f"Enlaces detectados: {total}")

        with open(output_path, "w", encoding="utf-8") as f_out:
            for i, link in enumerate(links, start=1):
                log.append(f"[{i}/{total}] Procesando: {link}")
                try:
                    pdf_url = build_pdf_url(link)
                    r_pdf = session.get(
                        pdf_url, headers=headers, timeout=REQUEST_TIMEOUT
                    )
                    r_pdf.raise_for_status()

                    # An HTML body instead of a PDF is treated as a CAPTCHA
                    # page and the document is skipped.
                    if "text/html" in r_pdf.headers.get("Content-Type", ""):
                        log.append(" ⚠ CAPTCHA — no se obtuvo PDF")
                        continue

                    texto_completo = extraer_texto_pdf(r_pdf.content)
                    meta = extraer_metadatos(texto_completo)
                    fundamentos = extraer_fundamentos(texto_completo)

                    f_out.write(f"ENLACE: {link}\n")
                    f_out.write(
                        f"ECLI: {meta['ECLI']} | PONENTE: {meta['PONENTE']} | FECHA: {meta['FECHA']}\n"
                    )
                    f_out.write("FUNDAMENTOS DE DERECHO:\n")
                    f_out.write(fundamentos)
                    f_out.write("\n" + "=" * 80 + "\n\n")
                    log.append(f" ✓ Guardada — ECLI: {meta['ECLI']}")
                except Exception as e:
                    # Deliberate best effort: one bad document must not abort
                    # the whole batch, so the error is logged and we move on.
                    log.append(f" ✗ Error: {e}")
                finally:
                    # Pause after every document, whatever the outcome.
                    time.sleep(DELAY)

    log.append(f"\nFichero generado: {output_path}")
    return "\n".join(log), output_path


# NOTE(review): the original indentation was lost; the three inputs are
# assumed to sit inside the Row and the remaining components below it.
with gr.Blocks(title="Scraper CENDOJ") as demo:
    gr.Markdown("# Scraper CENDOJ — Tribunal Supremo")
    gr.Markdown("Descarga y extrae fundamentos de derecho de sentencias del TS.")
    with gr.Row():
        query = gr.Textbox(label="Texto de búsqueda", value="responsabilidad patrimonial")
        jurisdiccion = gr.Textbox(label="Jurisdicción", value="PENAL")
        num = gr.Slider(minimum=1, maximum=20, value=10, step=1, label="Número de resultados")
    btn = gr.Button("Buscar y descargar", variant="primary")
    log = gr.Textbox(label="Log de ejecución", lines=20)
    fichero = gr.File(label="Descargar TXT")
    btn.click(fn=scrape_cendoj, inputs=[query, jurisdiccion, num], outputs=[log, fichero])

demo.launch(server_name="0.0.0.0", server_port=7860)