# Spaces: Sleeping (hosting-page status text captured along with the source; not program code)
import io
import re
import time
from urllib.parse import urljoin, urlsplit

import gradio as gr
import requests
from bs4 import BeautifulSoup
from pypdf import PdfReader
# Site root, used to resolve relative result links into absolute URLs.
BASE_URL = "https://www.poderjudicial.es"
# Search form page; also sent as the Referer of every request.
SEARCH_URL = "https://www.poderjudicial.es/search/indexAN.jsp"
# Endpoint that receives the search form POST.
POST_URL = "https://www.poderjudicial.es/search/search.action"
# Seconds to pause between document downloads.
DELAY = 6

# Browser-like headers sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "es-ES,es;q=0.9",
    # "br" is deliberately not advertised: requests only decodes Brotli when
    # an optional brotli package is installed, and an undecoded body would
    # corrupt both the HTML results and the downloaded PDFs.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Referer": SEARCH_URL,
}
def build_pdf_url(open_doc_link):
    """Build the direct PDF download URL for an ``openDocument`` result link.

    The last two path segments of the link are reused as the ``reference``
    and ``optimize`` query parameters of the ``contenidos.action`` endpoint.

    Args:
        open_doc_link: Absolute or relative URL whose path ends in
            ``.../<reference>/<optimize>``. A trailing slash, query string
            or fragment is ignored.

    Returns:
        The PDF access URL as a string.

    Raises:
        ValueError: If the link's path has fewer than two segments.
    """
    # Look at the path only, so a query string or fragment on the link can
    # never end up inside the last segment.
    ruta = urlsplit(open_doc_link).path
    segmentos = [parte for parte in ruta.split("/") if parte]
    if len(segmentos) < 2:
        raise ValueError(
            f"Enlace sin segmentos reference/optimize: {open_doc_link!r}"
        )
    reference, optimize = segmentos[-2], segmentos[-1]
    return (
        "https://www.poderjudicial.es/search/contenidos.action"
        "?action=accessToPDF&publicinterface=true&tab=AN"
        f"&reference={reference}&encode=true"
        f"&optimize={optimize}&databasematch=AN"
    )
def extraer_texto_pdf(contenido_bytes):
    """Return the text of a PDF given as raw bytes.

    The text of each page is extracted in order and the pages are joined
    with a newline. A page whose extraction yields a falsy value contributes
    an empty string.
    """
    documento = PdfReader(io.BytesIO(contenido_bytes))
    textos = []
    for pagina in documento.pages:
        textos.append(pagina.extract_text() or "")
    return "\n".join(textos)
def extraer_fundamentos(texto):
    """Extract the "FUNDAMENTOS DE DERECHO" section from a judgment's text.

    The section starts right after the first (case-insensitive) occurrence
    of "FUNDAMENTOS DE DERECHO" and runs up to the ruling heading, or to the
    end of the text when no such heading exists.

    Args:
        texto: Full plain text of the judgment.

    Returns:
        The section body with surrounding whitespace stripped, or the literal
        "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO" when the section heading
        is absent.
    """
    inicio = re.search(r'FUNDAMENTOS\s+DE\s+DERECHO', texto, re.IGNORECASE)
    if inicio is None:
        return "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"

    cuerpo = texto[inicio.end():]
    # The ruling heading must stand alone on its line. Matching the bare word
    # anywhere would cut the section at the first mention of "fallo" inside
    # the reasoning itself. The letter-spaced form ("F A L L O") and
    # "FALLAMOS" are accepted as headings too.
    fin = re.search(
        r'^[ \t]*(?:F[ \t]*A[ \t]*L[ \t]*L[ \t]*O|FALLAMOS)[ \t]*[:.]?[ \t]*\r?$',
        cuerpo,
        re.IGNORECASE | re.MULTILINE,
    )
    if fin is not None:
        cuerpo = cuerpo[:fin.start()]
    return cuerpo.strip()
def extraer_metadatos(texto):
    """Pull the ECLI, reporting judge and date out of a judgment's text.

    Each field is located with its own regular expression; the first match
    wins. A field that is not found is reported as "N/D".

    Returns:
        A dict with the keys "ECLI", "PONENTE" and "FECHA", in that order.
    """
    # (key, pattern, strip surrounding whitespace from the captured value?)
    campos = (
        ("ECLI", r'ECLI[:\s]+(ES\S+)', False),
        ("PONENTE", r'Ponente[:\s]+([A-ZÁÉÍÓÚÑ ]+)', True),
        ("FECHA", r'Fecha[:\s]+(\d{2}/\d{2}/\d{4})', False),
    )
    meta = {}
    for clave, expresion, recortar in campos:
        hallazgo = re.search(expresion, texto)
        if hallazgo is None:
            meta[clave] = "N/D"
            continue
        valor = hallazgo.group(1)
        meta[clave] = valor.strip() if recortar else valor
    return meta
def _extraer_enlaces(result_html):
    """Return the absolute ``openDocument`` links of a results page, in page order, without duplicates."""
    soup_results = BeautifulSoup(result_html, "html.parser")
    enlaces = [
        urljoin(BASE_URL, a["href"])
        for a in soup_results.find_all("a", href=True)
        if "openDocument" in a["href"]
    ]
    # dict.fromkeys de-duplicates while keeping the order of the results page
    # (the search is sorted by decreasing date); a set would scramble it.
    return list(dict.fromkeys(enlaces))


def scrape_cendoj(query, jurisdiccion, num_resultados):
    """Search for judgments and save their legal grounds to a text file.

    Posts the search form, collects the ``openDocument`` links of the
    results page, downloads each judgment's PDF and writes its link,
    metadata and "FUNDAMENTOS DE DERECHO" section to
    ``/tmp/sentencias_cendoj.txt``. A failure on one document is logged and
    does not stop the remaining ones.

    Args:
        query: Free text to search for.
        jurisdiccion: Jurisdiction name sent in the ``JURISDICCION`` field.
        num_resultados: Number of results to request; coerced to ``int``.

    Returns:
        A ``(log_text, output_path)`` tuple: the execution log joined with
        newlines, and the path of the generated file.

    Raises:
        requests.RequestException: If the initial page load or the search
            request fails or times out.
    """
    # Seconds to wait for each HTTP response; requests has no default
    # timeout, so without one a stalled connection would block forever.
    timeout = 30
    log = []
    output_path = "/tmp/sentencias_cendoj.txt"
    payload = {
        "action": "query",
        "sort": "IN_FECHARESOLUCION:decreasing",
        # int() keeps a float-valued input (e.g. 10.0) from being sent as "10.0".
        "recordsPerPage": str(int(num_resultados)),
        "databasematch": "AN",
        "start": "1",
        "TEXT": query,
        "JURISDICCION": f"|{jurisdiccion}|",
        "TIPOORGANOPUB": "|11|12|13|14|15|16|",
        "field": "JURISDICCION",
        "idtab": "jurisprudencia",
        "org": "",
    }

    # The context manager closes the session's connections on every exit path.
    with requests.Session() as session:
        # Load the search page first so the session holds whatever cookies
        # the site sets before the form is posted.
        session.get(SEARCH_URL, headers=headers, timeout=timeout)
        log.append("Ejecutando búsqueda...")
        response = session.post(
            POST_URL, headers=headers, data=payload, timeout=timeout
        )
        response.raise_for_status()
        result_html = response.text
        log.append(f"Respuesta recibida: {len(result_html)} caracteres")

        links = _extraer_enlaces(result_html)
        total = len(links)
        log.append(f"Enlaces detectados: {total}")

        with open(output_path, "w", encoding="utf-8") as f_out:
            for i, link in enumerate(links, start=1):
                log.append(f"[{i}/{total}] Procesando: {link}")
                try:
                    pdf_url = build_pdf_url(link)
                    r_pdf = session.get(pdf_url, headers=headers, timeout=timeout)
                    r_pdf.raise_for_status()
                    if "text/html" in r_pdf.headers.get("Content-Type", ""):
                        # An HTML answer instead of a PDF is treated as a CAPTCHA page.
                        log.append(" ⚠ CAPTCHA — no se obtuvo PDF")
                    else:
                        texto_completo = extraer_texto_pdf(r_pdf.content)
                        meta = extraer_metadatos(texto_completo)
                        fundamentos = extraer_fundamentos(texto_completo)
                        f_out.write(f"ENLACE: {link}\n")
                        f_out.write(f"ECLI: {meta['ECLI']} | PONENTE: {meta['PONENTE']} | FECHA: {meta['FECHA']}\n")
                        f_out.write("FUNDAMENTOS DE DERECHO:\n")
                        f_out.write(fundamentos)
                        f_out.write("\n" + "="*80 + "\n\n")
                        log.append(f" ✓ Guardada — ECLI: {meta['ECLI']}")
                except Exception as e:
                    # Per-document best effort: network and PDF-parsing errors
                    # are logged so the remaining documents are still processed.
                    log.append(f" ✗ Error: {e}")
                # Pause between documents; nothing follows the last one, so
                # waiting there would only delay the result.
                if i < total:
                    time.sleep(DELAY)

    log.append(f"\nFichero generado: {output_path}")
    return "\n".join(log), output_path
# Gradio front end: three inputs, a trigger button, and two outputs (the
# execution log and the generated file) wired to scrape_cendoj.
with gr.Blocks(title="Scraper CENDOJ") as demo:
    gr.Markdown("# Scraper CENDOJ — Tribunal Supremo")
    gr.Markdown("Descarga y extrae fundamentos de derecho de sentencias del TS.")

    # Search parameters, laid out side by side.
    with gr.Row():
        query = gr.Textbox(
            value="responsabilidad patrimonial",
            label="Texto de búsqueda",
        )
        jurisdiccion = gr.Textbox(value="PENAL", label="Jurisdicción")
        num = gr.Slider(
            label="Número de resultados",
            minimum=1,
            maximum=20,
            step=1,
            value=10,
        )

    btn = gr.Button("Buscar y descargar", variant="primary")

    # Outputs filled in by scrape_cendoj's (log text, file path) return value.
    log = gr.Textbox(lines=20, label="Log de ejecución")
    fichero = gr.File(label="Descargar TXT")

    btn.click(
        fn=scrape_cendoj,
        inputs=[query, jurisdiccion, num],
        outputs=[log, fichero],
    )

# Listen on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)