# Web-viewer residue preserved as a comment (Spaces status: Sleeping;
# file size: 5,229 bytes; revisions: aa4466c 6c6b428 aa4466c).
# The viewer's line-number gutter (1-149) was dropped.
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pypdf import PdfReader
import time
import re
import io
# Endpoints of the site being scraped; the two search URLs hang off BASE_URL.
BASE_URL = "https://www.poderjudicial.es"
SEARCH_URL = f"{BASE_URL}/search/indexAN.jsp"
POST_URL = f"{BASE_URL}/search/search.action"

# Seconds to pause before requesting the PDF.
DELAY = 6

# Browser-like headers sent with every request. The Referer points at the
# search page itself.
# NOTE(review): "br" is advertised in Accept-Encoding; confirm the runtime has
# a Brotli decoder available, otherwise a Brotli-compressed response cannot be
# decoded by the HTTP client.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "es-ES,es;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Referer": SEARCH_URL,
}
# Form fields posted to the search endpoint. The values are site-specific
# codes copied verbatim.
# NOTE(review): the meaning of the TIPOORGANOPUB codes and of the "AN"
# database selector is not visible in this file — confirm against the site.
PAYLOAD = {
    "action": "query",
    "sort": "IN_FECHARESOLUCION:decreasing",
    "recordsPerPage": "10",
    "databasematch": "AN",
    "start": "1",
    # Free-text query.
    "TEXT": "responsabilidad patrimonial",
    "JURISDICCION": "|PENAL|",
    "TIPOORGANOPUB": "|11|12|13|14|15|16|",
    "field": "JURISDICCION",
    "idtab": "jurisprudencia",
    "org": "",
}
def build_pdf_url(open_doc_link):
    """Build the PDF download URL that corresponds to an ``openDocument`` link.

    The last two ``/``-separated components of the link are used as the
    ``reference`` and ``optimize`` query parameters of the download endpoint.

    Args:
        open_doc_link: Link to a document page. A trailing slash, a query
            string and a fragment are ignored.

    Returns:
        The ``contenidos.action`` URL with ``action=accessToPDF`` for that
        document.

    Raises:
        IndexError: If the link has fewer than two ``/``-separated components.
    """
    # Local import: the module only imports ``urljoin`` from urllib.parse.
    from urllib.parse import urlencode

    # Drop any fragment/query first so they cannot leak into ``optimize``.
    sin_fragmento = open_doc_link.partition("#")[0]
    ruta = sin_fragmento.partition("?")[0]

    partes = ruta.rstrip("/").split("/")
    reference = partes[-2]
    optimize = partes[-1]

    # urlencode escapes the two extracted values; dict order fixes the
    # parameter order in the resulting query string.
    query = urlencode(
        {
            "action": "accessToPDF",
            "publicinterface": "true",
            "tab": "AN",
            "reference": reference,
            "encode": "true",
            "optimize": optimize,
            "databasematch": "AN",
        }
    )
    return f"https://www.poderjudicial.es/search/contenidos.action?{query}"
def extraer_texto_pdf(contenido_bytes):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        contenido_bytes: Raw bytes of the PDF document.

    Returns:
        One string with the extracted text of each page separated by ``\\n``.
        A page whose extraction yields nothing contributes an empty string.
    """
    buffer = io.BytesIO(contenido_bytes)
    paginas = PdfReader(buffer).pages
    textos = [pagina.extract_text() or "" for pagina in paginas]
    return "\n".join(textos)
# Opening heading is matched case-insensitively; the closing heading is
# matched case-sensitively so that an ordinary lowercase "fallo" inside the
# reasoning does not cut the section short. "F A L L O" (letter-spaced) and
# "FALLAMOS" are accepted as closing headings as well.
_FUNDAMENTOS_RE = re.compile(
    r"(?i:FUNDAMENTOS\s+DE\s+DERECHO)"
    r"(.*?)"
    r"(?=\b(?:F ?A ?L ?L ?O|FALLAMOS)\b|\Z)",
    re.DOTALL,
)


def extraer_fundamentos(texto):
    """Extract the "FUNDAMENTOS DE DERECHO" section from a judgment's text.

    The section starts right after the first "FUNDAMENTOS DE DERECHO" heading
    (any letter case) and ends at the first upper-case "FALLO", "F A L L O"
    or "FALLAMOS" heading, or at the end of the text when none is present.

    Args:
        texto: Full text of the judgment.

    Returns:
        The section text with surrounding whitespace stripped, or the literal
        message ``"NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"`` when the
        opening heading is not found.
    """
    coincidencia = _FUNDAMENTOS_RE.search(texto)
    if coincidencia is None:
        return "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"
    return coincidencia.group(1).strip()
def extraer_metadatos(texto):
    """Pull the ECLI, reporting judge and date out of a judgment's text.

    Each field is located with its own regular expression; the first match
    wins.

    Args:
        texto: Full text of the judgment.

    Returns:
        A dict with the keys ``"ECLI"``, ``"PONENTE"`` and ``"FECHA"``. A
        field that is not found is set to ``"N/D"``.
    """
    patrones = (
        ("ECLI", r'ECLI[:\s]+(ES\S+)'),
        # Upper-case letters and spaces only, so the capture stops at the
        # first lower-case letter, digit, punctuation mark or line break.
        ("PONENTE", r'Ponente[:\s]+([A-ZÁÉÍÓÚÑ ]+)'),
        ("FECHA", r'Fecha[:\s]+(\d{2}/\d{2}/\d{4})'),
    )

    meta = {}
    for clave, patron in patrones:
        hallazgo = re.search(patron, texto)
        # Stripping only matters for PONENTE; the other captures cannot
        # contain whitespace.
        meta[clave] = hallazgo.group(1).strip() if hallazgo else "N/D"
    return meta
# Seconds allowed for each HTTP request before it is abandoned; without a
# timeout a stalled server would block the UI callback indefinitely.
_REQUEST_TIMEOUT = 30


def scrape_una_sentencia():
    """Fetch the first search result and save its legal grounds to a file.

    Steps: open the search page, post the search form, take the first
    ``openDocument`` link in the response, download the corresponding PDF,
    extract its text, and write the metadata plus the "FUNDAMENTOS DE
    DERECHO" section to ``/tmp/sentencia.txt``.

    Returns:
        A ``(log_text, output_path)`` tuple. ``log_text`` is the step-by-step
        log joined with newlines. ``output_path`` is the path of the written
        file, or ``None`` when no link is found, a request fails or returns
        an HTTP error status, or the download comes back as HTML.
    """
    log = []

    try:
        # The context manager closes the session's connections on exit.
        with requests.Session() as session:
            # Step 1: open the search page with the same session that will
            # post the form.
            log.append("Paso 1: inicializando sesión...")
            session.get(SEARCH_URL, headers=headers, timeout=_REQUEST_TIMEOUT)

            # Step 2: post the search form.
            log.append("Paso 2: ejecutando búsqueda POST...")
            response = session.post(
                POST_URL,
                headers=headers,
                data=PAYLOAD,
                timeout=_REQUEST_TIMEOUT,
            )
            log.append(f" Status: {response.status_code} — Tamaño: {len(response.text)} caracteres")
            log.append(f" Respuesta: {response.text[:500]}")
            if not response.ok:
                log.append(" ✗ La búsqueda devolvió un error HTTP")
                return "\n".join(log), None

            # Step 3: first anchor whose href mentions "openDocument".
            log.append("Paso 3: extrayendo primer enlace...")
            soup = BeautifulSoup(response.text, "html.parser")
            link = next(
                (
                    urljoin(BASE_URL, a["href"])
                    for a in soup.find_all("a", href=True)
                    if "openDocument" in a["href"]
                ),
                None,
            )
            if not link:
                log.append(" ✗ No se encontró ningún enlace")
                return "\n".join(log), None
            log.append(f" Enlace: {link}")

            # Step 4: derive the download URL from the document link.
            pdf_url = build_pdf_url(link)
            log.append(f"Paso 4: URL de descarga: {pdf_url}")

            # Step 5: download the PDF after a pause.
            log.append("Paso 5: descargando PDF...")
            time.sleep(DELAY)
            r_pdf = session.get(pdf_url, headers=headers, timeout=_REQUEST_TIMEOUT)
            content_type = r_pdf.headers.get("Content-Type", "")
            log.append(f" Content-Type: {content_type}")
            if not r_pdf.ok:
                log.append(f" ✗ La descarga devolvió el estado HTTP {r_pdf.status_code}")
                return "\n".join(log), None
            # An HTML body instead of a PDF is treated as a CAPTCHA page.
            if "text/html" in content_type:
                log.append(" ✗ CAPTCHA — no se obtuvo PDF")
                return "\n".join(log), None
            pdf_bytes = r_pdf.content
    except requests.RequestException as exc:
        # Timeouts, connection errors, etc.: keep the log gathered so far.
        log.append(f" ✗ Error de red: {exc}")
        return "\n".join(log), None

    # Step 6: extract the text, the metadata and the legal grounds.
    log.append("Paso 6: extrayendo texto del PDF...")
    texto = extraer_texto_pdf(pdf_bytes)
    meta = extraer_metadatos(texto)
    fund = extraer_fundamentos(texto)

    # Step 7: write the result file.
    output_path = "/tmp/sentencia.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(f"ENLACE: {link}\n")
        f.write(f"ECLI: {meta['ECLI']} | PONENTE: {meta['PONENTE']} | FECHA: {meta['FECHA']}\n")
        f.write("FUNDAMENTOS DE DERECHO:\n")
        f.write(fund)
        f.write("\n" + "=" * 80 + "\n")
    log.append(f" ✓ Guardada — ECLI: {meta['ECLI']}")
    return "\n".join(log), output_path
# Gradio UI: one button that runs the scraper, a textbox for its log and a
# file component for the generated result.
with gr.Blocks(title="CENDOJ — Una sentencia") as demo:
    gr.Markdown("# CENDOJ — Descarga de prueba")
    gr.Markdown("Busca una sentencia del Tribunal Supremo (Penal) sobre responsabilidad patrimonial y descarga sus fundamentos de derecho.")
    btn = gr.Button("Ejecutar", variant="primary")
    log = gr.Textbox(label="Log", lines=15)
    fichero = gr.File(label="Descargar resultado")
    # The callback takes no inputs and returns (log text, file path or None).
    btn.click(fn=scrape_una_sentencia, inputs=[], outputs=[log, fichero])

# Start the server only when run as a script, so importing this module (for
# tests or tooling) does not launch it. Listens on all interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|