# testing_my / app.py
# Author: vcasas
# Last commit: "Update app.py" (6c6b428, verified)
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pypdf import PdfReader
import time
import re
import io
# Site root; also used to resolve relative result links.
BASE_URL = "https://www.poderjudicial.es"
# Search landing page: fetched first to initialise the session and sent as Referer.
SEARCH_URL = BASE_URL + "/search/indexAN.jsp"
# Endpoint that receives the search form POST.
POST_URL = BASE_URL + "/search/search.action"
# Seconds to wait before requesting the PDF.
DELAY = 6

# Browser-like request headers sent with every request.
# NOTE(review): advertising "br" assumes the HTTP stack can decode Brotli
# responses — confirm a Brotli decoder is installed in the runtime.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "es-ES,es;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Referer": SEARCH_URL,
}

# Form fields of the fixed search query posted to POST_URL.
PAYLOAD = dict(
    action="query",
    sort="IN_FECHARESOLUCION:decreasing",
    recordsPerPage="10",
    databasematch="AN",
    start="1",
    TEXT="responsabilidad patrimonial",
    JURISDICCION="|PENAL|",
    TIPOORGANOPUB="|11|12|13|14|15|16|",
    field="JURISDICCION",
    idtab="jurisprudencia",
    org="",
)
def build_pdf_url(open_doc_link):
    """Derive the PDF download URL from an ``openDocument`` result link.

    The last two ``/``-separated segments of the link (ignoring trailing
    slashes) are used as the ``reference`` and ``optimize`` query values.

    Args:
        open_doc_link: Result link ending in ``.../<reference>/<optimize>``.

    Returns:
        The ``contenidos.action`` URL that requests the document as a PDF.

    Raises:
        IndexError: If the link has fewer than two segments.
    """
    # Only the two trailing segments matter, so split from the right.
    segmentos = open_doc_link.rstrip("/").rsplit("/", 2)
    reference = segmentos[-2]
    optimize = segmentos[-1]
    parametros = [
        "action=accessToPDF",
        "publicinterface=true",
        "tab=AN",
        f"reference={reference}",
        "encode=true",
        f"optimize={optimize}",
        "databasematch=AN",
    ]
    return (
        "https://www.poderjudicial.es/search/contenidos.action?"
        + "&".join(parametros)
    )
def extraer_texto_pdf(contenido_bytes):
    """Return the text of a PDF supplied as raw bytes.

    Each page's extracted text is joined with a newline; a page whose
    extraction yields nothing contributes an empty string.

    Args:
        contenido_bytes: The PDF file content as bytes.

    Returns:
        The concatenated text of all pages.
    """
    documento = PdfReader(io.BytesIO(contenido_bytes))
    textos = []
    for pagina in documento.pages:
        texto_pagina = pagina.extract_text()
        textos.append(texto_pagina if texto_pagina else "")
    return "\n".join(textos)
# Heading that opens the legal-grounds section (any letter case).
_FUNDAMENTOS_RE = re.compile(r'FUNDAMENTOS\s+DE\s+DERECHO', re.IGNORECASE)

# Heading that opens the ruling section. Either a line holding nothing but
# "FALLO" / "F A L L O" / "FALLAMOS" (any case, optional ":" or "."), or an
# upper-case "FALLO"/"FALLAMOS" at the start of a line followed by text.
# The word "fallo" inside a sentence must NOT end the section.
_FALLO_RE = re.compile(
    r'^[ \t]*(?:'
    r'(?i:F[ \t]*A[ \t]*L[ \t]*L[ \t]*(?:O|A[ \t]*M[ \t]*O[ \t]*S))'
    r'[ \t]*[:.]?[ \t\r]*$'
    r'|(?:FALLO|FALLAMOS)\b'
    r')',
    re.MULTILINE,
)


def extraer_fundamentos(texto):
    """Extract the "FUNDAMENTOS DE DERECHO" section from a ruling's text.

    The section starts right after the first "FUNDAMENTOS DE DERECHO" heading
    and ends at the next ruling heading ("FALLO", "F A L L O", "FALLAMOS") or
    at the end of the text when no such heading follows.

    Args:
        texto: Full text of the ruling.

    Returns:
        The section text with surrounding whitespace removed, or the literal
        "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO" when the opening heading
        is absent.
    """
    inicio = _FUNDAMENTOS_RE.search(texto)
    if inicio is None:
        return "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"

    # Look for the closing heading only after the opening one.
    fin = _FALLO_RE.search(texto, inicio.end())
    limite = fin.start() if fin else len(texto)
    return texto[inicio.end():limite].strip()
def extraer_metadatos(texto):
    """Pull the ECLI, reporting judge and date out of a ruling's text.

    Each field is located with its own regular expression; a field that is
    not found is reported as "N/D".

    Args:
        texto: Full text of the ruling.

    Returns:
        A dict with the keys "ECLI", "PONENTE" and "FECHA".
    """
    campos = (
        ("ECLI", r'ECLI[:\s]+(ES\S+)'),
        ("PONENTE", r'Ponente[:\s]+([A-ZÁÉÍÓÚÑ ]+)'),
        ("FECHA", r'Fecha[:\s]+(\d{2}/\d{2}/\d{4})'),
    )
    meta = {}
    for clave, patron in campos:
        hallazgo = re.search(patron, texto)
        if hallazgo is None:
            meta[clave] = "N/D"
        else:
            # Only the PONENTE capture can carry surrounding spaces; the
            # other captures contain no whitespace, so strip() is a no-op.
            meta[clave] = hallazgo.group(1).strip()
    return meta
def scrape_una_sentencia():
    """Search CENDOJ, download the first result as PDF and save its legal grounds.

    Steps: open a session on the search page, POST the fixed ``PAYLOAD``
    query, take the first ``openDocument`` link in the response, derive the
    PDF URL, wait ``DELAY`` seconds, download the PDF, extract its metadata
    and "FUNDAMENTOS DE DERECHO" section, and write them to
    ``/tmp/sentencia.txt``.

    Every HTTP request carries a timeout, and the session is closed when the
    network phase ends. Network failures and unreadable PDFs are reported in
    the log instead of propagating as exceptions.

    Returns:
        A ``(log, path)`` tuple. ``log`` is the newline-joined step log;
        ``path`` is the output file path, or ``None`` when a step failed
        (network error, no result link, HTML returned instead of a PDF,
        unreadable PDF).
    """
    from pypdf.errors import PdfReadError

    timeout = 30  # seconds per HTTP request; without it a stalled server hangs the UI
    output_path = "/tmp/sentencia.txt"
    log = []

    with requests.Session() as session:
        # Steps 1-2: initialise the session (cookies) and run the search POST.
        log.append("Paso 1: inicializando sesión...")
        try:
            session.get(SEARCH_URL, headers=headers, timeout=timeout)
            log.append("Paso 2: ejecutando búsqueda POST...")
            response = session.post(
                POST_URL, headers=headers, data=PAYLOAD, timeout=timeout
            )
        except requests.RequestException as exc:
            log.append(f" ✗ Error de red: {exc}")
            return "\n".join(log), None
        log.append(f" Status: {response.status_code} — Tamaño: {len(response.text)} caracteres")
        log.append(f" Respuesta: {response.text[:500]}")

        # Step 3: first anchor whose href points at an openDocument page.
        log.append("Paso 3: extrayendo primer enlace...")
        soup = BeautifulSoup(response.text, "html.parser")
        link = next(
            (
                urljoin(BASE_URL, a["href"])
                for a in soup.find_all("a", href=True)
                if "openDocument" in a["href"]
            ),
            None,
        )
        if not link:
            log.append(" ✗ No se encontró ningún enlace")
            return "\n".join(log), None
        log.append(f" Enlace: {link}")

        # Step 4: build the download URL.
        pdf_url = build_pdf_url(link)
        log.append(f"Paso 4: URL de descarga: {pdf_url}")

        # Step 5: download the PDF after the configured pause.
        log.append("Paso 5: descargando PDF...")
        time.sleep(DELAY)
        try:
            r_pdf = session.get(pdf_url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            log.append(f" ✗ Error de red: {exc}")
            return "\n".join(log), None

    content_type = r_pdf.headers.get("Content-Type", "")
    log.append(f" Content-Type: {content_type}")
    # An HTML answer where a PDF was expected is treated as a CAPTCHA page.
    if "text/html" in content_type:
        log.append(" ✗ CAPTCHA — no se obtuvo PDF")
        return "\n".join(log), None

    # Step 6: extract the text, metadata and legal grounds.
    log.append("Paso 6: extrayendo texto del PDF...")
    try:
        texto = extraer_texto_pdf(r_pdf.content)
    except PdfReadError as exc:
        log.append(f" ✗ PDF ilegible: {exc}")
        return "\n".join(log), None
    meta = extraer_metadatos(texto)
    fund = extraer_fundamentos(texto)

    # Step 7: save the result.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(f"ENLACE: {link}\n")
        f.write(f"ECLI: {meta['ECLI']} | PONENTE: {meta['PONENTE']} | FECHA: {meta['FECHA']}\n")
        f.write("FUNDAMENTOS DE DERECHO:\n")
        f.write(fund)
        f.write("\n" + "=" * 80 + "\n")
    log.append(f" ✓ Guardada — ECLI: {meta['ECLI']}")
    return "\n".join(log), output_path
# Gradio UI: one button runs the scraper; its log and output file are shown below.
with gr.Blocks(title="CENDOJ — Una sentencia") as demo:
    gr.Markdown("# CENDOJ — Descarga de prueba")
    gr.Markdown(
        "Busca una sentencia del Tribunal Supremo (Penal) sobre "
        "responsabilidad patrimonial y descarga sus fundamentos de derecho."
    )
    btn = gr.Button("Ejecutar", variant="primary")
    log = gr.Textbox(label="Log", lines=15)
    fichero = gr.File(label="Descargar resultado")
    # The handler takes no inputs and returns (log text, file path or None).
    btn.click(
        fn=scrape_una_sentencia,
        inputs=[],
        outputs=[log, fichero],
    )

# Listen on all interfaces on port 7860.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
)