Spaces:

vcasas
/

testing_my

Sleeping

File size: 5,229 Bytes

import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pypdf import PdfReader
import time
import re
import io

# --- Remote endpoints --------------------------------------------------------
# Every URL lives on the same host, so the two search URLs are derived from it.
BASE_URL = "https://www.poderjudicial.es"
SEARCH_URL = BASE_URL + "/search/indexAN.jsp"
POST_URL = BASE_URL + "/search/search.action"

# Pause, in seconds, applied before the PDF download request.
DELAY = 6

# Browser-like headers attached to every request made by the scraper.
# NOTE(review): "br" is advertised in Accept-Encoding; requests/urllib3 only
# decode Brotli bodies when a brotli package is installed — confirm the
# runtime provides one, otherwise a Brotli response would be unreadable.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "es-ES,es;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    # The search page is presented as the page the requests originate from.
    "Referer": SEARCH_URL,
}

# Form fields sent as the body of the search POST. The meaning of each field
# is defined by the remote service; TEXT presumably carries the query phrase.
PAYLOAD = dict(
    action="query",
    sort="IN_FECHARESOLUCION:decreasing",
    recordsPerPage="10",
    databasematch="AN",
    start="1",
    TEXT="responsabilidad patrimonial",
    JURISDICCION="|PENAL|",
    TIPOORGANOPUB="|11|12|13|14|15|16|",
    field="JURISDICCION",
    idtab="jurisprudencia",
    org="",
)

def build_pdf_url(open_doc_link):
    """Derive the ``accessToPDF`` download URL from an ``openDocument`` link.

    Trailing slashes are ignored; the last two ``/``-separated segments of
    *open_doc_link* become the ``reference`` and ``optimize`` query values,
    in that order. Both are inserted verbatim (no URL-encoding is applied).

    Raises:
        IndexError: if the link has fewer than two ``/``-separated segments.
    """
    # Only the final two segments matter, so split from the right.
    segments = open_doc_link.rstrip("/").rsplit("/", 2)
    reference, optimize = segments[-2], segments[-1]
    query = "&".join((
        "action=accessToPDF",
        "publicinterface=true",
        "tab=AN",
        f"reference={reference}",
        "encode=true",
        f"optimize={optimize}",
        "databasematch=AN",
    ))
    return "https://www.poderjudicial.es/search/contenidos.action?" + query

def extraer_texto_pdf(contenido_bytes):
    """Return the text of a PDF given as raw bytes.

    Each page's extracted text is joined with a newline; a page for which
    ``extract_text()`` yields nothing contributes an empty string.
    """
    pdf = PdfReader(io.BytesIO(contenido_bytes))
    textos = []
    for pagina in pdf.pages:
        texto_pagina = pagina.extract_text()
        textos.append(texto_pagina if texto_pagina else "")
    return "\n".join(textos)

# Heading that opens the legal-reasoning section; matched in any letter case.
_FUNDAMENTOS_HEADING = re.compile(r'FUNDAMENTOS\s+DE\s+DERECHO', re.IGNORECASE)

# Heading that opens the ruling. It is intentionally case-sensitive and
# word-bounded: the ordinary word "fallo" appears inside the reasoning itself
# and must not end the section. "FALLAMOS" is accepted as an alternative
# heading. NOTE(review): assumes the heading is printed in capitals — confirm
# against real documents.
_FALLO_HEADING = re.compile(r'\b(?:FALLO|FALLAMOS)\b')


def extraer_fundamentos(texto):
    """Extract the "FUNDAMENTOS DE DERECHO" section from a judgment's text.

    The section starts right after the first "FUNDAMENTOS DE DERECHO" heading
    (any letter case) and runs up to the next upper-case "FALLO" / "FALLAMOS"
    heading, or to the end of the text when no such heading follows.

    Args:
        texto: Full plain text of the judgment.

    Returns:
        The section body with surrounding whitespace stripped, or the literal
        "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO" when the opening heading is
        not present.
    """
    inicio = _FUNDAMENTOS_HEADING.search(texto)
    if inicio is None:
        return "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"

    # Look for the closing heading only after the opening one.
    fin = _FALLO_HEADING.search(texto, inicio.end())
    cuerpo = texto[inicio.end():fin.start()] if fin else texto[inicio.end():]
    return cuerpo.strip()

def extraer_metadatos(texto):
    """Pull the ECLI, reporting judge and date out of a judgment's text.

    Returns a dict with the keys "ECLI", "PONENTE" and "FECHA" (in that
    order). Each value is the first regex match found in *texto*, or "N/D"
    when the corresponding pattern does not match. Only the PONENTE value is
    whitespace-stripped; it is captured as a run of upper-case letters and
    spaces.
    """
    def primer_grupo(patron):
        # First capture group of the first match, or None when nothing matches.
        hallazgo = re.search(patron, texto)
        return hallazgo.group(1) if hallazgo else None

    ecli = primer_grupo(r'ECLI[:\s]+(ES\S+)')
    ponente = primer_grupo(r'Ponente[:\s]+([A-ZÁÉÍÓÚÑ ]+)')
    fecha = primer_grupo(r'Fecha[:\s]+(\d{2}/\d{2}/\d{4})')

    return {
        "ECLI": "N/D" if ecli is None else ecli,
        "PONENTE": "N/D" if ponente is None else ponente.strip(),
        "FECHA": "N/D" if fecha is None else fecha,
    }

# (connect, read) timeouts in seconds applied to every request. Without a
# timeout `requests` waits indefinitely, which would hang the UI handler.
_REQUEST_TIMEOUT = (10, 60)


def _primer_enlace_documento(html):
    """Return the absolute URL of the first ``openDocument`` link in *html*, or None."""
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all("a", href=True):
        if "openDocument" in a["href"]:
            return urljoin(BASE_URL, a["href"])
    return None


def _guardar_resultado(output_path, link, meta, fund):
    """Write the link, the metadata line and the legal grounds to *output_path* as UTF-8."""
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(f"ENLACE: {link}\n")
        f.write(f"ECLI: {meta['ECLI']} | PONENTE: {meta['PONENTE']} | FECHA: {meta['FECHA']}\n")
        f.write("FUNDAMENTOS DE DERECHO:\n")
        f.write(fund)
        f.write("\n" + "="*80 + "\n")


def _ejecutar_pasos(session, log):
    """Run the search/download/extract steps on *session*, logging into *log*.

    Returns the path of the written result file, or None when a step ends
    early (no link found, CAPTCHA page, or a body that is not a PDF).
    Network errors are not handled here; they propagate to the caller.
    """
    # Step 1: hit the search page first; the response itself is discarded
    # (presumably this lets the session pick up cookies — confirm).
    log.append("Paso 1: inicializando sesión...")
    session.get(SEARCH_URL, headers=headers, timeout=_REQUEST_TIMEOUT)

    # Step 2: run the search via POST.
    log.append("Paso 2: ejecutando búsqueda POST...")
    response = session.post(
        POST_URL, headers=headers, data=PAYLOAD, timeout=_REQUEST_TIMEOUT
    )
    log.append(f"  Status: {response.status_code} — Tamaño: {len(response.text)} caracteres")
    log.append(f"  Respuesta: {response.text[:500]}")

    # Step 3: take the first result link.
    log.append("Paso 3: extrayendo primer enlace...")
    link = _primer_enlace_documento(response.text)
    if not link:
        log.append("  ✗ No se encontró ningún enlace")
        return None
    log.append(f"  Enlace: {link}")

    # Step 4: build the download URL.
    pdf_url = build_pdf_url(link)
    log.append(f"Paso 4: URL de descarga: {pdf_url}")

    # Step 5: download the PDF after a fixed pause.
    log.append("Paso 5: descargando PDF...")
    time.sleep(DELAY)
    r_pdf = session.get(pdf_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    content_type = r_pdf.headers.get("Content-Type", "")
    log.append(f"  Content-Type: {content_type}")

    if "text/html" in content_type:
        log.append("  ✗ CAPTCHA — no se obtuvo PDF")
        return None

    # Any other non-PDF body (e.g. an error payload) would make the PDF parser
    # raise; report it in the log instead. The "%PDF" marker is looked for
    # near the start of the body rather than strictly at byte 0.
    if b"%PDF" not in r_pdf.content[:1024]:
        log.append(f"  ✗ La respuesta no es un PDF (HTTP {r_pdf.status_code})")
        return None

    # Step 6: extract text, metadata and legal grounds.
    log.append("Paso 6: extrayendo texto del PDF...")
    texto = extraer_texto_pdf(r_pdf.content)
    meta = extraer_metadatos(texto)
    fund = extraer_fundamentos(texto)

    # Step 7: save the result.
    output_path = "/tmp/sentencia.txt"
    _guardar_resultado(output_path, link, meta, fund)

    log.append(f"  ✓ Guardada — ECLI: {meta['ECLI']}")
    return output_path


def scrape_una_sentencia():
    """Search, download one judgment PDF and save its legal grounds to a file.

    Returns:
        A ``(log_text, output_path)`` tuple. ``log_text`` is the step-by-step
        log joined with newlines. ``output_path`` is the path of the written
        text file, or None when no file was produced (no result link, CAPTCHA
        page, non-PDF response, or a network error/timeout, all of which are
        reported in the log rather than raised).
    """
    log = []
    output_path = None
    try:
        # The context manager closes the session's pooled connections on exit.
        with requests.Session() as session:
            output_path = _ejecutar_pasos(session, log)
    except requests.RequestException as exc:
        # Connection failures and timeouts are reported like every other
        # failure of this function: a "✗" log line and no file.
        log.append(f"  ✗ Error de red: {exc}")
    return "\n".join(log), output_path

# Gradio UI: a single button that runs scrape_una_sentencia and displays the
# log text and the resulting file. Components are created in display order.
with gr.Blocks(title="CENDOJ — Una sentencia") as demo:
    gr.Markdown("# CENDOJ — Descarga de prueba")
    gr.Markdown("Busca una sentencia del Tribunal Supremo (Penal) sobre responsabilidad patrimonial y descarga sus fundamentos de derecho.")
    btn     = gr.Button("Ejecutar", variant="primary")
    log     = gr.Textbox(label="Log", lines=15)
    fichero = gr.File(label="Descargar resultado")
    # The handler takes no inputs; its (log text, file path or None) return
    # value is mapped onto the two output components in this order.
    btn.click(fn=scrape_una_sentencia, inputs=[], outputs=[log, fichero])

# The launch call is unguarded, so the server (0.0.0.0, port 7860) is started
# whenever this module is executed or imported.
demo.launch(server_name="0.0.0.0", server_port=7860)