Spaces:

vcasas
/

testing_my

Sleeping

File size: 5,505 Bytes

import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pypdf import PdfReader
import time
import re
import io

# Endpoints of the CENDOJ search site used by the scraper.
BASE_URL   = "https://www.poderjudicial.es"
SEARCH_URL = "https://www.poderjudicial.es/search/indexAN.jsp"
POST_URL   = "https://www.poderjudicial.es/search/search.action"
# Seconds to wait between consecutive document downloads.
DELAY      = 6

# Browser-like request headers sent with every HTTP call.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "es-ES,es;q=0.9",
    # "br" is deliberately not advertised: Requests only decodes Brotli when
    # an optional Brotli package is installed, so offering it could yield an
    # undecodable response body.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    # Requests are presented as originating from the search page itself.
    "Referer": SEARCH_URL,
}

def build_pdf_url(open_doc_link):
    """Return the PDF download URL that corresponds to an ``openDocument`` link.

    The last two ``/``-separated segments of the link (a trailing slash is
    ignored) become the ``reference`` and ``optimize`` query parameters.
    Raises ``IndexError`` when the link has fewer than two segments.
    """
    segmentos = open_doc_link.rstrip("/").rsplit("/", 2)
    reference = segmentos[-2]
    optimize = segmentos[-1]
    parametros = (
        "action=accessToPDF",
        "publicinterface=true",
        "tab=AN",
        f"reference={reference}",
        "encode=true",
        f"optimize={optimize}",
        "databasematch=AN",
    )
    return (
        "https://www.poderjudicial.es/search/contenidos.action?"
        + "&".join(parametros)
    )

def extraer_texto_pdf(contenido_bytes):
    """Return the text of every page of a PDF, joined with newlines.

    ``contenido_bytes`` is the raw PDF content. A page for which no text is
    extracted contributes an empty string.
    """
    documento = PdfReader(io.BytesIO(contenido_bytes))
    textos = []
    for pagina in documento.pages:
        extraido = pagina.extract_text()
        textos.append(extraido if extraido else "")
    return "\n".join(textos)

def extraer_fundamentos(texto):
    """Extract the "FUNDAMENTOS DE DERECHO" section from a judgment's text.

    The section starts right after the first "FUNDAMENTOS DE DERECHO" heading
    (matched case-insensitively) and ends at the ruling heading, or at the end
    of the text when no ruling heading is found.

    The ruling heading is recognised as:
      * upper-case ``FALLO`` / ``FALLAMOS`` as a whole word, also when the
        letters are spaced out (``F A L L O``); or
      * ``fallo`` / ``fallamos`` in any case when it stands alone on a line.

    An ordinary lower-case mention of "fallo" inside a sentence does not end
    the section, so the reasoning is not truncated when it refers to the
    ruling of another judgment.

    Returns the stripped section text, or the literal
    "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO" when the heading is absent.
    """
    patron = re.search(
        # Section heading: any case.
        r'(?i:FUNDAMENTOS\s+DE\s+DERECHO)'
        r'(.*?)'
        r'(?='
        # Upper-case ruling heading, optionally letter-spaced.
        r'\b(?:F\s?A\s?L\s?L\s?O|FALLAMOS)\b'
        # Ruling heading alone on its own line, any case.
        r'|(?im:^[ \t]*(?:fallo|fallamos)[ \t]*[:.]?[ \t]*$)'
        # No ruling heading at all: take everything up to the end.
        r'|\Z)',
        texto,
        re.DOTALL,
    )
    if patron:
        return patron.group(1).strip()
    return "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"

def extraer_metadatos(texto):
    """Pull the ECLI, reporting judge and date out of a judgment's text.

    Returns a dict with the keys "ECLI", "PONENTE" and "FECHA". Each value is
    the first match of its pattern in ``texto``, or "N/D" when there is none.
    The judge's name is stripped of surrounding whitespace.
    """
    # (key, pattern, whether the captured value is stripped)
    campos = (
        ("ECLI", r'ECLI[:\s]+(ES\S+)', False),
        ("PONENTE", r'Ponente[:\s]+([A-ZÁÉÍÓÚÑ ]+)', True),
        ("FECHA", r'Fecha[:\s]+(\d{2}/\d{2}/\d{4})', False),
    )
    meta = {}
    for clave, expresion, recortar in campos:
        hallazgo = re.search(expresion, texto)
        if hallazgo is None:
            meta[clave] = "N/D"
            continue
        valor = hallazgo.group(1)
        meta[clave] = valor.strip() if recortar else valor
    return meta

def scrape_cendoj(query, jurisdiccion, num_resultados, timeout=30):
    """Run a CENDOJ search and write each hit's legal grounds to a text file.

    Posts the search form, collects every ``openDocument`` link found in the
    result page, downloads the PDF of each one and appends its link, metadata
    and "FUNDAMENTOS DE DERECHO" section to ``/tmp/sentencias_cendoj.txt``.

    Args:
        query: Free-text search string.
        jurisdiccion: Jurisdiction code sent in the ``JURISDICCION`` field.
        num_resultados: Number of records requested per page; coerced to int.
        timeout: Seconds allowed for each HTTP request before it is aborted.

    Returns:
        A ``(log_text, output_path)`` tuple: the execution log joined with
        newlines and the path of the generated file.

    Raises:
        requests.RequestException: if loading the search page or posting the
            search fails (including an HTTP error status on the search
            response). Failures on individual documents are logged and
            skipped instead of raised.
    """
    log = []
    output_path = "/tmp/sentencias_cendoj.txt"

    payload = {
        "action": "query",
        "sort": "IN_FECHARESOLUCION:decreasing",
        # A UI slider may deliver a float (e.g. 10.0); send a plain integer.
        "recordsPerPage": str(int(num_resultados)),
        "databasematch": "AN",
        "start": "1",
        "TEXT": query,
        "JURISDICCION": f"|{jurisdiccion}|",
        "TIPOORGANOPUB": "|11|12|13|14|15|16|",
        "field": "JURISDICCION",
        "idtab": "jurisprudencia",
        "org": "",
    }

    # The context manager releases the pooled connections when done.
    with requests.Session() as session:
        # Load the search page first on the same session — presumably so the
        # site's cookies are in place before the POST (TODO confirm).
        session.get(SEARCH_URL, headers=headers, timeout=timeout)

        log.append("Ejecutando búsqueda...")
        response = session.post(
            POST_URL, headers=headers, data=payload, timeout=timeout
        )
        response.raise_for_status()
        result_html = response.text
        log.append(f"Respuesta recibida: {len(result_html)} caracteres")

        soup_results = BeautifulSoup(result_html, "html.parser")
        # dict.fromkeys de-duplicates while keeping the order in which the
        # links appear in the page (the search is sorted by date).
        links = list(dict.fromkeys(
            urljoin(BASE_URL, a["href"])
            for a in soup_results.find_all("a", href=True)
            if "openDocument" in a["href"]
        ))
        total = len(links)
        log.append(f"Enlaces detectados: {total}")

        with open(output_path, "w", encoding="utf-8") as f_out:
            for i, link in enumerate(links, start=1):
                # Pause between consecutive downloads only; there is nothing
                # to wait for before the first one or after the last one.
                if i > 1:
                    time.sleep(DELAY)

                log.append(f"[{i}/{total}] Procesando: {link}")
                try:
                    pdf_url = build_pdf_url(link)
                    r_pdf = session.get(pdf_url, headers=headers, timeout=timeout)
                    r_pdf.raise_for_status()

                    # An HTML body instead of a PDF is treated as a CAPTCHA page.
                    if "text/html" in r_pdf.headers.get("Content-Type", ""):
                        log.append("  ⚠ CAPTCHA — no se obtuvo PDF")
                        continue

                    texto_completo = extraer_texto_pdf(r_pdf.content)
                    meta = extraer_metadatos(texto_completo)
                    fundamentos = extraer_fundamentos(texto_completo)

                    f_out.write(f"ENLACE: {link}\n")
                    f_out.write(f"ECLI: {meta['ECLI']} | PONENTE: {meta['PONENTE']} | FECHA: {meta['FECHA']}\n")
                    f_out.write("FUNDAMENTOS DE DERECHO:\n")
                    f_out.write(fundamentos)
                    f_out.write("\n" + "="*80 + "\n\n")

                    log.append(f"  ✓ Guardada — ECLI: {meta['ECLI']}")

                except Exception as e:
                    # Best effort: one bad document must not abort the batch.
                    log.append(f"  ✗ Error: {e}")

    log.append(f"\nFichero generado: {output_path}")
    return "\n".join(log), output_path

# Gradio UI: three search inputs, a run button, the execution log and the
# generated file offered for download.
with gr.Blocks(title="Scraper CENDOJ") as demo:
    gr.Markdown("# Scraper CENDOJ — Tribunal Supremo")
    gr.Markdown("Descarga y extrae fundamentos de derecho de sentencias del TS.")

    with gr.Row():
        query        = gr.Textbox(label="Texto de búsqueda", value="responsabilidad patrimonial")
        jurisdiccion = gr.Textbox(label="Jurisdicción", value="PENAL")
        num          = gr.Slider(minimum=1, maximum=20, value=10, step=1, label="Número de resultados")

    btn    = gr.Button("Buscar y descargar", variant="primary")
    log    = gr.Textbox(label="Log de ejecución", lines=20)
    fichero = gr.File(label="Descargar TXT")

    # scrape_cendoj returns (log text, file path), matching the two outputs.
    btn.click(fn=scrape_cendoj, inputs=[query, jurisdiccion, num], outputs=[log, fichero])

# Start the server only when the file is executed as a script, so importing
# the module (e.g. to reuse or test the helpers) does not launch the app.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)