testing_my / app.py0
vcasas's picture
Rename app.py to app.py0
6b12848 verified
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pypdf import PdfReader
import time
import re
import io
# Endpoints of the CENDOJ search site used by the scraper.
BASE_URL = "https://www.poderjudicial.es"
SEARCH_URL = "https://www.poderjudicial.es/search/indexAN.jsp"
POST_URL = "https://www.poderjudicial.es/search/search.action"

# Seconds to pause between consecutive document downloads.
DELAY = 6

# Browser-like request headers sent with every request to the site.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "es-ES,es;q=0.9",
    # "br" is deliberately not advertised: requests/urllib3 only decode Brotli
    # when an optional brotli package is installed, and an undecoded body
    # would break both the HTML parsing and the PDF extraction.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Referer": SEARCH_URL,
}
def build_pdf_url(open_doc_link):
    """Build the PDF download URL for an ``openDocument`` result link.

    The last two non-empty path segments of ``open_doc_link`` are used as the
    ``reference`` and ``optimize`` query parameters of the
    ``contenidos.action`` endpoint.

    Args:
        open_doc_link: Absolute or relative link to a document; any query
            string or fragment it carries is ignored.

    Returns:
        The ``accessToPDF`` URL as a string.

    Raises:
        ValueError: If the link's path has fewer than two segments.
    """
    from urllib.parse import urlencode, urlsplit

    # Look only at the path so "?x=y" or "#frag" cannot end up in `optimize`.
    segmentos = [s for s in urlsplit(open_doc_link).path.split("/") if s]
    if len(segmentos) < 2:
        raise ValueError(
            f"Enlace sin segmentos reference/optimize: {open_doc_link!r}"
        )
    reference, optimize = segmentos[-2:]

    # urlencode escapes the values; dict order fixes the parameter order.
    parametros = {
        "action": "accessToPDF",
        "publicinterface": "true",
        "tab": "AN",
        "reference": reference,
        "encode": "true",
        "optimize": optimize,
        "databasematch": "AN",
    }
    return (
        "https://www.poderjudicial.es/search/contenidos.action?"
        + urlencode(parametros)
    )
def extraer_texto_pdf(contenido_bytes):
    """Return the text of a PDF given its raw bytes.

    The text of each page is extracted in page order and the pages are
    joined with a newline.
    """
    documento = PdfReader(io.BytesIO(contenido_bytes))
    textos = []
    for pagina in documento.pages:
        # A falsy extraction result is replaced by an empty string so the
        # join below always receives str values.
        textos.append(pagina.extract_text() or "")
    return "\n".join(textos)
def extraer_fundamentos(texto):
    """Extract the "FUNDAMENTOS DE DERECHO" section from a judgment's text.

    The section starts right after the first "FUNDAMENTOS DE DERECHO" header
    (matched case-insensitively) and runs up to the ruling heading, or to the
    end of the text when no such heading exists.

    The ruling heading is only recognised in upper case at the start of a
    line ("FALLO", "F A L L O" or "FALLAMOS"). Matching it case-insensitively
    anywhere would cut the section at the first ordinary use of the word
    "fallo" inside the reasoning itself.

    Args:
        texto: Full plain text of the judgment.

    Returns:
        The stripped section text, or the fixed message
        "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO" when the header is absent.
    """
    patron = re.search(
        r'(?i:FUNDAMENTOS\s+DE\s+DERECHO)'
        r'(.*?)'
        # MULTILINE makes ^ match at each line start; \Z (not $) is used for
        # "end of text" because $ would otherwise match at every line end.
        r'(?=^[ \t]*(?:F[ \t]?A[ \t]?L[ \t]?L[ \t]?O|FALLAMOS)\b|\Z)',
        texto,
        re.DOTALL | re.MULTILINE,
    )
    if patron:
        return patron.group(1).strip()
    return "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"
def extraer_metadatos(texto):
    """Pull the ECLI, reporting judge and date out of a judgment's text.

    Each field is taken from the first match of its pattern in ``texto``;
    a field whose pattern does not match is reported as "N/D".

    Returns:
        A dict with the keys "ECLI", "PONENTE" and "FECHA", in that order.
    """
    def primera_captura(expresion):
        # First capture group of the first match, or None when absent.
        hallazgo = re.search(expresion, texto)
        return hallazgo.group(1) if hallazgo else None

    ecli = primera_captura(r'ECLI[:\s]+(ES\S+)')
    ponente = primera_captura(r'Ponente[:\s]+([A-ZÁÉÍÓÚÑ ]+)')
    fecha = primera_captura(r'Fecha[:\s]+(\d{2}/\d{2}/\d{4})')

    return {
        "ECLI": "N/D" if ecli is None else ecli,
        # The name pattern admits spaces, so trim the captured value.
        "PONENTE": "N/D" if ponente is None else ponente.strip(),
        "FECHA": "N/D" if fecha is None else fecha,
    }
def scrape_cendoj(query, jurisdiccion, num_resultados):
    """Search CENDOJ and write the legal grounds of each hit to a text file.

    Posts a search form, collects every ``openDocument`` link from the result
    page, downloads each document's PDF and writes its metadata plus the
    "FUNDAMENTOS DE DERECHO" section to ``/tmp/sentencias_cendoj.txt``.
    A failure on one document is logged and does not stop the others.

    Args:
        query: Free-text search string.
        jurisdiccion: Jurisdiction code sent in the ``JURISDICCION`` field.
        num_resultados: Number of records to request; converted to ``int``.

    Returns:
        A ``(log_text, output_path)`` tuple: the newline-joined execution log
        and the path of the generated file.

    Raises:
        requests.RequestException: If the initial page load or the search
            request fails (per-document errors are caught and logged).
    """
    log = []
    output_path = "/tmp/sentencias_cendoj.txt"
    # Seconds to wait for each HTTP response; without a timeout a stalled
    # server would block the call indefinitely.
    timeout = 30
    payload = {
        "action": "query",
        "sort": "IN_FECHARESOLUCION:decreasing",
        # The UI slider may hand over a float (10.0); send a plain integer.
        "recordsPerPage": str(int(num_resultados)),
        "databasematch": "AN",
        "start": "1",
        "TEXT": query,
        "JURISDICCION": f"|{jurisdiccion}|",
        "TIPOORGANOPUB": "|11|12|13|14|15|16|",
        "field": "JURISDICCION",
        "idtab": "jurisprudencia",
        "org": "",
    }

    # The context manager closes the session's connections on exit.
    with requests.Session() as session:
        # Load the search page before posting (presumably so the session
        # picks up the site's cookies — TODO confirm).
        session.get(SEARCH_URL, headers=headers, timeout=timeout)
        log.append("Ejecutando búsqueda...")
        response = session.post(
            POST_URL, headers=headers, data=payload, timeout=timeout
        )
        response.raise_for_status()
        result_html = response.text
        log.append(f"Respuesta recibida: {len(result_html)} caracteres")

        soup_results = BeautifulSoup(result_html, "html.parser")
        # dict.fromkeys removes duplicates while keeping the order in which
        # the links appear on the page (a set would shuffle them).
        links = list(dict.fromkeys(
            urljoin(BASE_URL, a["href"])
            for a in soup_results.find_all("a", href=True)
            if "openDocument" in a["href"]
        ))
        log.append(f"Enlaces detectados: {len(links)}")

        with open(output_path, "w", encoding="utf-8") as f_out:
            for i, link in enumerate(links, start=1):
                # Pause between downloads only, not after the last one.
                if i > 1:
                    time.sleep(DELAY)
                log.append(f"[{i}/{len(links)}] Procesando: {link}")
                try:
                    pdf_url = build_pdf_url(link)
                    r_pdf = session.get(pdf_url, headers=headers, timeout=timeout)
                    r_pdf.raise_for_status()
                    # An HTML body instead of a PDF is treated as a CAPTCHA page.
                    if "text/html" in r_pdf.headers.get("Content-Type", ""):
                        log.append(" ⚠ CAPTCHA — no se obtuvo PDF")
                        continue
                    texto_completo = extraer_texto_pdf(r_pdf.content)
                    meta = extraer_metadatos(texto_completo)
                    fundamentos = extraer_fundamentos(texto_completo)
                    f_out.writelines([
                        f"ENLACE: {link}\n",
                        f"ECLI: {meta['ECLI']} | PONENTE: {meta['PONENTE']} | FECHA: {meta['FECHA']}\n",
                        "FUNDAMENTOS DE DERECHO:\n",
                        fundamentos,
                        "\n" + "=" * 80 + "\n\n",
                    ])
                    log.append(f" ✓ Guardada — ECLI: {meta['ECLI']}")
                except Exception as e:
                    # Best effort per document: record the failure and move on.
                    log.append(f" ✗ Error: {e}")

    log.append(f"\nFichero generado: {output_path}")
    return "\n".join(log), output_path
# Gradio front end: three inputs, a trigger button, and the log/file outputs
# produced by scrape_cendoj.
with gr.Blocks(title="Scraper CENDOJ") as demo:
    gr.Markdown("# Scraper CENDOJ — Tribunal Supremo")
    gr.Markdown("Descarga y extrae fundamentos de derecho de sentencias del TS.")

    # Search parameters laid out side by side.
    with gr.Row():
        query = gr.Textbox(
            label="Texto de búsqueda",
            value="responsabilidad patrimonial",
        )
        jurisdiccion = gr.Textbox(
            label="Jurisdicción",
            value="PENAL",
        )
        num = gr.Slider(
            minimum=1,
            maximum=20,
            value=10,
            step=1,
            label="Número de resultados",
        )

    btn = gr.Button("Buscar y descargar", variant="primary")
    log = gr.Textbox(label="Log de ejecución", lines=20)
    fichero = gr.File(label="Descargar TXT")

    # Clicking the button runs the scraper and fills the log and file widgets.
    btn.click(
        fn=scrape_cendoj,
        inputs=[query, jurisdiccion, num],
        outputs=[log, fichero],
    )

# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)