Spaces:

vcasas
/

testing_my

Sleeping

App Files Files Community

testing_my / app.py0

vcasas

Rename app.py to app.py0

6b12848 verified 3 months ago

raw

history blame contribute delete

5.51 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin
	from pypdf import PdfReader
	import time
	import re
	import io

	# Endpoints of the CENDOJ search site used by the scraper.
	BASE_URL = "https://www.poderjudicial.es"
	SEARCH_URL = "https://www.poderjudicial.es/search/indexAN.jsp"
	POST_URL = "https://www.poderjudicial.es/search/search.action"
	# Seconds to pause between consecutive document downloads.
	DELAY = 6

	# Browser-like headers sent with every request made by the scraper.
	headers = {
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
	    # The catch-all media range must be "*/*"; a bare "/" is not a valid range.
	    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
	    "Accept-Language": "es-ES,es;q=0.9",
	    # NOTE(review): advertising "br" presumably requires a Brotli decoder to be
	    # installed so that such responses can be decoded — confirm for the runtime.
	    "Accept-Encoding": "gzip, deflate, br",
	    "Connection": "keep-alive",
	    "Upgrade-Insecure-Requests": "1",
	    "Sec-Fetch-Dest": "document",
	    "Sec-Fetch-Mode": "navigate",
	    "Sec-Fetch-Site": "same-origin",
	    "Sec-Fetch-User": "?1",
	    "Referer": SEARCH_URL,
	}

	def build_pdf_url(open_doc_link):
	    """Build the PDF access URL for a CENDOJ ``openDocument`` link.

	    The last two ``/``-separated segments of *open_doc_link* are used as the
	    ``reference`` and ``optimize`` query parameters, in that order.

	    Args:
	        open_doc_link: Link whose last two segments identify the document.
	            A trailing ``/`` is ignored.

	    Returns:
	        The ``contenidos.action`` URL with ``action=accessToPDF`` for that
	        document.

	    Raises:
	        ValueError: If the link has fewer than two ``/``-separated segments.
	    """
	    partes = open_doc_link.rstrip("/").split("/")
	    if len(partes) < 2:
	        # Fail with a readable message instead of an opaque IndexError.
	        raise ValueError(f"Enlace openDocument no válido: {open_doc_link!r}")
	    reference, optimize = partes[-2:]
	    return (
	        "https://www.poderjudicial.es/search/contenidos.action"
	        "?action=accessToPDF&publicinterface=true&tab=AN"
	        f"&reference={reference}&encode=true"
	        f"&optimize={optimize}&databasematch=AN"
	    )

	def extraer_texto_pdf(contenido_bytes):
	    """Extract the text of every page of an in-memory PDF.

	    Args:
	        contenido_bytes: Raw bytes of the PDF document.

	    Returns:
	        The text of all pages joined with newlines. A page for which
	        ``extract_text()`` returns a falsy value contributes an empty string.
	    """
	    reader = PdfReader(io.BytesIO(contenido_bytes))
	    return "\n".join(page.extract_text() or "" for page in reader.pages)

	def extraer_fundamentos(texto):
	    """Return the "FUNDAMENTOS DE DERECHO" section of a ruling's text.

	    The section starts right after the (case-insensitive) heading
	    ``FUNDAMENTOS DE DERECHO`` and runs up to the first occurrence of
	    ``FALLO``, or to the end of the text when ``FALLO`` does not appear.

	    Args:
	        texto: Full text of the ruling.

	    Returns:
	        The section with surrounding whitespace stripped, or the message
	        ``"NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"`` when the heading is
	        not found.
	    """
	    # The alternation must be a real "|": an escaped "\|" is a literal pipe,
	    # which would make the lookahead succeed only on the text "FALLO|".
	    # NOTE(review): with IGNORECASE the section also ends at the first
	    # lowercase "fallo" inside the reasoning — confirm that is acceptable.
	    patron = re.search(
	        r'(FUNDAMENTOS\s+DE\s+DERECHO)(.*?)(?=FALLO|$)',
	        texto,
	        re.IGNORECASE | re.DOTALL,
	    )
	    if patron:
	        return patron.group(2).strip()
	    return "NO SE ENCONTRARON FUNDAMENTOS DE DERECHO"

	def extraer_metadatos(texto):
	    """Extract ECLI, reporting judge and date from a ruling's text.

	    Each field is taken from the first regex match in *texto*; a field that
	    cannot be found is reported as ``"N/D"``.

	    Args:
	        texto: Full text of the ruling.

	    Returns:
	        A dict with the keys ``"ECLI"``, ``"PONENTE"`` and ``"FECHA"``.
	    """
	    meta = {}

	    ecli = re.search(r'ECLI[:\s]+(ES\S+)', texto)
	    meta["ECLI"] = ecli.group(1) if ecli else "N/D"

	    ponente = re.search(r'Ponente[:\s]+([A-ZÁÉÍÓÚÑ ]+)', texto)
	    # The character class includes the space, so the regex can backtrack and
	    # capture whitespace only (e.g. "Ponente: juan"); treat that as not found.
	    nombre = ponente.group(1).strip() if ponente else ""
	    meta["PONENTE"] = nombre or "N/D"

	    fecha = re.search(r'Fecha[:\s]+(\d{2}/\d{2}/\d{4})', texto)
	    meta["FECHA"] = fecha.group(1) if fecha else "N/D"

	    return meta

	def scrape_cendoj(query, jurisdiccion, num_resultados):
	    """Search CENDOJ and save the legal grounds of each result to a text file.

	    Posts a search form, collects every ``openDocument`` link in the response,
	    downloads each document as a PDF, and writes its link, metadata and
	    "FUNDAMENTOS DE DERECHO" section to ``/tmp/sentencias_cendoj.txt``.
	    A document that fails is logged and skipped; it does not abort the run.

	    Args:
	        query: Search text, sent as the ``TEXT`` form field.
	        jurisdiccion: Jurisdiction value, sent wrapped in ``|`` delimiters.
	        num_resultados: Number of records requested per page.

	    Returns:
	        A ``(log_text, output_path)`` tuple: the execution log joined with
	        newlines and the path of the generated text file.

	    Raises:
	        requests.RequestException: If the initial page load or the search
	            request fails, times out, or returns an error status.
	    """
	    log = []
	    output_path = "/tmp/sentencias_cendoj.txt"
	    # Seconds; without a timeout a stalled server would block the call forever.
	    request_timeout = 30

	    payload = {
	        "action": "query",
	        "sort": "IN_FECHARESOLUCION:decreasing",
	        # int() guards against a UI slider delivering e.g. 10.0 ("10.0").
	        "recordsPerPage": str(int(num_resultados)),
	        "databasematch": "AN",
	        "start": "1",
	        "TEXT": query,
	        "JURISDICCION": f"|{jurisdiccion}|",
	        "TIPOORGANOPUB": "|11|12|13|14|15|16|",
	        "field": "JURISDICCION",
	        "idtab": "jurisprudencia",
	        "org": "",
	    }

	    # The context manager closes the session's connections on exit.
	    with requests.Session() as session:
	        # Load the search page before posting (presumably to obtain the
	        # cookies the search endpoint expects — TODO confirm).
	        session.get(SEARCH_URL, headers=headers, timeout=request_timeout)

	        log.append("Ejecutando búsqueda...")
	        response = session.post(
	            POST_URL, headers=headers, data=payload, timeout=request_timeout
	        )
	        response.raise_for_status()
	        result_html = response.text
	        log.append(f"Respuesta recibida: {len(result_html)} caracteres")

	        soup_results = BeautifulSoup(result_html, "html.parser")
	        # dict.fromkeys removes duplicates while keeping the order in which
	        # the results were returned (a set would scramble it).
	        links = list(dict.fromkeys(
	            urljoin(BASE_URL, a["href"])
	            for a in soup_results.find_all("a", href=True)
	            if "openDocument" in a["href"]
	        ))
	        log.append(f"Enlaces detectados: {len(links)}")

	        with open(output_path, "w", encoding="utf-8") as f_out:
	            for i, link in enumerate(links, start=1):
	                log.append(f"[{i}/{len(links)}] Procesando: {link}")
	                try:
	                    r_pdf = session.get(
	                        build_pdf_url(link),
	                        headers=headers,
	                        timeout=request_timeout,
	                    )
	                    r_pdf.raise_for_status()

	                    if "text/html" in r_pdf.headers.get("Content-Type", ""):
	                        # An HTML answer instead of a PDF is reported as a CAPTCHA.
	                        log.append(" ⚠ CAPTCHA — no se obtuvo PDF")
	                    else:
	                        texto_completo = extraer_texto_pdf(r_pdf.content)
	                        meta = extraer_metadatos(texto_completo)
	                        fundamentos = extraer_fundamentos(texto_completo)

	                        f_out.write(f"ENLACE: {link}\n")
	                        f_out.write(
	                            f"ECLI: {meta['ECLI']} | PONENTE: {meta['PONENTE']} "
	                            f"| FECHA: {meta['FECHA']}\n"
	                        )
	                        f_out.write("FUNDAMENTOS DE DERECHO:\n")
	                        f_out.write(fundamentos)
	                        f_out.write("\n" + "=" * 80 + "\n\n")

	                        log.append(f" ✓ Guardada — ECLI: {meta['ECLI']}")
	                except Exception as e:
	                    # Deliberate best-effort boundary: one bad document
	                    # (network, PDF parsing, ...) must not abort the batch.
	                    log.append(f" ✗ Error: {e}")

	                # Throttle between documents; no wait is needed after the last.
	                if i < len(links):
	                    time.sleep(DELAY)

	    log.append(f"\nFichero generado: {output_path}")
	    return "\n".join(log), output_path

	# Gradio UI: three inputs, a trigger button, and the log / file outputs that
	# receive the two values returned by scrape_cendoj.
	# NOTE(review): the nesting below was reconstructed from the blank-line
	# grouping of the original — confirm that only the three inputs sit in the Row.
	with gr.Blocks(title="Scraper CENDOJ") as demo:
	    gr.Markdown("# Scraper CENDOJ — Tribunal Supremo")
	    gr.Markdown("Descarga y extrae fundamentos de derecho de sentencias del TS.")

	    with gr.Row():
	        query = gr.Textbox(label="Texto de búsqueda", value="responsabilidad patrimonial")
	        jurisdiccion = gr.Textbox(label="Jurisdicción", value="PENAL")
	        num = gr.Slider(minimum=1, maximum=20, value=10, step=1, label="Número de resultados")

	    btn = gr.Button("Buscar y descargar", variant="primary")
	    log = gr.Textbox(label="Log de ejecución", lines=20)
	    fichero = gr.File(label="Descargar TXT")

	    # The event listener is registered inside the Blocks context.
	    btn.click(fn=scrape_cendoj, inputs=[query, jurisdiccion, num], outputs=[log, fichero])

	# Listen on all interfaces, port 7860.
	demo.launch(server_name="0.0.0.0", server_port=7860)