Spaces:
Sleeping
Sleeping
Update app.py
Browse files- marker_single tryouts
- change pdf mgt
app.py
CHANGED
|
@@ -7,23 +7,19 @@ from urllib.parse import urlparse
|
|
| 7 |
import time
|
| 8 |
import re
|
| 9 |
|
| 10 |
-
def sanear_nombre(nombre
|
| 11 |
nombre_base = os.path.splitext(nombre)[0]
|
| 12 |
nombre_limpio = re.sub(r"[^\w\-_.]", "_", nombre_base)
|
| 13 |
-
if not nombre_limpio.endswith(ext):
|
| 14 |
-
nombre_limpio += ext
|
| 15 |
return nombre_limpio
|
| 16 |
|
| 17 |
def obtener_ruta_segura(base_dir, nombre_archivo):
|
| 18 |
-
|
| 19 |
-
ruta_final = os.path.join(base_dir,
|
| 20 |
contador = 1
|
| 21 |
-
while os.path.exists(ruta_final):
|
| 22 |
-
|
| 23 |
-
nombre_final = f"{nombre_sin_ext}_{contador}.pdf"
|
| 24 |
-
ruta_final = os.path.join(base_dir, nombre_final)
|
| 25 |
contador += 1
|
| 26 |
-
return ruta_final
|
| 27 |
|
| 28 |
def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
|
| 29 |
output_base = "./marker_output"
|
|
@@ -34,14 +30,15 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
|
|
| 34 |
yield "Selecciona al menos un formato de salida.", "", None, None, None
|
| 35 |
return
|
| 36 |
|
|
|
|
| 37 |
if url_pdf:
|
| 38 |
parsed = urlparse(url_pdf)
|
| 39 |
-
nombre_pdf = parsed.path.split("/")[-1] or "documento
|
| 40 |
ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
|
| 41 |
try:
|
| 42 |
response = requests.get(url_pdf)
|
| 43 |
-
if response.status_code != 200
|
| 44 |
-
yield "La URL no
|
| 45 |
return
|
| 46 |
with open(ruta_pdf, "wb") as f:
|
| 47 |
f.write(response.content)
|
|
@@ -49,7 +46,7 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
|
|
| 49 |
yield f"Error al descargar: {str(e)}", "", None, None, None
|
| 50 |
return
|
| 51 |
elif pdf_file:
|
| 52 |
-
nombre_pdf = pdf_file.name or "documento
|
| 53 |
ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
|
| 54 |
try:
|
| 55 |
shutil.copyfile(pdf_file.name, ruta_pdf)
|
|
@@ -60,6 +57,16 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
|
|
| 60 |
yield "No se proporcionó archivo ni URL.", "", None, None, None
|
| 61 |
return
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
if os.path.getsize(ruta_pdf) > 3 * 1024 * 1024:
|
| 64 |
yield "Aviso: el PDF es grande y puede tardar más de lo normal.", "", None, None, None
|
| 65 |
|
|
@@ -69,32 +76,41 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
|
|
| 69 |
shutil.rmtree(carpeta_salida)
|
| 70 |
os.makedirs(carpeta_salida, exist_ok=True)
|
| 71 |
|
|
|
|
| 72 |
try:
|
| 73 |
for fmt in formatos:
|
| 74 |
tiempo_parcial = f"{round(time.time() - start_time, 1)} s"
|
| 75 |
if fmt == "md":
|
| 76 |
-
status = "Procesando: markdown ligero"
|
| 77 |
cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--skip_ocr", "--output_dir", carpeta_salida]
|
| 78 |
elif fmt == "md + ocr":
|
| 79 |
-
status = "Procesando: markdown con OCR"
|
| 80 |
cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--output_dir", carpeta_salida]
|
| 81 |
else:
|
| 82 |
-
status = f"Procesando: {fmt}"
|
| 83 |
cmd = ["marker_single", ruta_pdf, "--output_format", fmt, "--output_dir", carpeta_salida]
|
| 84 |
|
| 85 |
-
yield
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
return
|
| 91 |
except subprocess.CalledProcessError as e:
|
| 92 |
-
yield f"Error
|
| 93 |
return
|
| 94 |
except Exception as e:
|
| 95 |
yield f"Error inesperado: {str(e)}", "", None, None, None
|
| 96 |
return
|
| 97 |
|
|
|
|
| 98 |
md_path = os.path.join(carpeta_salida, f"{nombre_sin_ext}.md")
|
| 99 |
md_content = ""
|
| 100 |
if os.path.exists(md_path):
|
|
@@ -122,10 +138,11 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
|
|
| 122 |
zip_path = os.path.join(output_base, f"{nombre_sin_ext}.zip")
|
| 123 |
shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=carpeta_salida)
|
| 124 |
|
| 125 |
-
estado_final = "Procesamiento completado correctamente"
|
| 126 |
salida_md = md_content + resumen if md_content else resumen
|
| 127 |
-
yield estado_final,
|
| 128 |
|
|
|
|
| 129 |
demo = gr.Interface(
|
| 130 |
fn=procesar_pdf,
|
| 131 |
inputs=[
|
|
@@ -138,15 +155,14 @@ demo = gr.Interface(
|
|
| 138 |
)
|
| 139 |
],
|
| 140 |
outputs=[
|
| 141 |
-
gr.Textbox(label="Estado del procesamiento", lines=
|
| 142 |
-
gr.Textbox(label="Tiempo transcurrido", interactive=False),
|
| 143 |
gr.File(label="Descargar Markdown", visible=False),
|
| 144 |
gr.File(label="Descargar ZIP completo", visible=False),
|
| 145 |
gr.Textbox(label="Contenido Markdown extraído", lines=25, visible=False)
|
| 146 |
],
|
| 147 |
title="Marker PDF",
|
| 148 |
-
description="Convierte PDFs científicos en Markdown, HTML o JSON.
|
| 149 |
-
|
| 150 |
)
|
| 151 |
|
| 152 |
if __name__ == "__main__":
|
|
|
|
| 7 |
import time
|
| 8 |
import re
|
| 9 |
|
| 10 |
+
def sanear_nombre(nombre):
    """Return a filesystem-safe base name (no directory, no extension).

    Args:
        nombre: A file name or a full path. Callers in this file pass either
            the last segment of a URL path or ``pdf_file.name``, which is
            also used as the copy source path and therefore may contain
            directory components.

    Returns:
        The final path component of *nombre* without its extension, with
        every character other than word characters, ``-`` and ``.`` replaced
        by ``_``. Falls back to ``"documento"`` when nothing is left, the
        same default the callers use for a missing name.
    """
    # Drop directory components first; otherwise the separators are turned
    # into underscores and the whole directory path ends up in the file name.
    nombre_archivo = os.path.basename(nombre)
    nombre_base = os.path.splitext(nombre_archivo)[0]
    # \w already covers "_" (and Unicode letters), so only "-" and "." need
    # to be listed explicitly as additional allowed characters.
    nombre_limpio = re.sub(r"[^\w.\-]", "_", nombre_base)
    # An empty result would later yield a bare ".pdf" hidden file.
    return nombre_limpio or "documento"
|
| 14 |
|
| 15 |
def obtener_ruta_segura(base_dir, nombre_archivo):
    """Build a ``.pdf`` path under *base_dir* that does not exist yet.

    The file name is first cleaned with ``sanear_nombre``. If
    ``<base_dir>/<name>.pdf`` is already present, ``_1``, ``_2``, ... is
    appended to the name until an unused path is found.

    Args:
        base_dir: Directory the returned path is joined onto.
        nombre_archivo: Original (unsanitised) file name.

    Returns:
        The first candidate path for which ``os.path.exists`` is false.
        Nothing is created on disk by this function.
    """
    raiz = sanear_nombre(nombre_archivo)

    def _candidato(etiqueta):
        # Every candidate shares the same directory and the fixed .pdf suffix.
        return os.path.join(base_dir, etiqueta) + ".pdf"

    destino = _candidato(raiz)
    intento = 1
    while os.path.exists(destino):
        destino = _candidato(f"{raiz}_{intento}")
        intento += 1
    return destino
|
| 23 |
|
| 24 |
def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
|
| 25 |
output_base = "./marker_output"
|
|
|
|
| 30 |
yield "Selecciona al menos un formato de salida.", "", None, None, None
|
| 31 |
return
|
| 32 |
|
| 33 |
+
# 1. Guardar archivo desde URL o subida
|
| 34 |
if url_pdf:
|
| 35 |
parsed = urlparse(url_pdf)
|
| 36 |
+
nombre_pdf = parsed.path.split("/")[-1] or "documento"
|
| 37 |
ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
|
| 38 |
try:
|
| 39 |
response = requests.get(url_pdf)
|
| 40 |
+
if response.status_code != 200:
|
| 41 |
+
yield "La URL no pudo ser descargada correctamente.", "", None, None, None
|
| 42 |
return
|
| 43 |
with open(ruta_pdf, "wb") as f:
|
| 44 |
f.write(response.content)
|
|
|
|
| 46 |
yield f"Error al descargar: {str(e)}", "", None, None, None
|
| 47 |
return
|
| 48 |
elif pdf_file:
|
| 49 |
+
nombre_pdf = pdf_file.name or "documento"
|
| 50 |
ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
|
| 51 |
try:
|
| 52 |
shutil.copyfile(pdf_file.name, ruta_pdf)
|
|
|
|
| 57 |
yield "No se proporcionó archivo ni URL.", "", None, None, None
|
| 58 |
return
|
| 59 |
|
| 60 |
+
# 2. Verificar que realmente sea un PDF
|
| 61 |
+
try:
|
| 62 |
+
with open(ruta_pdf, "rb") as f:
|
| 63 |
+
if f.read(4) != b"%PDF":
|
| 64 |
+
yield "El archivo no es un PDF válido (falta cabecera %PDF).", "", None, None, None
|
| 65 |
+
return
|
| 66 |
+
except Exception as e:
|
| 67 |
+
yield f"No se pudo leer el archivo para verificar su tipo: {str(e)}", "", None, None, None
|
| 68 |
+
return
|
| 69 |
+
|
| 70 |
if os.path.getsize(ruta_pdf) > 3 * 1024 * 1024:
|
| 71 |
yield "Aviso: el PDF es grande y puede tardar más de lo normal.", "", None, None, None
|
| 72 |
|
|
|
|
| 76 |
shutil.rmtree(carpeta_salida)
|
| 77 |
os.makedirs(carpeta_salida, exist_ok=True)
|
| 78 |
|
| 79 |
+
# 3. Procesar con marker_single con reintento
|
| 80 |
try:
|
| 81 |
for fmt in formatos:
|
| 82 |
tiempo_parcial = f"{round(time.time() - start_time, 1)} s"
|
| 83 |
if fmt == "md":
|
|
|
|
| 84 |
cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--skip_ocr", "--output_dir", carpeta_salida]
|
| 85 |
elif fmt == "md + ocr":
|
|
|
|
| 86 |
cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--output_dir", carpeta_salida]
|
| 87 |
else:
|
|
|
|
| 88 |
cmd = ["marker_single", ruta_pdf, "--output_format", fmt, "--output_dir", carpeta_salida]
|
| 89 |
|
| 90 |
+
yield f"Procesando formato '{fmt}'... (tiempo: {tiempo_parcial})", "", None, None, None
|
| 91 |
+
|
| 92 |
+
intentos = 3
|
| 93 |
+
for intento in range(1, intentos + 1):
|
| 94 |
+
try:
|
| 95 |
+
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=900)
|
| 96 |
+
if result.returncode == 0:
|
| 97 |
+
break # Éxito
|
| 98 |
+
if intento == intentos:
|
| 99 |
+
raise subprocess.CalledProcessError(result.returncode, cmd, output=result.stdout, stderr=result.stderr)
|
| 100 |
+
time.sleep(2) # Esperar antes del siguiente intento
|
| 101 |
+
except subprocess.CalledProcessError as e:
|
| 102 |
+
raise e
|
| 103 |
+
except FileNotFoundError:
|
| 104 |
+
yield "Error: el comando 'marker_single' no está disponible. ¿Está correctamente instalado?", "", None, None, None
|
| 105 |
return
|
| 106 |
except subprocess.CalledProcessError as e:
|
| 107 |
+
yield f"Error ejecutando Marker:\n{e.stderr.strip()}", "", None, None, None
|
| 108 |
return
|
| 109 |
except Exception as e:
|
| 110 |
yield f"Error inesperado: {str(e)}", "", None, None, None
|
| 111 |
return
|
| 112 |
|
| 113 |
+
# 4. Leer Markdown y calcular resumen
|
| 114 |
md_path = os.path.join(carpeta_salida, f"{nombre_sin_ext}.md")
|
| 115 |
md_content = ""
|
| 116 |
if os.path.exists(md_path):
|
|
|
|
| 138 |
zip_path = os.path.join(output_base, f"{nombre_sin_ext}.zip")
|
| 139 |
shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=carpeta_salida)
|
| 140 |
|
| 141 |
+
estado_final = f"Procesamiento completado correctamente (tiempo total: {tiempo_total} segundos)"
|
| 142 |
salida_md = md_content + resumen if md_content else resumen
|
| 143 |
+
yield estado_final, "", gr.update(value=md_path, visible=True), gr.update(value=zip_path, visible=True), salida_md
|
| 144 |
|
| 145 |
+
# Interfaz Gradio
|
| 146 |
demo = gr.Interface(
|
| 147 |
fn=procesar_pdf,
|
| 148 |
inputs=[
|
|
|
|
| 155 |
)
|
| 156 |
],
|
| 157 |
outputs=[
|
| 158 |
+
gr.Textbox(label="Estado del procesamiento y tiempo total", lines=3),
|
|
|
|
| 159 |
gr.File(label="Descargar Markdown", visible=False),
|
| 160 |
gr.File(label="Descargar ZIP completo", visible=False),
|
| 161 |
gr.Textbox(label="Contenido Markdown extraído", lines=25, visible=False)
|
| 162 |
],
|
| 163 |
title="Marker PDF",
|
| 164 |
+
description="Convierte PDFs científicos en Markdown, HTML o JSON. Incluye OCR opcional y resumen detallado del contenido.",
|
| 165 |
+
flagging_mode="never"
|
| 166 |
)
|
| 167 |
|
| 168 |
if __name__ == "__main__":
|