Spaces:

haryde
/

marker-pdf

Sleeping

App Files Files Community

haryde committed on May 14, 2025

Commit

94ae36a

verified ·

1 Parent(s): 2b7e158

Update app.py

Browse files

- marker_single tryouts
- change pdf mgt

Files changed (1) hide show

app.py +45 -29

app.py CHANGED Viewed

@@ -7,23 +7,19 @@ from urllib.parse import urlparse
 import time
 import re
-def sanear_nombre(nombre, ext=".pdf"):
     nombre_base = os.path.splitext(nombre)[0]
     nombre_limpio = re.sub(r"[^\w\-_.]", "_", nombre_base)
-    if not nombre_limpio.endswith(ext):
-        nombre_limpio += ext
     return nombre_limpio
 def obtener_ruta_segura(base_dir, nombre_archivo):
-    nombre_final = sanear_nombre(nombre_archivo)
-    ruta_final = os.path.join(base_dir, nombre_final)
     contador = 1
-    while os.path.exists(ruta_final):
-        nombre_sin_ext = os.path.splitext(nombre_final)[0]
-        nombre_final = f"{nombre_sin_ext}_{contador}.pdf"
-        ruta_final = os.path.join(base_dir, nombre_final)
         contador += 1
-    return ruta_final
 def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
     output_base = "./marker_output"
@@ -34,14 +30,15 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
         yield "Selecciona al menos un formato de salida.", "", None, None, None
         return
     if url_pdf:
         parsed = urlparse(url_pdf)
-        nombre_pdf = parsed.path.split("/")[-1] or "documento.pdf"
         ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
         try:
             response = requests.get(url_pdf)
-            if response.status_code != 200 or b"%PDF" not in response.content[:1024]:
-                yield "La URL no contiene un PDF válido.", "", None, None, None
                 return
             with open(ruta_pdf, "wb") as f:
                 f.write(response.content)
@@ -49,7 +46,7 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
             yield f"Error al descargar: {str(e)}", "", None, None, None
             return
     elif pdf_file:
-        nombre_pdf = pdf_file.name or "documento.pdf"
         ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
         try:
             shutil.copyfile(pdf_file.name, ruta_pdf)
@@ -60,6 +57,16 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
         yield "No se proporcionó archivo ni URL.", "", None, None, None
         return
     if os.path.getsize(ruta_pdf) > 3 * 1024 * 1024:
         yield "Aviso: el PDF es grande y puede tardar más de lo normal.", "", None, None, None
@@ -69,32 +76,41 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
         shutil.rmtree(carpeta_salida)
     os.makedirs(carpeta_salida, exist_ok=True)
     try:
         for fmt in formatos:
             tiempo_parcial = f"{round(time.time() - start_time, 1)} s"
             if fmt == "md":
-                status = "Procesando: markdown ligero"
                 cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--skip_ocr", "--output_dir", carpeta_salida]
             elif fmt == "md + ocr":
-                status = "Procesando: markdown con OCR"
                 cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--output_dir", carpeta_salida]
             else:
-                status = f"Procesando: {fmt}"
                 cmd = ["marker_single", ruta_pdf, "--output_format", fmt, "--output_dir", carpeta_salida]
-            yield status, tiempo_parcial, None, None, None
-            subprocess.run(cmd, check=True, timeout=900)
-    except subprocess.TimeoutExpired:
-        yield "El procesamiento superó el límite de tiempo permitido (15 minutos).", "", None, None, None
         return
     except subprocess.CalledProcessError as e:
-        yield f"Error durante la ejecución de Marker: {str(e)}", "", None, None, None
         return
     except Exception as e:
         yield f"Error inesperado: {str(e)}", "", None, None, None
         return
     md_path = os.path.join(carpeta_salida, f"{nombre_sin_ext}.md")
     md_content = ""
     if os.path.exists(md_path):
@@ -122,10 +138,11 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
     zip_path = os.path.join(output_base, f"{nombre_sin_ext}.zip")
     shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=carpeta_salida)
-    estado_final = "Procesamiento completado correctamente"
     salida_md = md_content + resumen if md_content else resumen
-    yield estado_final, f"{tiempo_total} s", gr.update(value=md_path, visible=True), gr.update(value=zip_path, visible=True), salida_md
 demo = gr.Interface(
     fn=procesar_pdf,
     inputs=[
@@ -138,15 +155,14 @@ demo = gr.Interface(
         )
     ],
     outputs=[
-        gr.Textbox(label="Estado del procesamiento", lines=2),
-        gr.Textbox(label="Tiempo transcurrido", interactive=False),
         gr.File(label="Descargar Markdown", visible=False),
         gr.File(label="Descargar ZIP completo", visible=False),
         gr.Textbox(label="Contenido Markdown extraído", lines=25, visible=False)
     ],
     title="Marker PDF",
-    description="Convierte PDFs científicos en Markdown, HTML o JSON. Opcionalmente incluye OCR. Extrae imágenes, tablas y contenido estructurado.",
-    allow_flagging="never"
 )
 if __name__ == "__main__":

 import time
 import re
def sanear_nombre(nombre):
    """Return a filesystem-safe base name (without extension) for *nombre*.

    *nombre* may be a bare file name or a full path; only the final path
    component is used, so directory parts never leak into the result.
    The last extension is dropped and every character that is not a word
    character, ``-`` or ``.`` is replaced with ``_``.

    Args:
        nombre: File name or path supplied by the user or taken from a URL.

    Returns:
        The sanitized base name, or ``"documento"`` when nothing is left
        after stripping the directory and extension.
    """
    # Callers pass full paths (e.g. an uploaded temp file); without this the
    # separators would be turned into "_" and the whole path kept in the name.
    solo_nombre = os.path.basename(nombre or "")
    nombre_base = os.path.splitext(solo_nombre)[0]
    nombre_limpio = re.sub(r"[^\w\-_.]", "_", nombre_base)
    # Same default the callers use when they have no name at all.
    return nombre_limpio or "documento"


def obtener_ruta_segura(base_dir, nombre_archivo):
    """Return a ``.pdf`` path inside *base_dir* that does not exist yet.

    The name is sanitized with ``sanear_nombre``. If ``<name>.pdf`` is
    already present, ``<name>_1.pdf``, ``<name>_2.pdf``, ... are tried in
    order until a free path is found. Nothing is created on disk.

    Args:
        base_dir: Directory in which the PDF will be stored.
        nombre_archivo: Original file name or path used to derive the name.

    Returns:
        The first candidate path for which ``os.path.exists`` is false.
    """
    nombre_base = sanear_nombre(nombre_archivo)
    ruta_final = os.path.join(base_dir, nombre_base + ".pdf")
    contador = 1
    while os.path.exists(ruta_final):
        ruta_final = os.path.join(base_dir, f"{nombre_base}_{contador}.pdf")
        contador += 1
    return ruta_final
 def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
     output_base = "./marker_output"
         yield "Selecciona al menos un formato de salida.", "", None, None, None
         return
+    # 1. Guardar archivo desde URL o subida
     if url_pdf:
         parsed = urlparse(url_pdf)
+        nombre_pdf = parsed.path.split("/")[-1] or "documento"
         ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
         try:
             response = requests.get(url_pdf)
+            if response.status_code != 200:
+                yield "La URL no pudo ser descargada correctamente.", "", None, None, None
                 return
             with open(ruta_pdf, "wb") as f:
                 f.write(response.content)
             yield f"Error al descargar: {str(e)}", "", None, None, None
             return
     elif pdf_file:
+        nombre_pdf = pdf_file.name or "documento"
         ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
         try:
             shutil.copyfile(pdf_file.name, ruta_pdf)
         yield "No se proporcionó archivo ni URL.", "", None, None, None
         return
+    # 2. Verificar que realmente sea un PDF
+    try:
+        with open(ruta_pdf, "rb") as f:
+            if f.read(4) != b"%PDF":
+                yield "El archivo no es un PDF válido (falta cabecera %PDF).", "", None, None, None
+                return
+    except Exception as e:
+        yield f"No se pudo leer el archivo para verificar su tipo: {str(e)}", "", None, None, None
+        return
     if os.path.getsize(ruta_pdf) > 3 * 1024 * 1024:
         yield "Aviso: el PDF es grande y puede tardar más de lo normal.", "", None, None, None
         shutil.rmtree(carpeta_salida)
     os.makedirs(carpeta_salida, exist_ok=True)
+    # 3. Procesar con marker_single con reintento
     try:
         for fmt in formatos:
             tiempo_parcial = f"{round(time.time() - start_time, 1)} s"
             if fmt == "md":
                 cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--skip_ocr", "--output_dir", carpeta_salida]
             elif fmt == "md + ocr":
                 cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--output_dir", carpeta_salida]
             else:
                 cmd = ["marker_single", ruta_pdf, "--output_format", fmt, "--output_dir", carpeta_salida]
+            yield f"Procesando formato '{fmt}'... (tiempo: {tiempo_parcial})", "", None, None, None
+            intentos = 3
+            for intento in range(1, intentos + 1):
+                try:
+                    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=900)
+                    if result.returncode == 0:
+                        break  # Éxito
+                    if intento == intentos:
+                        raise subprocess.CalledProcessError(result.returncode, cmd, output=result.stdout, stderr=result.stderr)
+                    time.sleep(2)  # Esperar antes del siguiente intento
+                except subprocess.CalledProcessError as e:
+                    raise e
+    except FileNotFoundError:
+        yield "Error: el comando 'marker_single' no está disponible. ¿Está correctamente instalado?", "", None, None, None
         return
     except subprocess.CalledProcessError as e:
+        yield f"Error ejecutando Marker:\n{e.stderr.strip()}", "", None, None, None
         return
     except Exception as e:
         yield f"Error inesperado: {str(e)}", "", None, None, None
         return
+    # 4. Leer Markdown y calcular resumen
     md_path = os.path.join(carpeta_salida, f"{nombre_sin_ext}.md")
     md_content = ""
     if os.path.exists(md_path):
     zip_path = os.path.join(output_base, f"{nombre_sin_ext}.zip")
     shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=carpeta_salida)
+    estado_final = f"Procesamiento completado correctamente (tiempo total: {tiempo_total} segundos)"
     salida_md = md_content + resumen if md_content else resumen
+    yield estado_final, "", gr.update(value=md_path, visible=True), gr.update(value=zip_path, visible=True), salida_md
+# Interfaz Gradio
 demo = gr.Interface(
     fn=procesar_pdf,
     inputs=[
         )
     ],
     outputs=[
+        gr.Textbox(label="Estado del procesamiento y tiempo total", lines=3),
         gr.File(label="Descargar Markdown", visible=False),
         gr.File(label="Descargar ZIP completo", visible=False),
         gr.Textbox(label="Contenido Markdown extraído", lines=25, visible=False)
     ],
     title="Marker PDF",
+    description="Convierte PDFs científicos en Markdown, HTML o JSON. Incluye OCR opcional y resumen detallado del contenido.",
+    flagging_mode="never"
 )
 if __name__ == "__main__":