Spaces:

haryde
/

marker-pdf

Sleeping

App Files Community

haryde committed on May 14, 2025

Commit

32b0a3f

verified ·

1 Parent(s): ddb1cd6

Update app.py

Browse files

fixed open md error

Files changed (1) hide show

app.py +39 -36

app.py CHANGED Viewed

@@ -25,26 +25,26 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
     os.makedirs(output_base, exist_ok=True)
     start_time = time.time()
-    # Verificar marker_single
     try:
         subprocess.run(["marker_single", "--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     except FileNotFoundError:
-        yield "❌ Error: el comando 'marker_single' no está disponible.", None, None, None
         return
     if not formatos:
-        yield "⚠️ Selecciona al menos un formato de salida.", None, None, None
         return
     try:
-        # Obtener el archivo
         if url_pdf:
             parsed = urlparse(url_pdf)
             nombre_pdf = parsed.path.split("/")[-1] or "documento"
             ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
             response = requests.get(url_pdf)
             if response.status_code != 200:
-                yield "❌ No se pudo descargar correctamente el PDF desde la URL.", None, None, None
                 return
             with open(ruta_pdf, "wb") as f:
                 f.write(response.content)
@@ -53,17 +53,16 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
             ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
             shutil.copyfile(pdf_file.name, ruta_pdf)
         else:
-            yield "⚠️ No se proporcionó archivo ni URL.", None, None, None
             return
-        # Verificar cabecera PDF
         with open(ruta_pdf, "rb") as f:
             if f.read(4) != b"%PDF":
-                yield "❌ El archivo no es un PDF válido (no comienza por %PDF).", None, None, None
                 return
         if os.path.getsize(ruta_pdf) > 3 * 1024 * 1024:
-            yield "⚠️ Aviso: el PDF es grande y puede tardar más de lo normal.", None, None, None
         nombre_sin_ext = os.path.splitext(os.path.basename(ruta_pdf))[0]
         carpeta_salida = os.path.join(output_base, nombre_sin_ext)
@@ -71,9 +70,11 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
             shutil.rmtree(carpeta_salida)
         os.makedirs(carpeta_salida, exist_ok=True)
-        # Procesar cada formato
         for fmt in formatos:
-            tiempo_parcial = round(time.time() - start_time, 1)
             if fmt == "md":
                 cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--disable_ocr", "--output_dir", carpeta_salida]
             elif fmt == "md + ocr":
@@ -81,54 +82,56 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
             else:
                 cmd = ["marker_single", ruta_pdf, "--output_format", fmt, "--output_dir", carpeta_salida]
-            yield f"⏳ Procesando formato '{fmt}'... (tiempo: {tiempo_parcial} s)", None, None, None
             for intento in range(1, 4):
                 try:
                     result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=900)
                     if result.returncode == 0:
                         break
                     elif intento == 3:
-                        yield f"❌ Error en marker_single:\n{result.stderr}", None, None, None
                         return
                     time.sleep(2)
                 except Exception as e:
-                    yield f"❌ Error inesperado:\n{str(e)}", None, None, None
                     return
-        # Leer contenido Markdown
         md_path = os.path.join(carpeta_salida, f"{nombre_sin_ext}.md")
-        md_content = ""
-        if os.path.exists(md_path):
-            with open(md_path, "r", encoding="utf-8") as f:
-                md_content = f.read()
         tiempo_total = round(time.time() - start_time, 1)
         palabras = len(md_content.split())
         caracteres = len(md_content)
         figuras = len([f for f in os.listdir(carpeta_salida) if f.lower().endswith(".jpeg")])
         tablas = len(os.listdir(os.path.join(carpeta_salida, "tables"))) if os.path.exists(os.path.join(carpeta_salida, "tables")) else 0
-        tamaño_kb = os.path.getsize(md_path) / 1024 if os.path.exists(md_path) else 0
-        resumen = "\n\n---\nResumen del procesamiento\n"
-        resumen += f"- Palabras detectadas: {palabras}\n"
-        resumen += f"- Caracteres totales: {caracteres}\n"
-        resumen += f"- Tamaño del archivo Markdown: {tamaño_kb:.2f} KB\n"
         resumen += f"- Imágenes extraídas: {figuras}\n"
         resumen += f"- Tablas detectadas: {tablas}\n"
-        resumen += f"- Tiempo total: {tiempo_total} segundos"
         zip_path = os.path.join(output_base, f"{nombre_sin_ext}.zip")
         shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=carpeta_salida)
-        estado_final = f"✅ Procesamiento completado correctamente (tiempo total: {tiempo_total} segundos)"
-        salida_md = md_content + resumen if md_content else resumen
-        yield estado_final, gr.update(value=md_path, visible=True), gr.update(value=zip_path, visible=True), salida_md
     except Exception as e:
-        yield f"❌ Error general inesperado: {str(e)}", None, None, None
-# Interfaz con 4 salidas
 demo = gr.Interface(
     fn=procesar_pdf,
     inputs=[
@@ -141,13 +144,13 @@ demo = gr.Interface(
         )
     ],
     outputs=[
-        gr.Textbox(label="Estado del procesamiento", lines=5),
-        gr.File(label="Descargar Markdown", visible=False),
-        gr.File(label="Descargar ZIP completo", visible=False),
-        gr.Textbox(label="Contenido extraído en Markdown", lines=25, visible=False)
     ],
     title="Marker PDF",
-    description="Convierte PDFs científicos en Markdown, HTML o JSON. Incluye OCR opcional y resumen completo.",
     flagging_mode="never"
 )

     os.makedirs(output_base, exist_ok=True)
     start_time = time.time()
+    estado = ""
     try:
         subprocess.run(["marker_single", "--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     except FileNotFoundError:
+        yield "Error: el comando 'marker_single' no está disponible.\nVerifica que marker-pdf esté en requirements.txt", "", None, None
         return
     if not formatos:
+        yield "Debes seleccionar al menos un formato de salida.", "", None, None
         return
     try:
         if url_pdf:
             parsed = urlparse(url_pdf)
             nombre_pdf = parsed.path.split("/")[-1] or "documento"
             ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
             response = requests.get(url_pdf)
             if response.status_code != 200:
+                yield "No se pudo descargar el PDF desde la URL proporcionada.", "", None, None
                 return
             with open(ruta_pdf, "wb") as f:
                 f.write(response.content)
             ruta_pdf = obtener_ruta_segura(output_base, nombre_pdf)
             shutil.copyfile(pdf_file.name, ruta_pdf)
         else:
+            yield "No se proporcionó ni archivo ni URL.", "", None, None
             return
         with open(ruta_pdf, "rb") as f:
             if f.read(4) != b"%PDF":
+                yield "El archivo proporcionado no es un PDF válido.", "", None, None
                 return
         if os.path.getsize(ruta_pdf) > 3 * 1024 * 1024:
+            estado += "Aviso: el PDF es grande y puede tardar más de lo normal.\n"
         nombre_sin_ext = os.path.splitext(os.path.basename(ruta_pdf))[0]
         carpeta_salida = os.path.join(output_base, nombre_sin_ext)
             shutil.rmtree(carpeta_salida)
         os.makedirs(carpeta_salida, exist_ok=True)
         for fmt in formatos:
+            tiempo = round(time.time() - start_time, 1)
+            estado += f"Procesando formato '{fmt}'... (tiempo: {tiempo} s)\n"
+            yield estado, f"{tiempo} s", None, None
             if fmt == "md":
                 cmd = ["marker_single", ruta_pdf, "--output_format", "markdown", "--disable_ocr", "--output_dir", carpeta_salida]
             elif fmt == "md + ocr":
             else:
                 cmd = ["marker_single", ruta_pdf, "--output_format", fmt, "--output_dir", carpeta_salida]
             for intento in range(1, 4):
                 try:
                     result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=900)
                     if result.returncode == 0:
                         break
                     elif intento == 3:
+                        estado += f"Error ejecutando marker_single (intento {intento}):\n{result.stderr}\n"
+                        yield estado, "", None, None
                         return
                     time.sleep(2)
                 except Exception as e:
+                    estado += f"Error inesperado durante el intento {intento}: {str(e)}\n"
+                    yield estado, "", None, None
                     return
         md_path = os.path.join(carpeta_salida, f"{nombre_sin_ext}.md")
+        if not os.path.exists(md_path):
+            estado += "No se generó el archivo Markdown. Puede que el PDF no contenga texto reconocible.\n"
+            yield estado, "", None, None
+            return
+        with open(md_path, "r", encoding="utf-8") as f:
+            md_content = f.read()
         tiempo_total = round(time.time() - start_time, 1)
         palabras = len(md_content.split())
         caracteres = len(md_content)
         figuras = len([f for f in os.listdir(carpeta_salida) if f.lower().endswith(".jpeg")])
         tablas = len(os.listdir(os.path.join(carpeta_salida, "tables"))) if os.path.exists(os.path.join(carpeta_salida, "tables")) else 0
+        tamaño_kb = os.path.getsize(md_path) / 1024
+        resumen = f"\n---\nResumen del procesamiento:\n"
+        resumen += f"- Palabras: {palabras}\n"
+        resumen += f"- Caracteres: {caracteres}\n"
+        resumen += f"- Tamaño Markdown: {tamaño_kb:.2f} KB\n"
         resumen += f"- Imágenes extraídas: {figuras}\n"
         resumen += f"- Tablas detectadas: {tablas}\n"
+        resumen += f"- Tiempo total: {tiempo_total} s"
         zip_path = os.path.join(output_base, f"{nombre_sin_ext}.zip")
         shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=carpeta_salida)
+        estado += f"Procesamiento finalizado correctamente. Tiempo total: {tiempo_total} s"
+        yield estado + resumen, f"{tiempo_total} s", gr.update(value=md_path, visible=True), gr.update(value=zip_path, visible=True)
     except Exception as e:
+        estado += f"Error general inesperado: {str(e)}"
+        yield estado, "", None, None
+# Interfaz 4 salidas (estado + tiempo + 2 archivos)
 demo = gr.Interface(
     fn=procesar_pdf,
     inputs=[
         )
     ],
     outputs=[
+        gr.Textbox(label="Estado del procesamiento", lines=12),
+        gr.Textbox(label="Tiempo transcurrido", interactive=False),
+        gr.File(label="Descargar Markdown (.md)", visible=False),
+        gr.File(label="Descargar ZIP completo", visible=False)
     ],
     title="Marker PDF",
+    description="Convierte artículos científicos en Markdown, HTML o JSON. Incluye OCR opcional y resumen detallado del contenido.",
     flagging_mode="never"
 )