Spaces:

haryde
/

marker-pdf

Sleeping

App Files Files Community

haryde committed on May 14, 2025

Commit

29d547d

verified ·

1 Parent(s): d850e0b

Update app.py

Browse files

- change layout
- added light md

Files changed (1) hide show

app.py +51 -42

app.py CHANGED Viewed

@@ -6,18 +6,15 @@ import gradio as gr
 from urllib.parse import urlparse
 import time
-def procesar_pdf(pdf_file=None, url_pdf=None, formatos=["markdown"]):
     output_base = "./marker_output"
     os.makedirs(output_base, exist_ok=True)
     start_time = time.time()
-    status = "Iniciando procesamiento..."
-    yield status, "", None, None
-    # Descargar o copiar PDF
     if url_pdf:
         status = "Descargando PDF desde la URL..."
-        yield status, "", None, None
         parsed = urlparse(url_pdf)
         file_name = os.path.basename(parsed.path) or "documento.pdf"
         if not file_name.endswith(".pdf"):
@@ -26,70 +23,83 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=["markdown"]):
         try:
             response = requests.get(url_pdf)
             if response.status_code != 200 or b"%PDF" not in response.content[:1024]:
-                yield "La URL no contiene un PDF válido.", "", None, None
                 return
             with open(file_path, "wb") as f:
                 f.write(response.content)
         except Exception as e:
-            yield f"Error al descargar: {str(e)}", "", None, None
             return
     elif pdf_file:
         status = "Guardando archivo PDF..."
-        yield status, "", None, None
         try:
-            file_path = os.path.join(output_base, os.path.basename(pdf_file.name))
             shutil.copyfile(pdf_file.name, file_path)
         except Exception as e:
-            yield f"Error al guardar el PDF: {str(e)}", "", None, None
             return
     else:
-        yield "No se proporcionó archivo ni URL.", "", None, None
         return
     name_wo_ext = os.path.splitext(os.path.basename(file_path))[0]
     output_dir = os.path.join(output_base, name_wo_ext)
     if os.path.exists(output_dir):
         shutil.rmtree(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     try:
         for fmt in formatos:
-            status = f"Procesando formato: {fmt}"
-            yield status, "", None, None
-            subprocess.run(
-                ["marker_single", file_path, "--output_format", fmt, "--output_dir", output_dir],
-                check=True,
-                timeout=900
-            )
     except subprocess.TimeoutExpired:
-        yield "El procesamiento superó el límite de 15 minutos.", "", None, None
         return
     except subprocess.CalledProcessError as e:
-        yield f"Error ejecutando Marker: {str(e)}", "", None, None
         return
     except Exception as e:
-        yield f"Error inesperado: {str(e)}", "", None, None
         return
-    # Leer Markdown si existe
     md_path = os.path.join(output_dir, f"{name_wo_ext}.md")
     md_content = ""
     if os.path.exists(md_path):
         try:
             with open(md_path, "r", encoding="utf-8") as f:
                 md_content = f.read()
-        except Exception:
             md_content = ""
     # Generar resumen
     status = "Generando resumen..."
-    yield status, "", None, None
     palabras = len(md_content.split()) if md_content else 0
     caracteres = len(md_content) if md_content else 0
     figuras = len([f for f in os.listdir(output_dir) if f.lower().endswith(".jpeg")])
     tablas = len([f for f in os.listdir(os.path.join(output_dir, "tables"))]) if os.path.exists(os.path.join(output_dir, "tables")) else 0
     tamaño_kb = os.path.getsize(md_path) / 1024 if os.path.exists(md_path) else 0
-    duracion = round(time.time() - start_time, 1)
     resumen = f"\n\n---\nResumen del procesamiento:\n"
     resumen += f"- Palabras detectadas: {palabras}\n"
@@ -97,18 +107,17 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=["markdown"]):
     resumen += f"- Tamaño del archivo generado: {tamaño_kb:.2f} KB\n"
     resumen += f"- Imágenes extraídas: {figuras}\n"
     resumen += f"- Tablas detectadas: {tablas}\n"
-    resumen += f"- Tiempo total: {duracion} segundos"
-    # Generar ZIP
     status = "Generando archivo ZIP..."
-    yield status, "", None, None
     zip_path = os.path.join(output_base, f"{name_wo_ext}.zip")
     shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=output_dir)
-    # Final
-    status = None
-    contenido_final = md_content + resumen if os.path.exists(md_path) else resumen
-    yield status, contenido_final, md_path if os.path.exists(md_path) else None, zip_path if os.path.exists(zip_path) else None
 demo = gr.Interface(
     fn=procesar_pdf,
@@ -116,21 +125,21 @@ demo = gr.Interface(
         gr.File(label="Sube un PDF (opcional)", file_types=[".pdf"]),
         gr.Textbox(label="O introduce una URL de PDF", placeholder="https://ejemplo.com/articulo.pdf"),
         gr.CheckboxGroup(
-            choices=["markdown", "html", "json"],
-            value=["markdown"],
-            label="Selecciona formatos de salida",
-            info="Puedes elegir uno o varios formatos"
         )
     ],
     outputs=[
         gr.Textbox(label="Estado del procesamiento", lines=2),
-        gr.Textbox(label="Contenido extraído en Markdown", lines=25),
-        gr.File(label="Descargar solo Markdown (.md)"),
-        gr.File(label="Descargar todo en ZIP")
     ],
     title="Marker PDF",
-    description="Convierte artículos científicos en Markdown, HTML o JSON usando Marker. Descarga el Markdown o todos los resultados en un ZIP.",
-    allow_flagging="never"
 )
 demo.launch()

 from urllib.parse import urlparse
 import time
+def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
     output_base = "./marker_output"
     os.makedirs(output_base, exist_ok=True)
     start_time = time.time()
+    tiempo_parcial = "0.0 s"
+    # Guardar archivo
     if url_pdf:
         status = "Descargando PDF desde la URL..."
         parsed = urlparse(url_pdf)
         file_name = os.path.basename(parsed.path) or "documento.pdf"
         if not file_name.endswith(".pdf"):
         try:
             response = requests.get(url_pdf)
             if response.status_code != 200 or b"%PDF" not in response.content[:1024]:
+                yield "La URL no contiene un PDF válido.", "", gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
                 return
             with open(file_path, "wb") as f:
                 f.write(response.content)
         except Exception as e:
+            yield f"Error al descargar: {str(e)}", "", None, None, None
             return
     elif pdf_file:
         status = "Guardando archivo PDF..."
+        file_path = os.path.join(output_base, os.path.basename(pdf_file.name))
         try:
             shutil.copyfile(pdf_file.name, file_path)
         except Exception as e:
+            yield f"Error al guardar el PDF: {str(e)}", "", None, None, None
             return
     else:
+        yield "No se proporcionó archivo ni URL.", "", None, None, None
         return
+    # AVISO si el archivo es pesado (> 3 MB)
+    if os.path.getsize(file_path) > 3 * 1024 * 1024:
+        yield "Aviso: este PDF puede tardar más de lo normal debido a su tamaño o complejidad.", "", None, None, None
     name_wo_ext = os.path.splitext(os.path.basename(file_path))[0]
     output_dir = os.path.join(output_base, name_wo_ext)
     if os.path.exists(output_dir):
         shutil.rmtree(output_dir)
     os.makedirs(output_dir, exist_ok=True)
+    # Procesar formatos seleccionados
     try:
         for fmt in formatos:
+            tiempo_parcial = f"{round(time.time() - start_time, 1)} s"
+            if fmt == "md ligero":
+                status = "Procesando: md ligero..."
+                cmd = ["marker_single", file_path, "--output_format", "markdown", "--skip_ocr", "--output_dir", output_dir]
+            elif fmt == "md completo":
+                status = "Procesando: md completo (con OCR)..."
+                cmd = ["marker_single", file_path, "--output_format", "markdown", "--output_dir", output_dir]
+            else:
+                status = f"Procesando: {fmt}..."
+                cmd = ["marker_single", file_path, "--output_format", fmt, "--output_dir", output_dir]
+            yield status, tiempo_parcial, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+            subprocess.run(cmd, check=True, timeout=900)
     except subprocess.TimeoutExpired:
+        yield "El procesamiento superó el límite de 15 minutos.", "", None, None, None
         return
     except subprocess.CalledProcessError as e:
+        yield f"Error ejecutando Marker: {str(e)}", "", None, None, None
         return
     except Exception as e:
+        yield f"Error inesperado: {str(e)}", "", None, None, None
         return
+    # Leer markdown si existe
     md_path = os.path.join(output_dir, f"{name_wo_ext}.md")
     md_content = ""
     if os.path.exists(md_path):
         try:
             with open(md_path, "r", encoding="utf-8") as f:
                 md_content = f.read()
+        except:
             md_content = ""
     # Generar resumen
+    tiempo_parcial = f"{round(time.time() - start_time, 1)} s"
     status = "Generando resumen..."
+    yield status, tiempo_parcial, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
     palabras = len(md_content.split()) if md_content else 0
     caracteres = len(md_content) if md_content else 0
     figuras = len([f for f in os.listdir(output_dir) if f.lower().endswith(".jpeg")])
     tablas = len([f for f in os.listdir(os.path.join(output_dir, "tables"))]) if os.path.exists(os.path.join(output_dir, "tables")) else 0
     tamaño_kb = os.path.getsize(md_path) / 1024 if os.path.exists(md_path) else 0
+    duracion_total = round(time.time() - start_time, 1)
     resumen = f"\n\n---\nResumen del procesamiento:\n"
     resumen += f"- Palabras detectadas: {palabras}\n"
     resumen += f"- Tamaño del archivo generado: {tamaño_kb:.2f} KB\n"
     resumen += f"- Imágenes extraídas: {figuras}\n"
     resumen += f"- Tablas detectadas: {tablas}\n"
+    resumen += f"- Tiempo total: {duracion_total} segundos"
+    # ZIP
     status = "Generando archivo ZIP..."
     zip_path = os.path.join(output_base, f"{name_wo_ext}.zip")
     shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=output_dir)
+    # Mostrar resultados
+    status = "Procesamiento completado."
+    final_md = md_content + resumen if os.path.exists(md_path) else resumen
+    yield status, f"{duracion_total} s", gr.update(value=md_path, visible=True), gr.update(value=zip_path, visible=True), final_md
 demo = gr.Interface(
     fn=procesar_pdf,
         gr.File(label="Sube un PDF (opcional)", file_types=[".pdf"]),
         gr.Textbox(label="O introduce una URL de PDF", placeholder="https://ejemplo.com/articulo.pdf"),
         gr.CheckboxGroup(
+            choices=["md", "md + ocr", "html", "json"],
+            value=["md ligero"],
+            label="Selecciona formatos de salida"
         )
     ],
     outputs=[
         gr.Textbox(label="Estado del procesamiento", lines=2),
+        gr.Textbox(label="Tiempo de ejecución estimado", interactive=False),
+        gr.File(label="Descargar Markdown (.md)", visible=False),
+        gr.File(label="Descargar ZIP completo", visible=False),
+        gr.Textbox(label="Contenido extraído en Markdown", lines=25, visible=False)
     ],
     title="Marker PDF",
+    description="Convierte artículos científicos en Markdown o Markdown completo con OCR, HTML o JSON usando Marker. Descarga el resultado y un ZIP completo.",
+    flagging_mode="never"
 )
 demo.launch()