Spaces:
Sleeping
Sleeping
Update app.py
Browse files
- change layout
- added light md
app.py
CHANGED
|
@@ -6,18 +6,15 @@ import gradio as gr
|
|
| 6 |
from urllib.parse import urlparse
|
| 7 |
import time
|
| 8 |
|
| 9 |
-
def procesar_pdf(pdf_file=None, url_pdf=None, formatos=["
|
| 10 |
output_base = "./marker_output"
|
| 11 |
os.makedirs(output_base, exist_ok=True)
|
| 12 |
start_time = time.time()
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
yield status, "", None, None
|
| 16 |
-
|
| 17 |
-
# Descargar o copiar PDF
|
| 18 |
if url_pdf:
|
| 19 |
status = "Descargando PDF desde la URL..."
|
| 20 |
-
yield status, "", None, None
|
| 21 |
parsed = urlparse(url_pdf)
|
| 22 |
file_name = os.path.basename(parsed.path) or "documento.pdf"
|
| 23 |
if not file_name.endswith(".pdf"):
|
|
@@ -26,70 +23,83 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=["markdown"]):
|
|
| 26 |
try:
|
| 27 |
response = requests.get(url_pdf)
|
| 28 |
if response.status_code != 200 or b"%PDF" not in response.content[:1024]:
|
| 29 |
-
yield "La URL no contiene un PDF válido.", "",
|
| 30 |
return
|
| 31 |
with open(file_path, "wb") as f:
|
| 32 |
f.write(response.content)
|
| 33 |
except Exception as e:
|
| 34 |
-
yield f"Error al descargar: {str(e)}", "", None, None
|
| 35 |
return
|
| 36 |
elif pdf_file:
|
| 37 |
status = "Guardando archivo PDF..."
|
| 38 |
-
|
| 39 |
try:
|
| 40 |
-
file_path = os.path.join(output_base, os.path.basename(pdf_file.name))
|
| 41 |
shutil.copyfile(pdf_file.name, file_path)
|
| 42 |
except Exception as e:
|
| 43 |
-
yield f"Error al guardar el PDF: {str(e)}", "", None, None
|
| 44 |
return
|
| 45 |
else:
|
| 46 |
-
yield "No se proporcionó archivo ni URL.", "", None, None
|
| 47 |
return
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
name_wo_ext = os.path.splitext(os.path.basename(file_path))[0]
|
| 50 |
output_dir = os.path.join(output_base, name_wo_ext)
|
| 51 |
if os.path.exists(output_dir):
|
| 52 |
shutil.rmtree(output_dir)
|
| 53 |
os.makedirs(output_dir, exist_ok=True)
|
| 54 |
|
|
|
|
| 55 |
try:
|
| 56 |
for fmt in formatos:
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
["marker_single", file_path, "--output_format",
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
except subprocess.TimeoutExpired:
|
| 65 |
-
yield "El procesamiento superó el límite de 15 minutos.", "", None, None
|
| 66 |
return
|
| 67 |
except subprocess.CalledProcessError as e:
|
| 68 |
-
yield f"Error ejecutando Marker: {str(e)}", "", None, None
|
| 69 |
return
|
| 70 |
except Exception as e:
|
| 71 |
-
yield f"Error inesperado: {str(e)}", "", None, None
|
| 72 |
return
|
| 73 |
|
| 74 |
-
# Leer
|
| 75 |
md_path = os.path.join(output_dir, f"{name_wo_ext}.md")
|
| 76 |
md_content = ""
|
| 77 |
if os.path.exists(md_path):
|
| 78 |
try:
|
| 79 |
with open(md_path, "r", encoding="utf-8") as f:
|
| 80 |
md_content = f.read()
|
| 81 |
-
except
|
| 82 |
md_content = ""
|
| 83 |
|
| 84 |
# Generar resumen
|
|
|
|
| 85 |
status = "Generando resumen..."
|
| 86 |
-
yield status,
|
|
|
|
| 87 |
palabras = len(md_content.split()) if md_content else 0
|
| 88 |
caracteres = len(md_content) if md_content else 0
|
| 89 |
figuras = len([f for f in os.listdir(output_dir) if f.lower().endswith(".jpeg")])
|
| 90 |
tablas = len([f for f in os.listdir(os.path.join(output_dir, "tables"))]) if os.path.exists(os.path.join(output_dir, "tables")) else 0
|
| 91 |
tamaño_kb = os.path.getsize(md_path) / 1024 if os.path.exists(md_path) else 0
|
| 92 |
-
|
| 93 |
|
| 94 |
resumen = f"\n\n---\nResumen del procesamiento:\n"
|
| 95 |
resumen += f"- Palabras detectadas: {palabras}\n"
|
|
@@ -97,18 +107,17 @@ def procesar_pdf(pdf_file=None, url_pdf=None, formatos=["markdown"]):
|
|
| 97 |
resumen += f"- Tamaño del archivo generado: {tamaño_kb:.2f} KB\n"
|
| 98 |
resumen += f"- Imágenes extraídas: {figuras}\n"
|
| 99 |
resumen += f"- Tablas detectadas: {tablas}\n"
|
| 100 |
-
resumen += f"- Tiempo total: {
|
| 101 |
|
| 102 |
-
#
|
| 103 |
status = "Generando archivo ZIP..."
|
| 104 |
-
yield status, "", None, None
|
| 105 |
zip_path = os.path.join(output_base, f"{name_wo_ext}.zip")
|
| 106 |
shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=output_dir)
|
| 107 |
|
| 108 |
-
#
|
| 109 |
-
status =
|
| 110 |
-
|
| 111 |
-
yield status,
|
| 112 |
|
| 113 |
demo = gr.Interface(
|
| 114 |
fn=procesar_pdf,
|
|
@@ -116,21 +125,21 @@ demo = gr.Interface(
|
|
| 116 |
gr.File(label="Sube un PDF (opcional)", file_types=[".pdf"]),
|
| 117 |
gr.Textbox(label="O introduce una URL de PDF", placeholder="https://ejemplo.com/articulo.pdf"),
|
| 118 |
gr.CheckboxGroup(
|
| 119 |
-
choices=["
|
| 120 |
-
value=["
|
| 121 |
-
label="Selecciona formatos de salida"
|
| 122 |
-
info="Puedes elegir uno o varios formatos"
|
| 123 |
)
|
| 124 |
],
|
| 125 |
outputs=[
|
| 126 |
gr.Textbox(label="Estado del procesamiento", lines=2),
|
| 127 |
-
gr.Textbox(label="
|
| 128 |
-
gr.File(label="Descargar
|
| 129 |
-
gr.File(label="Descargar
|
|
|
|
| 130 |
],
|
| 131 |
title="Marker PDF",
|
| 132 |
-
description="Convierte artículos científicos en Markdown, HTML o JSON usando Marker. Descarga el
|
| 133 |
-
|
| 134 |
)
|
| 135 |
|
| 136 |
demo.launch()
|
|
|
|
| 6 |
from urllib.parse import urlparse
|
| 7 |
import time
|
| 8 |
|
| 9 |
+
def procesar_pdf(pdf_file=None, url_pdf=None, formatos=[], verbose="auto"):
|
| 10 |
output_base = "./marker_output"
|
| 11 |
os.makedirs(output_base, exist_ok=True)
|
| 12 |
start_time = time.time()
|
| 13 |
+
tiempo_parcial = "0.0 s"
|
| 14 |
|
| 15 |
+
# Guardar archivo
|
|
|
|
|
|
|
|
|
|
| 16 |
if url_pdf:
|
| 17 |
status = "Descargando PDF desde la URL..."
|
|
|
|
| 18 |
parsed = urlparse(url_pdf)
|
| 19 |
file_name = os.path.basename(parsed.path) or "documento.pdf"
|
| 20 |
if not file_name.endswith(".pdf"):
|
|
|
|
| 23 |
try:
|
| 24 |
response = requests.get(url_pdf)
|
| 25 |
if response.status_code != 200 or b"%PDF" not in response.content[:1024]:
|
| 26 |
+
yield "La URL no contiene un PDF válido.", "", gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
|
| 27 |
return
|
| 28 |
with open(file_path, "wb") as f:
|
| 29 |
f.write(response.content)
|
| 30 |
except Exception as e:
|
| 31 |
+
yield f"Error al descargar: {str(e)}", "", None, None, None
|
| 32 |
return
|
| 33 |
elif pdf_file:
|
| 34 |
status = "Guardando archivo PDF..."
|
| 35 |
+
file_path = os.path.join(output_base, os.path.basename(pdf_file.name))
|
| 36 |
try:
|
|
|
|
| 37 |
shutil.copyfile(pdf_file.name, file_path)
|
| 38 |
except Exception as e:
|
| 39 |
+
yield f"Error al guardar el PDF: {str(e)}", "", None, None, None
|
| 40 |
return
|
| 41 |
else:
|
| 42 |
+
yield "No se proporcionó archivo ni URL.", "", None, None, None
|
| 43 |
return
|
| 44 |
|
| 45 |
+
# AVISO si el archivo es pesado (> 3 MB)
|
| 46 |
+
if os.path.getsize(file_path) > 3 * 1024 * 1024:
|
| 47 |
+
yield "Aviso: este PDF puede tardar más de lo normal debido a su tamaño o complejidad.", "", None, None, None
|
| 48 |
+
|
| 49 |
name_wo_ext = os.path.splitext(os.path.basename(file_path))[0]
|
| 50 |
output_dir = os.path.join(output_base, name_wo_ext)
|
| 51 |
if os.path.exists(output_dir):
|
| 52 |
shutil.rmtree(output_dir)
|
| 53 |
os.makedirs(output_dir, exist_ok=True)
|
| 54 |
|
| 55 |
+
# Procesar formatos seleccionados
|
| 56 |
try:
|
| 57 |
for fmt in formatos:
|
| 58 |
+
tiempo_parcial = f"{round(time.time() - start_time, 1)} s"
|
| 59 |
+
if fmt == "md ligero":
|
| 60 |
+
status = "Procesando: md ligero..."
|
| 61 |
+
cmd = ["marker_single", file_path, "--output_format", "markdown", "--skip_ocr", "--output_dir", output_dir]
|
| 62 |
+
elif fmt == "md completo":
|
| 63 |
+
status = "Procesando: md completo (con OCR)..."
|
| 64 |
+
cmd = ["marker_single", file_path, "--output_format", "markdown", "--output_dir", output_dir]
|
| 65 |
+
else:
|
| 66 |
+
status = f"Procesando: {fmt}..."
|
| 67 |
+
cmd = ["marker_single", file_path, "--output_format", fmt, "--output_dir", output_dir]
|
| 68 |
+
|
| 69 |
+
yield status, tiempo_parcial, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
|
| 70 |
+
subprocess.run(cmd, check=True, timeout=900)
|
| 71 |
+
|
| 72 |
except subprocess.TimeoutExpired:
|
| 73 |
+
yield "El procesamiento superó el límite de 15 minutos.", "", None, None, None
|
| 74 |
return
|
| 75 |
except subprocess.CalledProcessError as e:
|
| 76 |
+
yield f"Error ejecutando Marker: {str(e)}", "", None, None, None
|
| 77 |
return
|
| 78 |
except Exception as e:
|
| 79 |
+
yield f"Error inesperado: {str(e)}", "", None, None, None
|
| 80 |
return
|
| 81 |
|
| 82 |
+
# Leer markdown si existe
|
| 83 |
md_path = os.path.join(output_dir, f"{name_wo_ext}.md")
|
| 84 |
md_content = ""
|
| 85 |
if os.path.exists(md_path):
|
| 86 |
try:
|
| 87 |
with open(md_path, "r", encoding="utf-8") as f:
|
| 88 |
md_content = f.read()
|
| 89 |
+
except:
|
| 90 |
md_content = ""
|
| 91 |
|
| 92 |
# Generar resumen
|
| 93 |
+
tiempo_parcial = f"{round(time.time() - start_time, 1)} s"
|
| 94 |
status = "Generando resumen..."
|
| 95 |
+
yield status, tiempo_parcial, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
|
| 96 |
+
|
| 97 |
palabras = len(md_content.split()) if md_content else 0
|
| 98 |
caracteres = len(md_content) if md_content else 0
|
| 99 |
figuras = len([f for f in os.listdir(output_dir) if f.lower().endswith(".jpeg")])
|
| 100 |
tablas = len([f for f in os.listdir(os.path.join(output_dir, "tables"))]) if os.path.exists(os.path.join(output_dir, "tables")) else 0
|
| 101 |
tamaño_kb = os.path.getsize(md_path) / 1024 if os.path.exists(md_path) else 0
|
| 102 |
+
duracion_total = round(time.time() - start_time, 1)
|
| 103 |
|
| 104 |
resumen = f"\n\n---\nResumen del procesamiento:\n"
|
| 105 |
resumen += f"- Palabras detectadas: {palabras}\n"
|
|
|
|
| 107 |
resumen += f"- Tamaño del archivo generado: {tamaño_kb:.2f} KB\n"
|
| 108 |
resumen += f"- Imágenes extraídas: {figuras}\n"
|
| 109 |
resumen += f"- Tablas detectadas: {tablas}\n"
|
| 110 |
+
resumen += f"- Tiempo total: {duracion_total} segundos"
|
| 111 |
|
| 112 |
+
# ZIP
|
| 113 |
status = "Generando archivo ZIP..."
|
|
|
|
| 114 |
zip_path = os.path.join(output_base, f"{name_wo_ext}.zip")
|
| 115 |
shutil.make_archive(base_name=zip_path.replace(".zip", ""), format="zip", root_dir=output_dir)
|
| 116 |
|
| 117 |
+
# Mostrar resultados
|
| 118 |
+
status = "Procesamiento completado."
|
| 119 |
+
final_md = md_content + resumen if os.path.exists(md_path) else resumen
|
| 120 |
+
yield status, f"{duracion_total} s", gr.update(value=md_path, visible=True), gr.update(value=zip_path, visible=True), final_md
|
| 121 |
|
| 122 |
demo = gr.Interface(
|
| 123 |
fn=procesar_pdf,
|
|
|
|
| 125 |
gr.File(label="Sube un PDF (opcional)", file_types=[".pdf"]),
|
| 126 |
gr.Textbox(label="O introduce una URL de PDF", placeholder="https://ejemplo.com/articulo.pdf"),
|
| 127 |
gr.CheckboxGroup(
|
| 128 |
+
choices=["md", "md + ocr", "html", "json"],
|
| 129 |
+
value=["md ligero"],
|
| 130 |
+
label="Selecciona formatos de salida"
|
|
|
|
| 131 |
)
|
| 132 |
],
|
| 133 |
outputs=[
|
| 134 |
gr.Textbox(label="Estado del procesamiento", lines=2),
|
| 135 |
+
gr.Textbox(label="Tiempo de ejecución estimado", interactive=False),
|
| 136 |
+
gr.File(label="Descargar Markdown (.md)", visible=False),
|
| 137 |
+
gr.File(label="Descargar ZIP completo", visible=False),
|
| 138 |
+
gr.Textbox(label="Contenido extraído en Markdown", lines=25, visible=False)
|
| 139 |
],
|
| 140 |
title="Marker PDF",
|
| 141 |
+
description="Convierte artículos científicos en Markdown o Markdown completo con OCR, HTML o JSON usando Marker. Descarga el resultado y un ZIP completo.",
|
| 142 |
+
flagging_mode="never"
|
| 143 |
)
|
| 144 |
|
| 145 |
demo.launch()
|