Spaces:

NoeGH
/

libretranslate-prodoc

Running

App Files Community

AdVision AI committed 1 day ago

Commit

da3fcd4

1 Parent(s): fb62447

refactor: implement structural conversion with pdf2docx

Browse files

Files changed (3) hide show

.gitignore +1 -0
app.py +42 -202
requirements.txt +2 -2

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ keys.txt

app.py CHANGED Viewed

@@ -680,46 +680,6 @@ def translate_text(text: str, from_code: str, to_code: str) -> str:
 #   3. Escribe la traducción encima en la misma posición
 # =============================================================================
-def _get_font_for_language(to_code: str) -> tuple[str, str]:
-    """
-    Descarga o ubica una fuente TTF compatible con el idioma de destino.
-    Esencial para que fpdf2 soporte caracteres CJK, Árabes y otros.
-    """
-    import urllib.request
-    import platform
-    font_dir = Path(tempfile.gettempdir()) / "fonts"
-    font_dir.mkdir(exist_ok=True)
-    if to_code in ["zh", "ja", "ko"]:
-        font_name = "NotoSansCJK"
-        url = "https://github.com/google/fonts/raw/main/ofl/notosanssc/NotoSansSC-Regular.ttf"
-        font_filename = "NotoSansSC-Regular.ttf"
-    elif to_code == "ar":
-        font_name = "NotoSansArabic"
-        url = "https://github.com/google/fonts/raw/main/ofl/notosansarabic/NotoSansArabic-Regular.ttf"
-        font_filename = "NotoSansArabic-Regular.ttf"
-    else:
-        font_name = "NotoSans"
-        url = "https://github.com/google/fonts/raw/main/ofl/notosans/NotoSans-Regular.ttf"
-        font_filename = "NotoSans-Regular.ttf"
-    font_path = font_dir / font_filename
-    if not font_path.exists():
-        try:
-            logger.info(f"Descargando fuente {font_name}...")
-            urllib.request.urlretrieve(url, str(font_path))
-        except Exception as e:
-            logger.warning(f"Error descargando fuente, fallback: {e}")
-            if platform.system() == "Windows":
-                 return "C:\\Windows\\Fonts\\arial.ttf", "Arial"
-            else:
-                 return "", "" # Default a core fonts en Linux
-    return str(font_path), font_name
 def translate_pdf(
     input_path: str,
     from_code: str,
@@ -728,173 +688,52 @@ def translate_pdf(
     status_text
 ) -> str:
     """
-    Reconstruye el PDF traduciendo el texto e intercalando las imágenes en orden secuencial.
-    Incluye fallback OCR (Tesseract) si el PDF es solo un escaneo.
     """
     try:
-        import fitz  # PyMuPDF
-        from fpdf import FPDF
-        # Mapeo para motor OCR
-        def _map_tess_lang(lang):
-            m = {"es":"spa","en":"eng","fr":"fra","de":"deu","it":"ita",
-                 "pt":"por","ru":"rus","zh":"chi_sim","ja":"jpn","ko":"kor","ar":"ara"}
-            return m.get(lang, "eng")
-        tess_lang = _map_tess_lang(from_code)
-        doc = fitz.open(input_path)
-        total_pages = len(doc)
-        if total_pages == 0:
-            raise ValueError("El PDF no contiene páginas.")
         status_text.markdown(
-            f'<div class="status-badge processing">📄 Analizando {total_pages} página(s)...</div>',
             unsafe_allow_html=True
         )
-        content_sequence = []
-        # 1. Extracción Secuencial y OCR Inteligente
-        for page_num in range(total_pages):
-            page = doc[page_num]
-            raw_text = page.get_text("text").strip()
-            imgs = page.get_images(full=True)
-            # Detectar escaneo (pocos caracteres, contiene imágenes)
-            if len(raw_text) < 50 and len(imgs) > 0:
-                progress_bar.progress((page_num) / total_pages, text=f"OCR en página {page_num+1}...")
-                try:
-                    tp = page.get_textpage_ocr(flags=0, language=tess_lang, dpi=150)
-                    page_dict = page.get_text("dict", textpage=tp, sort=True)
-                except Exception as e:
-                    logger.warning(f"Error OCR pág {page_num+1}: {e}")
-                    page_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_IMAGES, sort=True)
-            else:
-                progress_bar.progress((page_num) / total_pages, text=f"Analizando página {page_num+1}...")
-                page_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_IMAGES, sort=True)
-            for block in page_dict.get("blocks", []):
-                b_type = block.get("type", 0)
-                if b_type == 0:  # Bloque de Texto
-                    block_text = ""
-                    sizes = []
-                    for line in block.get("lines", []):
-                        for span in line.get("spans", []):
-                            txt = span.get("text", "")
-                            block_text += txt + " "
-                            sizes.append(span.get("size", 11))
-                    block_text = block_text.strip()
-                    if block_text and len(block_text) > 1:
-                        avg_size = sum(sizes) / len(sizes) if sizes else 11
-                        avg_size = min(max(avg_size, 9), 32) # Tamaño razonable
-                        content_sequence.append({
-                            "type": "text",
-                            "original": block_text,
-                            "size": avg_size
-                        })
-                elif b_type == 1:  # Bloque de Imagen
-                    img_bytes = block.get("image")
-                    ext = block.get("ext", "jpeg")
-                    if img_bytes:
-                        content_sequence.append({
-                            "type": "image",
-                            "bytes": img_bytes,
-                            "ext": ext
-                        })
-        # 2. Traducción en Lote Altamente Optimizada
-        texts_to_translate = [item["original"] for item in content_sequence if item["type"] == "text"]
-        translated_texts = []
-        if texts_to_translate:
-            status_text.markdown(
-                f'<div class="status-badge processing pulse">🔄 Traducción en lote ({len(texts_to_translate)} bloques)...</div>',
-                unsafe_allow_html=True
-            )
-            progress_bar.progress(0.85, text="Enviando textos a motor neuronal...")
-            translated_texts = translate_batch(texts_to_translate, from_code, to_code)
-        # Asignar textos de vuelta
-        txt_idx = 0
-        for item in content_sequence:
-            if item["type"] == "text":
-                item["translated"] = translated_texts[txt_idx]
-                txt_idx += 1
-        # 3. Ensamblado del Nuevo PDF Dinámico
-        progress_bar.progress(0.95, text="Armando documento secuencial...")
-        pdf = FPDF()
-        pdf.set_auto_page_break(auto=True, margin=15)
-        pdf.add_page()
-        # Cargar fuentes del idioma meta
-        font_path, font_family = _get_font_for_language(to_code)
-        has_font = False
-        if font_path and os.path.exists(font_path):
-            try:
-                pdf.add_font(font_family, style="", fname=font_path)
-                has_font = True
-            except Exception as e:
-                logger.warning(f"Error agregando fuente FPDF: {e}")
-        for item in content_sequence:
-            if item["type"] == "text":
-                text = item["translated"]
-                size = item["size"]
-                if has_font:
-                    pdf.set_font(font_family, size=size)
-                else:
-                    pdf.set_font("Helvetica", size=size)
-                try:
-                    # Renderizar texto limpiando errores de Unicode
-                    safe_text = text.encode('utf-16', 'surrogatepass').decode('utf-16')
-                    pdf.multi_cell(0, max(5, size * 0.35), safe_text)
-                except Exception as e:
-                    fallback_text = text.encode('ascii', 'ignore').decode('ascii')
-                    try:
-                        pdf.multi_cell(0, 6, fallback_text)
-                    except: pass
-                pdf.ln(3)  # Margen inferor del bloque
-            elif item["type"] == "image":
-                import uuid
-                img_ext = item["ext"]
-                if "/" in img_ext: img_ext = img_ext.split("/")[-1]
-                tmp_img = os.path.join(tempfile.gettempdir(), f"img_{uuid.uuid4().hex}.{img_ext}")
-                with open(tmp_img, "wb") as f:
-                    f.write(item["bytes"])
-                try:
-                    # Adaptar el ancho hasta un maximo de la pagina
-                    pdf.image(tmp_img, x="C", w=170)
-                    pdf.ln(5)
-                except Exception as e:
-                    logger.warning(f"Fallo al incrustar imagen: {e}")
-                finally:
-                    if os.path.exists(tmp_img): os.remove(tmp_img)
-        # 4. Guardar Resultado Final
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        output_filename = f"traducido_seq_{from_code}_to_{to_code}_{timestamp}.pdf"
-        output_path = os.path.join(tempfile.gettempdir(), output_filename)
-        pdf.output(output_path)
-        doc.close()
-        schedule_file_deletion(output_path)
-        logger.info(f"✅ Nuevo PDF Dinámico Generado: {output_path}")
-        return output_path
     except MemoryError:
-        raise MemoryError("El proceso excede la RAM permitida. PDF demasiado complejo.")
     except Exception as e:
-        logger.error(f"Error en reconstrucción PDF: {e}", exc_info=True)
         raise
@@ -1436,7 +1275,10 @@ def _process_translation(
         # Nombre sugerido para la descarga
         original_stem = Path(uploaded_file.name).stem
-        download_name = f"{original_stem}_traducido_{to_code}{file_ext}"
         # ── Card de resultado ──────────────────────────────────────────────
         st.markdown(
@@ -1448,12 +1290,10 @@ def _process_translation(
         col_dl, col_info = st.columns([1, 2])
         with col_dl:
-            # Determinar el MIME type para la descarga
-            mime_type = "application/pdf" if file_ext == ".pdf" else (
-                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-            )
             st.download_button(
-                label=f"⬇️ Descargar {file_ext.upper()[1:]} Traducido",
                 data=translated_bytes,
                 file_name=download_name,
                 mime=mime_type,

 #   3. Escribe la traducción encima en la misma posición
 # =============================================================================
 def translate_pdf(
     input_path: str,
     from_code: str,
     status_text
 ) -> str:
     """
+    Convierte el archivo PDF a DOCX estructuralmente preservando tablas e imágenes.
+    Luego lo traduce utilizando la función DOCX y devuelve el archivo resultante.
     """
     try:
+        from pdf2docx import Converter
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        docx_path = os.path.join(tempfile.gettempdir(), f"intermediate_{timestamp}.docx")
         status_text.markdown(
+            f'<div class="status-badge processing pulse">📄 Convirtiendo Estructura de PDF a Word...</div>',
             unsafe_allow_html=True
         )
+        progress_bar.progress(0.1, text="Analizando tablas, columnas e imágenes... (Esto tomará unos segundos)")
+        # Conversión estructural con pdf2docx
+        cv = Converter(input_path)
+        # convert soporta start y end, usamos todo
+        cv.convert(docx_path)
+        cv.close()
+        status_text.markdown(
+            f'<div class="status-badge success">✅ Conversión estructural completa.</div>',
+            unsafe_allow_html=True
+        )
+        progress_bar.progress(0.3, text="Traduciendo estructura DOCX generada...")
+        # Traducir el DOCX usando la función experta
+        output_docx_path = translate_docx(
+            input_path=docx_path,
+            from_code=from_code,
+            to_code=to_code,
+            progress_bar=progress_bar,
+            status_text=status_text
+        )
+        # Limpiar el pdf convertido
+        schedule_file_deletion(docx_path)
+        logger.info(f"✅ PDF convertido y traducido (Salida DOCX): {output_docx_path}")
+        return output_docx_path
     except MemoryError:
+        raise MemoryError("Proceso abortado por falta de memoria RAM al convertir PDF a DOCX.")
     except Exception as e:
+        logger.error(f"Error en proxy PDF a DOCX: {e}", exc_info=True)
         raise
         # Nombre sugerido para la descarga
         original_stem = Path(uploaded_file.name).stem
+        # Debido a la conversión estructural universal, la salida final es siempre un archivo DOCX
+        output_ext = ".docx"
+        download_name = f"{original_stem}_traducido_{to_code}{output_ext}"
         # ── Card de resultado ──────────────────────────────────────────────
         st.markdown(
         col_dl, col_info = st.columns([1, 2])
         with col_dl:
+            # Determinar el MIME type para la descarga (Siempre DOCX)
+            mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
             st.download_button(
+                label=f"⬇️ Descargar Documento (Word)",
                 data=translated_bytes,
                 file_name=download_name,
                 mime=mime_type,

requirements.txt CHANGED Viewed

@@ -27,8 +27,8 @@ sacremoses>=0.1.1
 # para extracción de texto por coordenadas y dibujo sobre páginas
 PyMuPDF>=1.24.0
-# fpdf2: Generador de PDFs dinámicos con soporte Unicode, imágenes y saltos de página
-fpdf2>=2.7.0
 # ── Procesamiento de Word (.docx) ──────────────────────────────────────────
 # python-docx: Leer y escribir documentos Word preservando formato

 # para extracción de texto por coordenadas y dibujo sobre páginas
 PyMuPDF>=1.24.0
+# pdf2docx: Conversión estructural de PDF a Word manteniendo tablas e imágenes
+pdf2docx>=0.5.6
 # ── Procesamiento de Word (.docx) ──────────────────────────────────────────
 # python-docx: Leer y escribir documentos Word preservando formato