Spaces:
Running
Running
AdVision AI committed on
Commit ·
da3fcd4
1
Parent(s): fb62447
refactor: implement structural conversion with pdf2docx
Browse files- .gitignore +1 -0
- app.py +42 -202
- requirements.txt +2 -2
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
keys.txt
|
app.py
CHANGED
|
@@ -680,46 +680,6 @@ def translate_text(text: str, from_code: str, to_code: str) -> str:
|
|
| 680 |
# 3. Escribe la traducción encima en la misma posición
|
| 681 |
# =============================================================================
|
| 682 |
|
| 683 |
-
def _get_font_for_language(to_code: str) -> tuple[str, str]:
|
| 684 |
-
"""
|
| 685 |
-
Descarga o ubica una fuente TTF compatible con el idioma de destino.
|
| 686 |
-
Esencial para que fpdf2 soporte caracteres CJK, Árabes y otros.
|
| 687 |
-
"""
|
| 688 |
-
import urllib.request
|
| 689 |
-
import platform
|
| 690 |
-
|
| 691 |
-
font_dir = Path(tempfile.gettempdir()) / "fonts"
|
| 692 |
-
font_dir.mkdir(exist_ok=True)
|
| 693 |
-
|
| 694 |
-
if to_code in ["zh", "ja", "ko"]:
|
| 695 |
-
font_name = "NotoSansCJK"
|
| 696 |
-
url = "https://github.com/google/fonts/raw/main/ofl/notosanssc/NotoSansSC-Regular.ttf"
|
| 697 |
-
font_filename = "NotoSansSC-Regular.ttf"
|
| 698 |
-
elif to_code == "ar":
|
| 699 |
-
font_name = "NotoSansArabic"
|
| 700 |
-
url = "https://github.com/google/fonts/raw/main/ofl/notosansarabic/NotoSansArabic-Regular.ttf"
|
| 701 |
-
font_filename = "NotoSansArabic-Regular.ttf"
|
| 702 |
-
else:
|
| 703 |
-
font_name = "NotoSans"
|
| 704 |
-
url = "https://github.com/google/fonts/raw/main/ofl/notosans/NotoSans-Regular.ttf"
|
| 705 |
-
font_filename = "NotoSans-Regular.ttf"
|
| 706 |
-
|
| 707 |
-
font_path = font_dir / font_filename
|
| 708 |
-
|
| 709 |
-
if not font_path.exists():
|
| 710 |
-
try:
|
| 711 |
-
logger.info(f"Descargando fuente {font_name}...")
|
| 712 |
-
urllib.request.urlretrieve(url, str(font_path))
|
| 713 |
-
except Exception as e:
|
| 714 |
-
logger.warning(f"Error descargando fuente, fallback: {e}")
|
| 715 |
-
if platform.system() == "Windows":
|
| 716 |
-
return "C:\\Windows\\Fonts\\arial.ttf", "Arial"
|
| 717 |
-
else:
|
| 718 |
-
return "", "" # Default a core fonts en Linux
|
| 719 |
-
|
| 720 |
-
return str(font_path), font_name
|
| 721 |
-
|
| 722 |
-
|
| 723 |
def translate_pdf(
|
| 724 |
input_path: str,
|
| 725 |
from_code: str,
|
|
@@ -728,173 +688,52 @@ def translate_pdf(
|
|
| 728 |
status_text
|
| 729 |
) -> str:
|
| 730 |
"""
|
| 731 |
-
|
| 732 |
-
|
| 733 |
"""
|
| 734 |
try:
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
def _map_tess_lang(lang):
|
| 740 |
-
m = {"es":"spa","en":"eng","fr":"fra","de":"deu","it":"ita",
|
| 741 |
-
"pt":"por","ru":"rus","zh":"chi_sim","ja":"jpn","ko":"kor","ar":"ara"}
|
| 742 |
-
return m.get(lang, "eng")
|
| 743 |
-
|
| 744 |
-
tess_lang = _map_tess_lang(from_code)
|
| 745 |
-
doc = fitz.open(input_path)
|
| 746 |
-
total_pages = len(doc)
|
| 747 |
-
|
| 748 |
-
if total_pages == 0:
|
| 749 |
-
raise ValueError("El PDF no contiene páginas.")
|
| 750 |
|
| 751 |
status_text.markdown(
|
| 752 |
-
f'<div class="status-badge processing">π
|
| 753 |
unsafe_allow_html=True
|
| 754 |
)
|
|
|
|
| 755 |
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
#
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
raw_text = page.get_text("text").strip()
|
| 763 |
-
imgs = page.get_images(full=True)
|
| 764 |
-
|
| 765 |
-
# Detectar escaneo (pocos caracteres, contiene imágenes)
|
| 766 |
-
if len(raw_text) < 50 and len(imgs) > 0:
|
| 767 |
-
progress_bar.progress((page_num) / total_pages, text=f"OCR en página {page_num+1}...")
|
| 768 |
-
try:
|
| 769 |
-
tp = page.get_textpage_ocr(flags=0, language=tess_lang, dpi=150)
|
| 770 |
-
page_dict = page.get_text("dict", textpage=tp, sort=True)
|
| 771 |
-
except Exception as e:
|
| 772 |
-
logger.warning(f"Error OCR pág {page_num+1}: {e}")
|
| 773 |
-
page_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_IMAGES, sort=True)
|
| 774 |
-
else:
|
| 775 |
-
progress_bar.progress((page_num) / total_pages, text=f"Analizando página {page_num+1}...")
|
| 776 |
-
page_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_IMAGES, sort=True)
|
| 777 |
-
|
| 778 |
-
for block in page_dict.get("blocks", []):
|
| 779 |
-
b_type = block.get("type", 0)
|
| 780 |
-
if b_type == 0: # Bloque de Texto
|
| 781 |
-
block_text = ""
|
| 782 |
-
sizes = []
|
| 783 |
-
for line in block.get("lines", []):
|
| 784 |
-
for span in line.get("spans", []):
|
| 785 |
-
txt = span.get("text", "")
|
| 786 |
-
block_text += txt + " "
|
| 787 |
-
sizes.append(span.get("size", 11))
|
| 788 |
-
|
| 789 |
-
block_text = block_text.strip()
|
| 790 |
-
if block_text and len(block_text) > 1:
|
| 791 |
-
avg_size = sum(sizes) / len(sizes) if sizes else 11
|
| 792 |
-
avg_size = min(max(avg_size, 9), 32) # Tamaño razonable
|
| 793 |
-
content_sequence.append({
|
| 794 |
-
"type": "text",
|
| 795 |
-
"original": block_text,
|
| 796 |
-
"size": avg_size
|
| 797 |
-
})
|
| 798 |
-
elif b_type == 1: # Bloque de Imagen
|
| 799 |
-
img_bytes = block.get("image")
|
| 800 |
-
ext = block.get("ext", "jpeg")
|
| 801 |
-
if img_bytes:
|
| 802 |
-
content_sequence.append({
|
| 803 |
-
"type": "image",
|
| 804 |
-
"bytes": img_bytes,
|
| 805 |
-
"ext": ext
|
| 806 |
-
})
|
| 807 |
-
|
| 808 |
-
# 2. Traducción en Lote Altamente Optimizada
|
| 809 |
-
texts_to_translate = [item["original"] for item in content_sequence if item["type"] == "text"]
|
| 810 |
-
translated_texts = []
|
| 811 |
-
if texts_to_translate:
|
| 812 |
-
status_text.markdown(
|
| 813 |
-
f'<div class="status-badge processing pulse">π Traducción en lote ({len(texts_to_translate)} bloques)...</div>',
|
| 814 |
-
unsafe_allow_html=True
|
| 815 |
-
)
|
| 816 |
-
progress_bar.progress(0.85, text="Enviando textos a motor neuronal...")
|
| 817 |
-
translated_texts = translate_batch(texts_to_translate, from_code, to_code)
|
| 818 |
-
|
| 819 |
-
# Asignar textos de vuelta
|
| 820 |
-
txt_idx = 0
|
| 821 |
-
for item in content_sequence:
|
| 822 |
-
if item["type"] == "text":
|
| 823 |
-
item["translated"] = translated_texts[txt_idx]
|
| 824 |
-
txt_idx += 1
|
| 825 |
-
|
| 826 |
-
# 3. Ensamblado del Nuevo PDF Dinámico
|
| 827 |
-
progress_bar.progress(0.95, text="Armando documento secuencial...")
|
| 828 |
-
|
| 829 |
-
pdf = FPDF()
|
| 830 |
-
pdf.set_auto_page_break(auto=True, margin=15)
|
| 831 |
-
pdf.add_page()
|
| 832 |
-
|
| 833 |
-
# Cargar fuentes del idioma meta
|
| 834 |
-
font_path, font_family = _get_font_for_language(to_code)
|
| 835 |
-
has_font = False
|
| 836 |
-
if font_path and os.path.exists(font_path):
|
| 837 |
-
try:
|
| 838 |
-
pdf.add_font(font_family, style="", fname=font_path)
|
| 839 |
-
has_font = True
|
| 840 |
-
except Exception as e:
|
| 841 |
-
logger.warning(f"Error agregando fuente FPDF: {e}")
|
| 842 |
-
|
| 843 |
-
for item in content_sequence:
|
| 844 |
-
if item["type"] == "text":
|
| 845 |
-
text = item["translated"]
|
| 846 |
-
size = item["size"]
|
| 847 |
-
|
| 848 |
-
if has_font:
|
| 849 |
-
pdf.set_font(font_family, size=size)
|
| 850 |
-
else:
|
| 851 |
-
pdf.set_font("Helvetica", size=size)
|
| 852 |
-
|
| 853 |
-
try:
|
| 854 |
-
# Renderizar texto limpiando errores de Unicode
|
| 855 |
-
safe_text = text.encode('utf-16', 'surrogatepass').decode('utf-16')
|
| 856 |
-
pdf.multi_cell(0, max(5, size * 0.35), safe_text)
|
| 857 |
-
except Exception as e:
|
| 858 |
-
fallback_text = text.encode('ascii', 'ignore').decode('ascii')
|
| 859 |
-
try:
|
| 860 |
-
pdf.multi_cell(0, 6, fallback_text)
|
| 861 |
-
except: pass
|
| 862 |
-
|
| 863 |
-
pdf.ln(3) # Margen inferior del bloque
|
| 864 |
-
|
| 865 |
-
elif item["type"] == "image":
|
| 866 |
-
import uuid
|
| 867 |
-
img_ext = item["ext"]
|
| 868 |
-
if "/" in img_ext: img_ext = img_ext.split("/")[-1]
|
| 869 |
-
|
| 870 |
-
tmp_img = os.path.join(tempfile.gettempdir(), f"img_{uuid.uuid4().hex}.{img_ext}")
|
| 871 |
-
with open(tmp_img, "wb") as f:
|
| 872 |
-
f.write(item["bytes"])
|
| 873 |
-
try:
|
| 874 |
-
# Adaptar el ancho hasta un maximo de la pagina
|
| 875 |
-
pdf.image(tmp_img, x="C", w=170)
|
| 876 |
-
pdf.ln(5)
|
| 877 |
-
except Exception as e:
|
| 878 |
-
logger.warning(f"Fallo al incrustar imagen: {e}")
|
| 879 |
-
finally:
|
| 880 |
-
if os.path.exists(tmp_img): os.remove(tmp_img)
|
| 881 |
-
|
| 882 |
-
# 4. Guardar Resultado Final
|
| 883 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 884 |
-
output_filename = f"traducido_seq_{from_code}_to_{to_code}_{timestamp}.pdf"
|
| 885 |
-
output_path = os.path.join(tempfile.gettempdir(), output_filename)
|
| 886 |
-
|
| 887 |
-
pdf.output(output_path)
|
| 888 |
-
doc.close()
|
| 889 |
-
schedule_file_deletion(output_path)
|
| 890 |
|
| 891 |
-
|
| 892 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 893 |
|
| 894 |
except MemoryError:
|
| 895 |
-
raise MemoryError("
|
| 896 |
except Exception as e:
|
| 897 |
-
logger.error(f"Error en
|
| 898 |
raise
|
| 899 |
|
| 900 |
|
|
@@ -1436,7 +1275,10 @@ def _process_translation(
|
|
| 1436 |
|
| 1437 |
# Nombre sugerido para la descarga
|
| 1438 |
original_stem = Path(uploaded_file.name).stem
|
| 1439 |
-
|
|
|
|
|
|
|
|
|
|
| 1440 |
|
| 1441 |
# ── Card de resultado ──────────────────────────────────────────────
|
| 1442 |
st.markdown(
|
|
@@ -1448,12 +1290,10 @@ def _process_translation(
|
|
| 1448 |
col_dl, col_info = st.columns([1, 2])
|
| 1449 |
|
| 1450 |
with col_dl:
|
| 1451 |
-
# Determinar el MIME type para la descarga
|
| 1452 |
-
mime_type = "application/
|
| 1453 |
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
| 1454 |
-
)
|
| 1455 |
st.download_button(
|
| 1456 |
-
label=f"⬇️ Descargar
|
| 1457 |
data=translated_bytes,
|
| 1458 |
file_name=download_name,
|
| 1459 |
mime=mime_type,
|
|
|
|
| 680 |
# 3. Escribe la traducción encima en la misma posición
|
| 681 |
# =============================================================================
|
| 682 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
def translate_pdf(
|
| 684 |
input_path: str,
|
| 685 |
from_code: str,
|
|
|
|
| 688 |
status_text
|
| 689 |
) -> str:
|
| 690 |
"""
|
| 691 |
+
Convierte el archivo PDF a DOCX estructuralmente preservando tablas e imágenes.
|
| 692 |
+
Luego lo traduce utilizando la función DOCX y devuelve el archivo resultante.
|
| 693 |
"""
|
| 694 |
try:
|
| 695 |
+
from pdf2docx import Converter
|
| 696 |
+
|
| 697 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 698 |
+
docx_path = os.path.join(tempfile.gettempdir(), f"intermediate_{timestamp}.docx")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
|
| 700 |
status_text.markdown(
|
| 701 |
+
f'<div class="status-badge processing pulse">π Convirtiendo Estructura de PDF a Word...</div>',
|
| 702 |
unsafe_allow_html=True
|
| 703 |
)
|
| 704 |
+
progress_bar.progress(0.1, text="Analizando tablas, columnas e imágenes... (Esto tomará unos segundos)")
|
| 705 |
|
| 706 |
+
# Conversión estructural con pdf2docx
|
| 707 |
+
cv = Converter(input_path)
|
| 708 |
+
# convert soporta start y end, usamos todo
|
| 709 |
+
cv.convert(docx_path)
|
| 710 |
+
cv.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 711 |
|
| 712 |
+
status_text.markdown(
|
| 713 |
+
f'<div class="status-badge success">✅ Conversión estructural completa.</div>',
|
| 714 |
+
unsafe_allow_html=True
|
| 715 |
+
)
|
| 716 |
+
progress_bar.progress(0.3, text="Traduciendo estructura DOCX generada...")
|
| 717 |
+
|
| 718 |
+
# Traducir el DOCX usando la función experta
|
| 719 |
+
output_docx_path = translate_docx(
|
| 720 |
+
input_path=docx_path,
|
| 721 |
+
from_code=from_code,
|
| 722 |
+
to_code=to_code,
|
| 723 |
+
progress_bar=progress_bar,
|
| 724 |
+
status_text=status_text
|
| 725 |
+
)
|
| 726 |
+
|
| 727 |
+
# Limpiar el pdf convertido
|
| 728 |
+
schedule_file_deletion(docx_path)
|
| 729 |
+
|
| 730 |
+
logger.info(f"✅ PDF convertido y traducido (Salida DOCX): {output_docx_path}")
|
| 731 |
+
return output_docx_path
|
| 732 |
|
| 733 |
except MemoryError:
|
| 734 |
+
raise MemoryError("Proceso abortado por falta de memoria RAM al convertir PDF a DOCX.")
|
| 735 |
except Exception as e:
|
| 736 |
+
logger.error(f"Error en proxy PDF a DOCX: {e}", exc_info=True)
|
| 737 |
raise
|
| 738 |
|
| 739 |
|
|
|
|
| 1275 |
|
| 1276 |
# Nombre sugerido para la descarga
|
| 1277 |
original_stem = Path(uploaded_file.name).stem
|
| 1278 |
+
|
| 1279 |
+
# Debido a la conversión estructural universal, la salida final es siempre un archivo DOCX
|
| 1280 |
+
output_ext = ".docx"
|
| 1281 |
+
download_name = f"{original_stem}_traducido_{to_code}{output_ext}"
|
| 1282 |
|
| 1283 |
# ── Card de resultado ──────────────────────────────────────────────
|
| 1284 |
st.markdown(
|
|
|
|
| 1290 |
col_dl, col_info = st.columns([1, 2])
|
| 1291 |
|
| 1292 |
with col_dl:
|
| 1293 |
+
# Determinar el MIME type para la descarga (Siempre DOCX)
|
| 1294 |
+
mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
|
|
|
|
|
| 1295 |
st.download_button(
|
| 1296 |
+
label=f"⬇️ Descargar Documento (Word)",
|
| 1297 |
data=translated_bytes,
|
| 1298 |
file_name=download_name,
|
| 1299 |
mime=mime_type,
|
requirements.txt
CHANGED
|
@@ -27,8 +27,8 @@ sacremoses>=0.1.1
|
|
| 27 |
# para extracción de texto por coordenadas y dibujo sobre páginas
|
| 28 |
PyMuPDF>=1.24.0
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
|
| 32 |
|
| 33 |
# ── Procesamiento de Word (.docx) ──────────────────────────────────────────
|
| 34 |
# python-docx: Leer y escribir documentos Word preservando formato
|
|
|
|
| 27 |
# para extracción de texto por coordenadas y dibujo sobre páginas
|
| 28 |
PyMuPDF>=1.24.0
|
| 29 |
|
| 30 |
+
# pdf2docx: Conversión estructural de PDF a Word manteniendo tablas e imágenes
|
| 31 |
+
pdf2docx>=0.5.6
|
| 32 |
|
| 33 |
# ── Procesamiento de Word (.docx) ──────────────────────────────────────────
|
| 34 |
# python-docx: Leer y escribir documentos Word preservando formato
|