Spaces:

Adjoumani
/

traduction-document

Sleeping

App Files Files Community

Adjoumani committed on Aug 9, 2025

Commit

3fa52b5

verified ·

1 Parent(s): ebdb667

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -15

app.py CHANGED Viewed

@@ -2,6 +2,26 @@ import streamlit as st
 import fitz  # pymupdf pour la manipulation PDF
 from deep_translator import MyMemoryTranslator # GoogleTranslator
 import time
 # Configuration de base pour l'interface Streamlit
 st.set_page_config(
@@ -117,17 +137,17 @@ if doc_file and st.button("Lancer la traduction"):
         progress_bar = st.progress(0)
         start_time = time.time()
-        for page_num, page in enumerate(doc):
-            blocks = page.get_text("blocks", flags=textflags)
-            for block in blocks:
-                bbox = block[:4]
-                english_text = block[4]
                 # Traduction du texte
                 #translated_text = translator.translate(english_text)
                 # Découper le texte en segments de 500 caractères max
-                chunks = [english_text[i:i+500] for i in range(0, len(english_text), 500)]
                 # Remplacement du texte
                 #page.draw_rect(bbox, color=None, fill=blanc, oc=ocg_xref)
@@ -149,18 +169,29 @@ if doc_file and st.button("Lancer la traduction"):
                 #    translated_text,
                 #    oc=ocg_xref,
                 #)
                 translated_text = ""
                 for chunk in chunks:
-                    if chunk.strip():  # Ignorer les chunks vides
-                        try:
-                            translated_chunk = translator.translate(chunk)
-                            translated_text += translated_chunk + " "
-                        except Exception as e:
-                            st.warning(f"Erreur sur un segment : {str(e)}")
-                            translated_text += chunk + " "  # Garde le texte original en cas d'échec
-                # Appliquer la traduction au PDF
                 page.draw_rect(bbox, color=None, fill=blanc, oc=ocg_xref)
                 page.insert_htmlbox(bbox, translated_text.strip(), oc=ocg_xref)

 import fitz  # pymupdf pour la manipulation PDF
 from deep_translator import MyMemoryTranslator # GoogleTranslator
 import time
+import re
+def clean_text(text):
+    # Remplacer les séries de points par un espace unique
+    text = re.sub(r'\.{3,}', ' ', text)
+    # Supprimer les espaces multiples
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+def split_text(text, max_length=500):
+    chunks = []
+    while len(text) > max_length:
+        # Trouver le dernier espace avant max_length
+        split_pos = text.rfind(' ', 0, max_length)
+        if split_pos == -1:  # Aucun espace trouvé (mot très long)
+            split_pos = max_length
+        chunks.append(text[:split_pos])
+        text = text[split_pos:].lstrip()
+    chunks.append(text)
+    return chunks
 # Configuration de base pour l'interface Streamlit
 st.set_page_config(
         progress_bar = st.progress(0)
         start_time = time.time()
+        #for page_num, page in enumerate(doc):
+        #    blocks = page.get_text("blocks", flags=textflags)
+        #    for block in blocks:
+        #        bbox = block[:4]
+        #        english_text = block[4]
                 # Traduction du texte
                 #translated_text = translator.translate(english_text)
                 # Découper le texte en segments de 500 caractères max
                 # Remplacement du texte
                 #page.draw_rect(bbox, color=None, fill=blanc, oc=ocg_xref)
                 #    translated_text,
                 #    oc=ocg_xref,
                 #)
+        for page_num, page in enumerate(doc):
+            blocks = page.get_text("blocks", flags=textflags)
+            for block in blocks:
+                bbox = block[:4]
+                raw_text = block[4]
+                # Nettoyage + découpage
+                clean = clean_text(raw_text)
+                chunks = split_text(clean)
+                # Traduction cumulative
                 translated_text = ""
                 for chunk in chunks:
+                    try:
+                        translated_chunk = translator.transrate(chunk)
+                        translated_text += translated_chunk + " "
+                        time.sleep(0.3)  # Anti-rate-limiting
+                    except Exception as e:
+                        st.warning(f"Segment non traduit : {str(e)}")
+                        translated_text += chunk + " "  # Fallback
+                # Application au PDF
                 page.draw_rect(bbox, color=None, fill=blanc, oc=ocg_xref)
                 page.insert_htmlbox(bbox, translated_text.strip(), oc=ocg_xref)