fon-pdf-translator / pdf_utils.py
Ronaldodev's picture
Update pdf_utils.py
ecebb8d verified
import pdfplumber
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
import re
# --- Extraction PDF ---
def extract_text_pdf(file_path):
    """Return the extracted text of a PDF, one string per page.

    Args:
        file_path: Location of the PDF, handed as-is to ``pdfplumber.open``.

    Returns:
        A list with one entry per page, in page order. A page whose
        ``extract_text()`` result is falsy contributes an empty string.
    """
    with pdfplumber.open(file_path) as document:
        return [sheet.extract_text() or "" for sheet in document.pages]
# --- Nettoyage du texte extrait ---
def clean_extracted_text(text):
    """Normalize raw text extracted from a PDF.

    The following clean-ups are applied, in order:

    1. Runs of underscores (and the whitespace that follows them) are removed.
    2. Single letters separated by one whitespace character are glued back
       together, e.g. ``"a w a d e m e"`` becomes ``"awademe"``.
    3. Every run of whitespace (including newlines) collapses to one space.
    4. The space before a period or a comma is removed.

    Args:
        text: The raw text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text with leading/trailing whitespace stripped, or ``""``
        when ``text`` is falsy.
    """
    if not text:
        return ""

    # Remove underscores and the artificial spacing that follows them.
    text = re.sub(r'(_\s*)+', '', text)

    # Remove the whitespace between isolated letters. ``[^\W\d_]`` matches any
    # Unicode letter (word character that is neither a digit nor "_"), so
    # accented and non-ASCII letters are re-joined too, not only a-z/A-Z.
    text = re.sub(r'(?<=\b[^\W\d_])\s(?=[^\W\d_]\b)', '', text)

    # General whitespace and punctuation tidy-up.
    text = re.sub(r'\s+', ' ', text)
    text = text.replace(" .", ".").replace(" ,", ",")
    return text.strip()
# --- Découpage en paragraphes pour traduction ---
def split_paragraphs(text, max_len=350):
    """Split text into chunks suitable for translation.

    The text is split on newlines; blank lines are dropped and each remaining
    line is stripped. A line of at most ``max_len`` characters becomes one
    chunk. A longer line is cut at ``". "`` sentence boundaries and its
    sentences are packed greedily into chunks of at most ``max_len``
    characters.

    The original punctuation is preserved: no period is added to, or doubled
    on, the last sentence of a line, and no empty chunk is ever produced.
    A single sentence that is itself longer than ``max_len`` is kept whole
    (it is not cut mid-sentence), so such a chunk may exceed ``max_len``.

    Args:
        text: The text to split; ``None`` or ``""`` yields an empty list.
        max_len: Target maximum length of a chunk, in characters.

    Returns:
        The list of non-empty chunks, in reading order.
    """
    if not text:
        return []

    chunks = []
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue
        if len(line) <= max_len:
            chunks.append(line)
            continue

        # Long line: cut at sentence boundaries. ``split`` removes the
        # ". " separator, so the period is put back on every piece except
        # the last one, which keeps whatever ending it originally had.
        pieces = line.split(". ")
        sentences = [piece + "." for piece in pieces[:-1]]
        sentences.append(pieces[-1])

        current = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            candidate = f"{current} {sentence}" if current else sentence
            # Only flush when there is something to flush; an oversized
            # first sentence must not produce an empty chunk.
            if current and len(candidate) > max_len:
                chunks.append(current)
                current = sentence
            else:
                current = candidate
        if current:
            chunks.append(current)
    return chunks
# --- Génération PDF final ---
def create_clean_pdf(text, output_path):
    """Write plain text to a PDF, one ReportLab paragraph per input line.

    Args:
        text: The plain text to render; it is split on ``"\\n"`` and every
            resulting line becomes a ``Paragraph`` in the "Normal" style.
        output_path: Destination passed to ``SimpleDocTemplate``.

    Returns:
        None. The PDF is produced by ``doc.build``.
    """
    # Function-local import so this block does not depend on a change to the
    # file's import section.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_path)
    normal_style = getSampleStyleSheet()["Normal"]

    # Paragraph interprets its text as XML-like markup, so "<", ">" and "&"
    # in plain text must be escaped or they can break parsing or be
    # swallowed as tags.
    story = [Paragraph(escape(line), normal_style) for line in text.split("\n")]
    doc.build(story)