"""Extract text from a PDF, clean it, chunk it, and write it to a new PDF."""

import re
from xml.sax.saxutils import escape

import pdfplumber
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, SimpleDocTemplate

# One or more underscores, each optionally followed by whitespace.
_UNDERSCORE_RUN_RE = re.compile(r'(_\s*)+')
# A single whitespace character sitting between two isolated ASCII letters.
_SPACED_LETTERS_RE = re.compile(r'(?<=\b[a-zA-Z])\s(?=[a-zA-Z]\b)')
# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RE = re.compile(r'\s+')


# --- PDF extraction ---
def extract_text_pdf(file_path):
    """Return the text of each page of the PDF at *file_path*.

    Args:
        file_path: Path (or file object) handed to ``pdfplumber.open``.

    Returns:
        A list with one string per page, in page order. A page for which
        ``extract_text()`` returns a falsy value contributes ``""``.
    """
    with pdfplumber.open(file_path) as pdf:
        return [page.extract_text() or "" for page in pdf.pages]


# --- Cleaning of the extracted text ---
def clean_extracted_text(text):
    """Remove extraction artifacts from *text* and normalize its whitespace.

    Steps, in order:
      1. drop runs of underscores together with the whitespace after them;
      2. join isolated ASCII letters separated by a single whitespace
         character (``"a w a d e m e"`` becomes ``"awademe"``);
      3. collapse every whitespace run to one space;
      4. remove the space before ``.`` and ``,``.

    Note: step 3 also collapses newlines, so the result is a single line.
    Split into paragraphs *before* cleaning if line breaks matter.

    Args:
        text: Raw extracted text; ``None`` or ``""`` is accepted.

    Returns:
        The cleaned, stripped string (``""`` for empty input).
    """
    if not text:
        return ""

    text = _UNDERSCORE_RUN_RE.sub('', text)
    text = _SPACED_LETTERS_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text)
    text = text.replace(" .", ".").replace(" ,", ",")
    return text.strip()


# --- Splitting into paragraphs for translation ---
def _split_long_paragraph(paragraph, max_len):
    """Split one over-long paragraph into chunks at ``". "`` boundaries."""
    pieces = paragraph.split(". ")
    # split() consumed the ". " separators: give the period back to every
    # piece except the last one, which keeps whatever ending it already had.
    sentences = [piece + "." for piece in pieces[:-1]]
    sentences.append(pieces[-1])

    chunks = []
    current = ""
    for sentence in sentences:
        # +1 accounts for the space re-inserted between two sentences.
        if current and len(current) + 1 + len(sentence) > max_len:
            chunks.append(current)
            current = ""
        if current:
            current = current + " " + sentence
        else:
            current = sentence.lstrip()
    if current:
        chunks.append(current)
    return chunks


def split_paragraphs(text, max_len=350):
    """Split *text* into chunks of at most *max_len* characters.

    Each non-blank line of *text* is a paragraph. A paragraph that fits in
    *max_len* becomes one chunk; a longer one is cut at sentence boundaries
    (``". "``) and consecutive sentences are packed together while they fit.
    The original punctuation is preserved: no period is added or duplicated.
    A single sentence longer than *max_len* is emitted unsplit as its own
    chunk.

    Args:
        text: Text whose paragraphs are separated by ``"\\n"``; ``None`` or
            ``""`` yields an empty list.
        max_len: Maximum chunk length in characters.

    Returns:
        A list of non-empty, stripped chunks in document order.
    """
    if not text:
        return []

    chunks = []
    for line in text.split("\n"):
        paragraph = line.strip()
        if not paragraph:
            continue
        if len(paragraph) <= max_len:
            chunks.append(paragraph)
        else:
            chunks.extend(_split_long_paragraph(paragraph, max_len))
    return chunks


# --- Final PDF generation ---
def create_clean_pdf(text, output_path, *, allow_markup=False):
    """Write *text* to a PDF at *output_path*, one paragraph per line.

    ``Paragraph`` parses its text as XML-like markup, so by default ``&``,
    ``<`` and ``>`` are escaped to make plain extracted text render
    literally instead of being misread as tags or entities.

    Args:
        text: Text to render; lines are separated by ``"\\n"``.
        output_path: Destination passed to ``SimpleDocTemplate``.
        allow_markup: Pass ``True`` to hand each line to ``Paragraph``
            unescaped, for callers that deliberately embed markup.
    """
    doc = SimpleDocTemplate(output_path)
    normal_style = getSampleStyleSheet()["Normal"]

    story = []
    for line in text.split("\n"):
        content = line if allow_markup else escape(line)
        story.append(Paragraph(content, normal_style))

    doc.build(story)