Spaces:
Sleeping
Sleeping
| import pdfplumber | |
| from reportlab.platypus import SimpleDocTemplate, Paragraph | |
| from reportlab.lib.styles import getSampleStyleSheet | |
| import re | |
| # --- Extraction PDF --- | |
def extract_text_pdf(file_path):
    """Return the extracted text of every page of a PDF.

    Args:
        file_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one string per entry of ``pdf.pages``, in iteration
        order. A page whose ``extract_text()`` result is falsy (e.g. ``None``)
        is represented by an empty string so the list never contains ``None``.
    """
    with pdfplumber.open(file_path) as document:
        return [page.extract_text() or "" for page in document.pages]
| # --- Nettoyage du texte extrait --- | |
def clean_extracted_text(text):
    """Normalize a raw string of extracted text into a single clean line.

    Args:
        text: The raw text; any falsy value (``None``, ``""``) is accepted.

    Returns:
        The cleaned text with leading/trailing whitespace removed, or ``""``
        when ``text`` is falsy. All whitespace runs, including newlines, are
        collapsed to one space.
    """
    if not text:
        return ""

    # Applied in order; each entry is (regex, replacement).
    substitutions = (
        # Drop runs of underscores together with the whitespace following them.
        (r'(_\s*)+', ''),
        # Remove the single whitespace character that separates two isolated
        # ASCII letters: "a w a d e m e" -> "awademe".
        (r'(?<=\b[a-zA-Z])\s(?=[a-zA-Z]\b)', ''),
        # Collapse every whitespace run into one space.
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Pull periods and commas back against the preceding word.
    for spaced, tight in ((" .", "."), (" ,", ",")):
        cleaned = cleaned.replace(spaced, tight)

    return cleaned.strip()
| # --- Découpage en paragraphes pour traduction --- | |
def split_paragraphs(text, max_len=350):
    """Split text into chunks of at most ``max_len`` characters.

    The text is first split on newlines; blank lines are dropped. A line that
    fits within ``max_len`` becomes one chunk. A longer line is split on
    ". " and its sentences are greedily packed into chunks that stay within
    ``max_len``.

    Args:
        text: The text to split.
        max_len: Maximum chunk length in characters.

    Returns:
        A list of non-empty, stripped chunks in their original order. A single
        sentence that is itself longer than ``max_len`` is emitted unsplit, so
        such a chunk may exceed the limit.
    """
    chunks = []
    for raw_line in text.split("\n"):
        paragraph = raw_line.strip()
        if not paragraph:
            continue
        if len(paragraph) <= max_len:
            chunks.append(paragraph)
            continue

        # split(". ") consumes the period of every sentence except the last,
        # so restore it only there; the last sentence keeps its own ending
        # (avoids producing ".." or inventing a trailing period).
        parts = paragraph.split(". ")
        sentences = [part + "." for part in parts[:-1]] + [parts[-1]]

        current = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            candidate = current + " " + sentence if current else sentence
            if len(candidate) <= max_len:
                current = candidate
                continue
            # Flush only when something was accumulated, so an oversized
            # first sentence does not emit an empty chunk.
            if current:
                chunks.append(current)
            current = sentence
        if current:
            chunks.append(current)
    return chunks
| # --- Génération PDF final --- | |
def create_clean_pdf(text, output_path):
    """Write plain text to a PDF file, one paragraph per input line.

    Args:
        text: Plain text to render; it is split on newlines and every
            resulting line (including empty ones) becomes a paragraph in the
            sample stylesheet's "Normal" style.
        output_path: Destination passed to ``SimpleDocTemplate``.
    """
    # Imported locally so this function does not depend on a new file-level
    # import being present.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_path)
    normal_style = getSampleStyleSheet()["Normal"]
    # Paragraph parses its text as XML-like markup, so literal "<", ">" and
    # "&" in the text must be escaped to be rendered as-is instead of being
    # treated as (possibly malformed) tags or entities.
    story = [Paragraph(escape(line), normal_style) for line in text.split("\n")]
    doc.build(story)