Spaces:
Sleeping
Sleeping
File size: 1,851 Bytes
8ffdc18 ecebb8d 8ffdc18 ecebb8d 8ffdc18 ecebb8d 8ffdc18 ecebb8d 8ffdc18 ecebb8d 8ffdc18 ecebb8d 8ffdc18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | import pdfplumber
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
import re
# --- Extraction PDF ---
def extract_text_pdf(file_path):
    """Return the extracted text of every page of a PDF, in page order.

    Args:
        file_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one string per page. A page whose ``extract_text()``
        result is falsy (e.g. ``None``) contributes an empty string, so the
        list always has as many entries as ``pdf.pages``.
    """
    with pdfplumber.open(file_path) as document:
        return [page.extract_text() or "" for page in document.pages]
# --- Nettoyage du texte extrait ---
def clean_extracted_text(text):
    """Normalize raw text extracted from a PDF.

    Steps, applied in order:
      1. Drop runs of underscores together with the whitespace following them.
      2. Remove a single whitespace character sitting between two isolated
         ASCII letters, so "a w a d e m e" becomes "awademe".
      3. Collapse every whitespace run (newlines included) to one space.
      4. Remove the space before a period or a comma.
      5. Strip leading and trailing whitespace.

    Args:
        text: The extracted text; may be empty or ``None``.

    Returns:
        The cleaned text, or ``""`` when ``text`` is falsy.
    """
    if text:
        # Order matters: underscores must go before the isolated-letter pass,
        # and whitespace is collapsed only after both.
        substitutions = (
            (r'(_\s*)+', ''),
            (r'(?<=\b[a-zA-Z])\s(?=[a-zA-Z]\b)', ''),
            (r'\s+', ' '),
        )
        cleaned = text
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        # Tighten punctuation left floating after the whitespace collapse.
        for spaced, tight in ((" .", "."), (" ,", ",")):
            cleaned = cleaned.replace(spaced, tight)
        return cleaned.strip()
    return ""
# --- Découpage en paragraphes pour traduction ---
def split_paragraphs(text, max_len=350):
    """Split text into chunks of at most ``max_len`` characters for translation.

    The text is first split on newlines; blank lines are skipped and each
    remaining line is stripped. A line that fits within ``max_len`` becomes
    one chunk. A longer line is split on ". " and its sentences are greedily
    packed back together, so each chunk holds as many whole sentences as fit.

    Args:
        text: The text to split; may be empty or ``None``.
        max_len: Maximum chunk length in characters.

    Returns:
        A list of non-empty, stripped chunks in original order. A single
        sentence longer than ``max_len`` is kept whole as its own chunk
        rather than being cut mid-sentence.
    """
    if not text:
        return []

    chunks = []
    for raw_line in text.split("\n"):
        paragraph = raw_line.strip()
        if not paragraph:
            continue
        if len(paragraph) <= max_len:
            chunks.append(paragraph)
            continue

        # Paragraph is too long: repack it sentence by sentence.
        sentences = paragraph.split(". ")
        last_index = len(sentences) - 1
        current = ""
        for index, sentence in enumerate(sentences):
            # split(". ") consumed the period of every sentence except the
            # last one, so restore it only there (avoids a trailing "..").
            piece = sentence if index == last_index else sentence + "."
            candidate = f"{current} {piece}" if current else piece
            # An empty accumulator always accepts the piece, even when the
            # piece alone is over the limit; this avoids emitting "" chunks.
            if not current or len(candidate) <= max_len:
                current = candidate
            else:
                chunks.append(current.strip())
                current = piece
        tail = current.strip()
        if tail:
            chunks.append(tail)
    return chunks
# --- Génération PDF final ---
def create_clean_pdf(text, output_path):
    """Write plain text to a PDF file, one paragraph per input line.

    Args:
        text: The text to render; it is split on newlines and each line
            becomes one ReportLab ``Paragraph`` in the "Normal" sample style.
        output_path: Destination passed to ``SimpleDocTemplate``.

    Returns:
        None. The PDF is produced as a side effect of ``doc.build``.
    """
    # Local import keeps this block self-contained.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_path)
    normal_style = getSampleStyleSheet()["Normal"]
    # Paragraph parses its text as XML-like markup, so literal "<", ">" and
    # "&" in the input must be escaped or they can break the build.
    story = [
        Paragraph(escape(line), normal_style)
        for line in text.split("\n")
    ]
    doc.build(story)
|