File size: 1,851 Bytes
8ffdc18
 
 
ecebb8d
8ffdc18
ecebb8d
8ffdc18
 
 
 
 
 
 
ecebb8d
 
 
 
8ffdc18
ecebb8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ffdc18
ecebb8d
8ffdc18
 
ecebb8d
 
8ffdc18
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import pdfplumber
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
import re

# --- Extraction PDF ---
def extract_text_pdf(file_path):
    """Return the extracted text of each page of a PDF.

    Args:
        file_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one string per page, in the order ``pdf.pages`` yields
        them. A page whose ``extract_text()`` result is falsy contributes
        an empty string.
    """
    with pdfplumber.open(file_path) as document:
        # `or ""` keeps the list homogeneous when a page yields no text.
        return [page.extract_text() or "" for page in document.pages]

# --- Nettoyage du texte extrait ---
def clean_extracted_text(text):
    """Normalize raw text extracted from a PDF.

    The clean-up steps run in this order:

    1. Remove runs of underscores together with the whitespace that
       follows them.
    2. Remove a single whitespace character sitting between two isolated
       ASCII letters, so ``"a w a d e m e"`` becomes ``"awademe"``.
    3. Collapse every whitespace run (newlines included) into one space.
    4. Reattach periods and commas that are preceded by a space.

    Args:
        text: The raw text; any falsy value is treated as empty.

    Returns:
        The cleaned text with leading/trailing whitespace stripped, or an
        empty string for falsy input.
    """
    if not text:
        return ""

    # Order matters: whitespace is collapsed only after the other
    # substitutions have removed what they need to.
    substitutions = (
        (r'(_\s*)+', ''),
        (r'(?<=\b[a-zA-Z])\s(?=[a-zA-Z]\b)', ''),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Punctuation that got detached from the preceding word.
    for detached, attached in ((" .", "."), (" ,", ",")):
        cleaned = cleaned.replace(detached, attached)

    return cleaned.strip()

# --- Découpage en paragraphes pour traduction ---
def split_paragraphs(text, max_len=350):
    """Split text into chunks of at most ``max_len`` characters.

    The text is first split on newlines; blank lines are dropped. A line
    that fits within ``max_len`` becomes one chunk as-is. A longer line is
    split on ``". "`` and its sentences are greedily regrouped into chunks
    that stay within ``max_len``.

    The original punctuation is preserved: the period consumed by the
    split is restored on every sentence except the last one, which keeps
    whatever ending it had. Empty chunks are never emitted.

    Args:
        text: The text to split; a falsy value yields an empty list.
        max_len: Maximum chunk length in characters. A single sentence
            longer than this is not broken up and is emitted as one
            oversized chunk.

    Returns:
        A list of non-empty, stripped chunks in document order.
    """
    if not text:
        return []

    chunks = []
    for raw_line in text.split("\n"):
        paragraph = raw_line.strip()
        if not paragraph:
            continue

        if len(paragraph) <= max_len:
            chunks.append(paragraph)
            continue

        pieces = paragraph.split(". ")
        last_index = len(pieces) - 1
        current = ""
        for index, piece in enumerate(pieces):
            sentence = piece.strip()
            # Restore the period removed by split(); the final piece was
            # not followed by ". ", so it keeps its own ending untouched.
            if index != last_index:
                sentence += "."
            if not sentence:
                continue

            candidate = current + " " + sentence if current else sentence
            if len(candidate) <= max_len:
                current = candidate
            else:
                # Flush only when something was accumulated, so an
                # oversized first sentence does not produce an empty chunk.
                if current:
                    chunks.append(current)
                current = sentence

        if current:
            chunks.append(current)

    return chunks

# --- Génération PDF final ---
def create_clean_pdf(text, output_path):
    """Write plain text to a PDF, one paragraph per input line.

    Args:
        text: The text to render. It is split on newlines and each line
            becomes one ``Paragraph`` in the "Normal" sample style. A
            falsy value is treated as empty text.
        output_path: Destination passed to ``SimpleDocTemplate``.

    Note:
        ``Paragraph`` interprets its text as XML-like markup, so each line
        is escaped first. Characters such as ``&``, ``<`` and ``>`` in
        extracted text are rendered literally instead of breaking the
        paragraph parser.
    """
    # Local import keeps the block self-contained without touching the
    # module-level import list.
    from xml.sax.saxutils import escape

    body_style = getSampleStyleSheet()["Normal"]
    story = [
        Paragraph(escape(line), body_style)
        for line in (text or "").split("\n")
    ]
    SimpleDocTemplate(output_path).build(story)