Spaces:

Ronaldodev
/

fon-pdf-translator

Sleeping

fon-pdf-translator / pdf_utils.py

Update pdf_utils.py

ecebb8d verified 27 days ago

1.85 kB

	import pdfplumber
	from reportlab.platypus import SimpleDocTemplate, Paragraph
	from reportlab.lib.styles import getSampleStyleSheet
	import re

	# --- PDF extraction ---
	def extract_text_pdf(file_path):
	    """Return the text of every page of the PDF at *file_path*.

	    The result is a list with one string per page, in page order. A page for
	    which ``extract_text()`` returns a falsy value (e.g. ``None``) is
	    represented by an empty string, so the list length always equals the
	    number of pages.
	    """
	    with pdfplumber.open(file_path) as document:
	        return [sheet.extract_text() or "" for sheet in document.pages]

	# --- Cleaning of the extracted text ---
	def clean_extracted_text(text):
	    """Normalize text extracted from a PDF and return it as a single line.

	    Falsy input (``None`` or ``""``) yields ``""``. Otherwise the steps are
	    applied in this order:

	    1. drop runs of underscores together with the whitespace following them;
	    2. remove the single whitespace between two isolated ASCII letters
	       (``a w a d e m e`` becomes ``awademe``);
	    3. collapse every whitespace run (newlines included) to one space;
	    4. remove the space before ``.`` and ``,``;
	    5. strip leading and trailing whitespace.
	    """
	    if not text:
	        return ""

	    # Order matters: each pattern runs on the output of the previous one.
	    regex_steps = (
	        (r'(_\s*)+', ''),
	        (r'(?<=\b[a-zA-Z])\s(?=[a-zA-Z]\b)', ''),
	        (r'\s+', ' '),
	    )
	    cleaned = text
	    for pattern, replacement in regex_steps:
	        cleaned = re.sub(pattern, replacement, cleaned)

	    # Re-attach punctuation that was left separated from the preceding word.
	    for spaced, tight in ((" .", "."), (" ,", ",")):
	        cleaned = cleaned.replace(spaced, tight)

	    return cleaned.strip()

	# --- Splitting into paragraphs for translation ---
	def split_paragraphs(text, max_len=350):
	    """Split *text* into chunks of at most *max_len* characters where possible.

	    The text is split on newlines; blank lines are dropped and each remaining
	    line is stripped. A line that fits in *max_len* becomes one chunk. A longer
	    line is cut after each ``". "`` and the resulting sentences are packed
	    greedily, joined by a single space, into chunks no longer than *max_len*.

	    Args:
	        text: The text to split.
	        max_len: Maximum chunk length in characters.

	    Returns:
	        A list of non-empty chunk strings in their original order. A single
	        sentence that is itself longer than *max_len* is returned unsplit as
	        its own chunk, so that no content is lost.
	    """
	    chunks = []

	    for raw_line in text.split("\n"):
	        paragraph = raw_line.strip()
	        if not paragraph:
	            continue

	        if len(paragraph) <= max_len:
	            chunks.append(paragraph)
	            continue

	        # Splitting on ". " removes the period from every piece except the
	        # last one; put it back only where it was removed so the final
	        # sentence does not end up with a doubled or invented period.
	        pieces = paragraph.split(". ")
	        sentences = [piece + "." for piece in pieces[:-1]]
	        sentences.append(pieces[-1])

	        current = ""
	        for sentence in sentences:
	            if not sentence:
	                continue
	            candidate = f"{current} {sentence}" if current else sentence
	            if len(candidate) <= max_len:
	                current = candidate
	                continue
	            # Flush only real content: an oversized first sentence must not
	            # produce an empty chunk.
	            if current:
	                chunks.append(current)
	            current = sentence
	        if current:
	            chunks.append(current)

	    return chunks

	# --- Final PDF generation ---
	def create_clean_pdf(text, output_path):
	    """Write *text* to a PDF file at *output_path*, one paragraph per line.

	    *text* is split on newlines and each line becomes a ReportLab
	    ``Paragraph`` in the sample style sheet's ``"Normal"`` style.

	    Args:
	        text: Plain text to render; newlines separate paragraphs.
	        output_path: Destination passed to ``SimpleDocTemplate``.
	    """
	    # Function-scope import so the block stays self-contained.
	    from xml.sax.saxutils import escape

	    body_style = getSampleStyleSheet()["Normal"]

	    # Paragraph parses its text as XML-like markup (see the ReportLab user
	    # guide), so literal "<", ">" and "&" coming from extracted or translated
	    # text must be escaped to be rendered as-is instead of being treated as
	    # tags or breaking the build.
	    story = [
	        Paragraph(escape(line), body_style)
	        for line in text.split("\n")
	    ]

	    SimpleDocTemplate(output_path).build(story)