extractor-pdf-text

Sleeping

App Files Files Community

extractor-pdf-text / app.py

aysalas

Update app.py

4cb29ef verified about 2 months ago

raw

history blame contribute delete

2.31 kB

	import gradio as gr
	from PyPDF2 import PdfReader
	import tempfile
	import re
	import unicodedata

	def limpiar_texto(texto):
	    """Clean text extracted from a PDF so it reads well as plain text.

	    Steps, in order:

	    1. Unicode-normalize with NFKC, which expands compatibility characters
	       (ligatures, non-breaking spaces, ...) while keeping accented letters
	       as single composed characters.
	    2. Map common typographic symbols (bullets, dashes, curly quotes) to
	       ASCII equivalents and drop known extraction artifacts.
	    3. Remove every character outside tab/newline/CR, printable ASCII and
	       the Spanish accented letters. A removed character is first
	       decomposed so that its base letter survives when it has one.
	    4. Collapse runs of 3+ newlines to a blank line and runs of spaces/tabs
	       to a single space, then strip leading/trailing whitespace.

	    Args:
	        texto: The raw text to clean.

	    Returns:
	        The cleaned text.
	    """
	    # NFKC rather than NFKD: NFKD splits "ó" into "o" + a combining accent,
	    # and the filter below would then discard the accent, silently turning
	    # "canción" into "cancion".
	    texto = unicodedata.normalize("NFKC", texto)

	    # Symbol replacements must run BEFORE the character filter; none of
	    # these symbols is in the allowed set, so filtering first would delete
	    # them and leave nothing to replace.
	    reemplazos = {
	        "\ufffd": "",   # Unicode replacement character
	        "\u2022": "-",  # bullet
	        "\u25aa": "-",  # small black square
	        "\u2013": "-",  # en dash
	        "\u2014": "-",  # em dash
	        "\u201c": '"',  # left double quotation mark
	        "\u201d": '"',  # right double quotation mark
	        "\u2018": "'",  # left single quotation mark
	        "\u2019": "'",  # right single quotation mark
	        "\u00c2": "",   # stray "Â" (typical mojibake artifact)
	    }
	    texto = texto.translate(str.maketrans(reemplazos))

	    # Anything outside: tab, LF, CR, printable ASCII, Spanish accented letters.
	    no_permitido = re.compile(r"[^\x09\x0A\x0D\x20-\x7EáéíóúÁÉÍÓÚñÑüÜ]")

	    def _degradar(coincidencia):
	        # Keep the base letter of a disallowed accented character
	        # (e.g. "à" -> "a"); characters with no allowed base are dropped.
	        descompuesto = unicodedata.normalize("NFKD", coincidencia.group())
	        return no_permitido.sub("", descompuesto)

	    texto = no_permitido.sub(_degradar, texto)

	    # Collapse excessive blank lines and horizontal whitespace.
	    texto = re.sub(r"\n{3,}", "\n\n", texto)
	    texto = re.sub(r"[ \t]{2,}", " ", texto)

	    return texto.strip()

	def extraer_texto_pdf(archivo_pdf):
	    """Extract and clean the text of every page of an uploaded PDF.

	    Args:
	        archivo_pdf: The uploaded file, given either as a filesystem path
	            string or as an object whose ``name`` attribute holds the path.
	            ``None`` means nothing was uploaded.

	    Returns:
	        The text of all pages that yielded any, each followed by a newline,
	        passed through ``limpiar_texto``. An empty string when
	        ``archivo_pdf`` is ``None``.
	    """
	    if archivo_pdf is None:
	        return ""

	    # Accept a plain path as well as a file wrapper exposing ``.name``;
	    # which one arrives depends on how the upload component is configured.
	    ruta = archivo_pdf if isinstance(archivo_pdf, str) else archivo_pdf.name
	    reader = PdfReader(ruta)

	    # Pages with no extractable text (None or "") are skipped.
	    contenidos = (pagina.extract_text() for pagina in reader.pages)
	    texto = "".join(f"{contenido}\n" for contenido in contenidos if contenido)

	    # Clean up the extracted text before returning it.
	    return limpiar_texto(texto)


	def guardar_texto_en_txt(texto):
	    """Write *texto* to a new temporary UTF-8 ``.txt`` file.

	    The file is created with ``delete=False`` so it still exists after this
	    function returns; removing it is left to the caller.

	    Args:
	        texto: The text to save. ``None`` is written as an empty file.

	    Returns:
	        The path of the temporary file.
	    """
	    # Write through the temp-file handle itself and let ``with`` close it.
	    # Opening the file a second time by name leaves the first handle open
	    # (a descriptor leak) and fails on platforms that forbid reopening an
	    # open temporary file.
	    with tempfile.NamedTemporaryFile(
	        mode="w", delete=False, suffix=".txt", encoding="utf-8"
	    ) as archivo_temp:
	        archivo_temp.write(texto or "")
	    return archivo_temp.name

	# Gradio interface: upload a PDF, show its extracted text, and offer the
	# text as a downloadable .txt file.
	with gr.Blocks() as app:
	    # Page heading and short instructions.
	    gr.Markdown("# 📄 Extractor de texto desde PDF - Alex")
	    gr.Markdown("Sube un PDF, extrae su texto y descárgalo en formato .txt")

	    # Extraction: PDF upload -> button -> text box.
	    entrada_pdf = gr.File(label="📂 Selecciona tu archivo PDF", file_types=[".pdf"])
	    boton_extraer = gr.Button("🔍 Extraer texto")
	    salida_texto = gr.Textbox(label="Texto extraído", lines=15)

	    # Download: button -> generated .txt file.
	    boton_descargar = gr.Button("💾 Descargar texto (.txt)")
	    salida_archivo = gr.File(label="Descargar archivo")

	    # Event wiring: each button feeds one component into a handler and
	    # shows the handler's result in another component.
	    boton_extraer.click(
	        extraer_texto_pdf, inputs=entrada_pdf, outputs=salida_texto
	    )
	    boton_descargar.click(
	        guardar_texto_en_txt, inputs=salida_texto, outputs=salida_archivo
	    )

	# Start the web server for the interface.
	app.launch()