pdftomd_mule / app.py
guifav's picture
Atualização do app e README com melhorias de desempenho e novas funcionalidades
4a3021f
import gradio as gr
import fitz # PyMuPDF
import re
import os
import tempfile
from PIL import Image
import io
import requests
import zipfile
import shutil
from functools import lru_cache
import concurrent.futures
# Cache OCR results so the same image path is not sent to the API twice.
# NOTE(review): the cache key is the path, not the file contents, and error
# strings are cached exactly like successful results.
@lru_cache(maxsize=100)
def ocr_space_api(image_path):
    """Send an image file to the OCR.space API and return the parsed text.

    The API key is read from the ``OCR_SPACE_API_KEY`` environment variable;
    when it is not set, the placeholder value is used as before.

    Args:
        image_path: Path of the image file to upload.

    Returns:
        The text of the first parsed result (an empty string when the API
        reports no text), or a message starting with ``"Erro"`` when the
        file cannot be opened, the request fails, or the response has an
        unexpected shape. This function reports failures through its return
        value rather than by raising.
    """
    api_key = os.environ.get('OCR_SPACE_API_KEY', 'YOUR_OCR_SPACE_API_KEY')
    payload = {
        'isOverlayRequired': False,
        'apikey': api_key,
        'language': 'eng',
    }

    # Open the file on its own so local file problems are reported before
    # any network work is attempted.
    try:
        image_file = open(image_path, 'rb')
    except (OSError, TypeError, ValueError) as e:
        return f"Erro inesperado: {str(e)}"

    try:
        with image_file:
            response = requests.post(
                'https://api.ocr.space/parse/image',
                files={image_path: image_file},
                data=payload,
                timeout=30,  # 30-second timeout
            )
        response.raise_for_status()
        result = response.json()

        # A missing, null or empty "ParsedResults" all count as an
        # unexpected response instead of tripping over len(None).
        parsed_results = result.get('ParsedResults') if isinstance(result, dict) else None
        if parsed_results:
            # "ParsedText" may be present but null; callers expect a string.
            return parsed_results[0].get('ParsedText') or ''
        return "Erro: Formato de resposta inesperado da API OCR"
    except requests.RequestException as e:
        return f"Erro na requisição HTTP: {str(e)}"
    except Exception as e:
        return f"Erro inesperado: {str(e)}"
def extract_images_and_text(pdf_document, page_num, temp_dir):
    """Save every image of one PDF page as PNG and OCR each saved file.

    Image extraction and saving run serially in the calling thread because
    PyMuPDF documents are not safe to use from several threads at once.
    Only the OCR requests, which are network-bound and never touch the
    document, are run in a thread pool.

    Args:
        pdf_document: Opened PyMuPDF document.
        page_num: Zero-based index of the page to process.
        temp_dir: Directory in which the PNG files are written.

    Returns:
        A list of ``(image_filename, ocr_text)`` tuples in the order the
        images are listed on the page. Images that fail to process are
        reported on stdout and omitted.
    """
    page = pdf_document[page_num]
    image_list = page.get_images(full=True)

    def save_image(img):
        """Extract one image and write it as PNG; return (xref, name, path) or None."""
        xref = img[0]
        try:
            base_image = pdf_document.extract_image(xref)
            image = Image.open(io.BytesIO(base_image["image"]))
            # PNG cannot store CMYK data; convert so the image is not lost.
            if image.mode == "CMYK":
                image = image.convert("RGB")
            image_filename = f"image_page{page_num+1}_{xref}.png"
            image_path = os.path.join(temp_dir, image_filename)
            image.save(image_path)
            return (xref, image_filename, image_path)
        except Exception as e:
            print(f"Erro ao processar imagem {xref} na página {page_num}: {str(e)}")
            return None

    def run_ocr(saved):
        """OCR one saved image; return (name, text) or None on failure."""
        xref, image_filename, image_path = saved
        try:
            return (image_filename, ocr_space_api(image_path))
        except Exception as e:
            print(f"Erro ao processar imagem {xref} na página {page_num}: {str(e)}")
            return None

    saved_images = [saved for saved in map(save_image, image_list) if saved is not None]
    if not saved_images:
        return []

    # executor.map yields results in input order, so page order is kept.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        ocr_results = list(executor.map(run_ocr, saved_images))

    return [entry for entry in ocr_results if entry is not None]
def is_heading(text):
    """Return True when a line of text looks like a heading.

    A line counts as a heading when it is entirely upper case, or when it
    starts with a capital letter followed by word/whitespace characters and
    then a colon (for example ``"Summary: ..."``).

    Args:
        text: The line to classify.

    Returns:
        ``True`` or ``False``; always a real ``bool``, never a match object.
    """
    if text.isupper():
        return True
    return re.match(r'^[A-Z][\w\s]+:', text) is not None
def pdf_to_markdown(pdf_file):
    """Convert a PDF into a ZIP archive holding Markdown plus extracted images.

    Each page's text is split into lines; lines recognised by ``is_heading``
    become ``##`` headings and the rest become paragraphs. Every image on
    the page is saved as PNG, referenced from the Markdown and followed by
    its OCR text when there is any. A page that fails is reported on stdout
    and skipped.

    Args:
        pdf_file: Either a filesystem path (``str``) or an object with a
            ``name`` attribute. Such an object is read through ``read()``
            when it has one, otherwise through its ``value`` attribute.

    Returns:
        Path of the generated ZIP file in the system temporary directory.
        When the conversion fails, the path of an ``error.txt`` file in the
        same directory containing the error message is returned instead.
    """
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            if isinstance(pdf_file, str):
                temp_pdf_path = pdf_file
                source_name = pdf_file
            elif hasattr(pdf_file, 'name'):
                temp_pdf_path = os.path.join(temp_dir, "temp.pdf")
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.read() if hasattr(pdf_file, 'read') else pdf_file.value)
                source_name = pdf_file.name
            else:
                raise ValueError("Tipo de arquivo não suportado")

            # Keep only the file name: an upload's ``name`` can be a full
            # path, and joining an absolute path would escape temp_dir.
            original_filename = os.path.splitext(os.path.basename(source_name))[0]

            markdown_parts = []
            doc = fitz.open(temp_pdf_path)
            try:
                for page_num in range(len(doc)):
                    try:
                        page = doc[page_num]
                        text = page.get_text()
                        images_with_text = extract_images_and_text(doc, page_num, temp_dir)

                        for line in text.split('\n'):
                            if is_heading(line):
                                markdown_parts.append(f"## {line.strip()}\n\n")
                            else:
                                markdown_parts.append(f"{line.strip()}\n\n")

                        for img_filename, img_text in images_with_text:
                            markdown_parts.append(f"![{img_filename}]({img_filename})\n\n")
                            if img_text.strip():
                                markdown_parts.append(
                                    f"**Texto extraído da imagem:**\n\n{img_text.strip()}\n\n"
                                )
                    except Exception as e:
                        # Best effort: report the page and carry on.
                        print(f"Erro ao processar a página {page_num}: {str(e)}")
                        continue
            finally:
                # Release the document even if something unexpected escapes.
                doc.close()

            markdown_path = os.path.join(temp_dir, f"{original_filename}.md")
            with open(markdown_path, "w", encoding="utf-8") as f:
                f.write("".join(markdown_parts))

            # The ZIP lives outside temp_dir so it survives the cleanup.
            zip_filename = f"{original_filename}_output.zip"
            zip_path = os.path.join(tempfile.gettempdir(), zip_filename)
            with zipfile.ZipFile(zip_path, 'w') as zipf:
                zipf.write(markdown_path, os.path.basename(markdown_path))
                for filename in os.listdir(temp_dir):
                    if filename.endswith('.png'):
                        zipf.write(os.path.join(temp_dir, filename), filename)

            return zip_path
    except Exception as e:
        # Top-level boundary: hand the UI a downloadable error report.
        error_path = os.path.join(tempfile.gettempdir(), "error.txt")
        with open(error_path, "w", encoding="utf-8") as f:
            f.write(f"Erro: {str(e)}")
        return error_path
# Gradio interface: one PDF upload in, one downloadable file out.
iface = gr.Interface(
    title="Conversor de PDF para Markdown com OCR (compreensão de imagens)",
    description="Faça upload de um arquivo PDF para convertê-lo em Markdown, preservando códigos, imagens e extraindo texto das imagens. O resultado será um arquivo ZIP contendo o Markdown e as imagens extraídas.",
    fn=pdf_to_markdown,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download ZIP (Markdown + Imagens)"),
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()