# Spaces: Build error
# File size: 5,772 Bytes
import gradio as gr
import fitz # PyMuPDF
import re
import os
import tempfile
from PIL import Image
import io
import requests
import zipfile
import shutil
from functools import lru_cache
import concurrent.futures
# OCR results are memoised per image path; failed attempts are never stored.
class _OcrError(Exception):
    """Internal signal carrying the user-facing message of a failed OCR attempt."""


@lru_cache(maxsize=100)
def _ocr_space_request(image_path):
    """Send one image file to the OCR.space endpoint and return its parsed text.

    Failures are raised as ``_OcrError`` instead of being returned, because
    ``lru_cache`` only stores return values: a transient network error must
    not be replayed from the cache on the next call for the same path.

    Args:
        image_path: Path of the image file to upload.

    Returns:
        The ``ParsedText`` of the first entry in ``ParsedResults`` (an empty
        string when that key is missing).

    Raises:
        _OcrError: If the file cannot be opened, the HTTP request fails, or
            the response does not have the expected shape.
    """
    # The key may be supplied through the OCR_SPACE_API_KEY environment
    # variable; the original placeholder is kept as the fallback value.
    api_key = os.environ.get('OCR_SPACE_API_KEY', 'YOUR_OCR_SPACE_API_KEY')
    payload = {
        'isOverlayRequired': False,
        'apikey': api_key,
        'language': 'eng',
    }
    try:
        image_file = open(image_path, 'rb')
    except Exception as e:
        raise _OcrError(f"Erro inesperado: {str(e)}") from e
    with image_file:
        try:
            response = requests.post(
                'https://api.ocr.space/parse/image',
                files={image_path: image_file},
                data=payload,
                timeout=30,  # seconds
            )
            response.raise_for_status()
            result = response.json()
            if isinstance(result, dict) and 'ParsedResults' in result and len(result['ParsedResults']) > 0:
                return result['ParsedResults'][0].get('ParsedText', '')
        except requests.RequestException as e:
            raise _OcrError(f"Erro na requisição HTTP: {str(e)}") from e
        except Exception as e:
            raise _OcrError(f"Erro inesperado: {str(e)}") from e
    raise _OcrError("Erro: Formato de resposta inesperado da API OCR")


def ocr_space_api(image_path):
    """Return the text recognised in the image at ``image_path``.

    Successful results are cached per path (up to 100 entries). Failures are
    not cached, so a later call retries the request.

    Args:
        image_path: Path of the image file to run OCR on.

    Returns:
        The recognised text on success. On failure no exception is raised;
        a message starting with ``"Erro"`` that describes the problem is
        returned instead.
    """
    try:
        return _ocr_space_request(image_path)
    except _OcrError as failure:
        return str(failure)


# Keep the cache-management handles that the previously decorated function exposed.
ocr_space_api.cache_info = _ocr_space_request.cache_info
ocr_space_api.cache_clear = _ocr_space_request.cache_clear
def extract_images_and_text(pdf_document, page_num, temp_dir):
    """Save the images of one PDF page as PNG files and OCR each of them.

    The raw image bytes are pulled out of the document sequentially in the
    calling thread; only the PNG conversion and the OCR request run in the
    thread pool, so the document object is never touched concurrently.

    Args:
        pdf_document: Open document. It is indexed by page number and must
            provide ``extract_image(xref)``.
        page_num: Zero-based index of the page to process.
        temp_dir: Directory that receives the generated PNG files.

    Returns:
        A list of ``(image_filename, text)`` tuples, in the order the images
        are reported by ``page.get_images``. Images that could not be
        processed are left out; the failure is printed.
    """
    page = pdf_document[page_num]
    image_list = page.get_images(full=True)

    def report_failure(xref, error):
        print(f"Erro ao processar imagem {xref} na página {page_num}: {str(error)}")

    # Step 1 (single-threaded): read the embedded image bytes from the document.
    extracted = []
    for img in image_list:
        xref = img[0]
        try:
            extracted.append((xref, pdf_document.extract_image(xref)["image"]))
        except Exception as e:
            report_failure(xref, e)

    # Step 2 (parallel): write the PNG and run the network-bound OCR call.
    def process_image(item):
        xref, image_bytes = item
        try:
            image_filename = f"image_page{page_num+1}_{xref}.png"
            image_path = os.path.join(temp_dir, image_filename)
            with Image.open(io.BytesIO(image_bytes)) as image:
                # The PNG writer rejects CMYK data, so convert it first
                # instead of losing the image to a save error.
                writable = image.convert("RGB") if image.mode == "CMYK" else image
                writable.save(image_path)
            text = ocr_space_api(image_path)
            return (image_filename, text)
        except Exception as e:
            report_failure(xref, e)
            return None

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(process_image, extracted))
    return [r for r in results if r is not None]
def is_heading(text):
    """Tell whether a line of text should be rendered as a heading.

    A line qualifies when it is entirely upper case, or when it begins with
    a capital letter followed by word/space characters and a colon.

    Returns:
        ``True`` for an upper-case line; otherwise the result of the regex
        match (a match object, or ``None`` when the line is not a heading).
    """
    if text.isupper():
        return True
    return re.match(r'^[A-Z][\w\s]+:', text)
def pdf_to_markdown(pdf_file):
    """Convert a PDF into a ZIP archive holding a Markdown file and its images.

    Every page contributes its text (lines recognised by ``is_heading`` become
    ``##`` headings) followed by a Markdown image reference for each extracted
    image and, when available, the text OCR found in that image.

    Args:
        pdf_file: Either a path to the PDF (``str``) or an object with a
            ``name`` attribute whose content is obtained from ``read()`` or,
            failing that, from ``value``.

    Returns:
        Path of the generated ZIP in the system temp directory. If the
        conversion fails, the path of an ``error.txt`` file (same directory)
        containing the error message is returned instead; no exception is
        raised.
    """
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            if isinstance(pdf_file, str):
                temp_pdf_path = pdf_file
                source_name = pdf_file
            elif hasattr(pdf_file, 'name'):
                temp_pdf_path = os.path.join(temp_dir, "temp.pdf")
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.read() if hasattr(pdf_file, 'read') else pdf_file.value)
                source_name = pdf_file.name
            else:
                raise ValueError("Tipo de arquivo não suportado")
            # Always reduce to the base name: ``name`` may be a full path, and
            # os.path.join would then discard temp_dir / the temp root below.
            original_filename = os.path.splitext(os.path.basename(source_name))[0]

            parts = []
            # The context manager closes the document even if a step fails, so
            # the temporary directory can always be removed afterwards.
            with fitz.open(temp_pdf_path) as doc:
                for page_num in range(len(doc)):
                    try:
                        page = doc[page_num]
                        text = page.get_text()
                        images_with_text = extract_images_and_text(doc, page_num, temp_dir)
                        for line in text.split('\n'):
                            stripped = line.strip()
                            if not stripped:
                                # Blank lines would only add empty paragraphs.
                                continue
                            if is_heading(line):
                                parts.append(f"## {stripped}\n\n")
                            else:
                                parts.append(f"{stripped}\n\n")
                        for img_filename, img_text in images_with_text:
                            # Reference the PNG that is shipped next to the
                            # Markdown file inside the ZIP.
                            parts.append(f"![{img_filename}]({img_filename})\n\n")
                            if img_text.strip():
                                parts.append(f"**Texto extraído da imagem:**\n\n{img_text.strip()}\n\n")
                    except Exception as e:
                        # Best effort: a broken page must not abort the whole file.
                        print(f"Erro ao processar a página {page_num}: {str(e)}")
                        continue
            markdown_content = "".join(parts)

            markdown_path = os.path.join(temp_dir, f"{original_filename}.md")
            with open(markdown_path, "w", encoding="utf-8") as f:
                f.write(markdown_content)

            # The ZIP lives outside temp_dir so it survives the cleanup.
            zip_filename = f"{original_filename}_output.zip"
            zip_path = os.path.join(tempfile.gettempdir(), zip_filename)
            with zipfile.ZipFile(zip_path, 'w') as zipf:
                zipf.write(markdown_path, os.path.basename(markdown_path))
                for filename in os.listdir(temp_dir):
                    if filename.endswith('.png'):
                        zipf.write(os.path.join(temp_dir, filename), filename)
            return zip_path
    except Exception as e:
        # Top-level boundary for the UI: hand back a downloadable error report.
        error_path = os.path.join(tempfile.gettempdir(), "error.txt")
        with open(error_path, "w", encoding="utf-8") as f:
            f.write(f"Erro: {str(e)}")
        return error_path
# Gradio interface: a single file upload in, a single downloadable file out.
iface = gr.Interface(
    fn=pdf_to_markdown,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download ZIP (Markdown + Imagens)"),
    title="Conversor de PDF para Markdown com OCR (compreensão de imagens)",
    description="Faça upload de um arquivo PDF para convertê-lo em Markdown, preservando códigos, imagens e extraindo texto das imagens. O resultado será um arquivo ZIP contendo o Markdown e as imagens extraídas.",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()