# Hugging Face Spaces status: Build error
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import re | |
| import os | |
| import tempfile | |
| from PIL import Image | |
| import io | |
| import requests | |
| import zipfile | |
| import shutil | |
| from functools import lru_cache | |
| import concurrent.futures | |
def ocr_space_api(image_path):
    """Upload an image file to the OCR.space API and return the recognized text.

    The API key is taken from the ``OCR_SPACE_API_KEY`` environment variable;
    when that variable is not set, the placeholder key is used instead.
    Results are not cached: every call performs one HTTP request.

    Args:
        image_path: Path of the image file to upload.

    Returns:
        The ``ParsedText`` of the first parsed result (an empty string when the
        API returns no text), or a message starting with "Erro" that describes
        the failure. Failures are reported through the returned string instead
        of being raised.
    """
    # Prefer a key from the environment so the secret does not live in source.
    api_key = os.environ.get('OCR_SPACE_API_KEY', 'YOUR_OCR_SPACE_API_KEY')
    payload = {
        'isOverlayRequired': False,
        'apikey': api_key,
        'language': 'eng',
    }
    try:
        with open(image_path, 'rb') as image_file:
            response = requests.post(
                'https://api.ocr.space/parse/image',
                files={image_path: image_file},
                data=payload,
                timeout=30,  # give up after 30 seconds
            )
        response.raise_for_status()
        result = response.json()
    except requests.RequestException as e:
        return f"Erro na requisição HTTP: {str(e)}"
    except Exception as e:
        return f"Erro inesperado: {str(e)}"

    unexpected = "Erro: Formato de resposta inesperado da API OCR"
    if not isinstance(result, dict):
        return unexpected

    parsed_results = result.get('ParsedResults')
    if isinstance(parsed_results, list) and parsed_results:
        first = parsed_results[0]
        if isinstance(first, dict):
            # ParsedText may be present but null; callers expect a string.
            return first.get('ParsedText') or ''
        return unexpected

    # No usable result: surface the API's own error message when it sent one.
    if result.get('IsErroredOnProcessing'):
        message = result.get('ErrorMessage') or result.get('ErrorDetails') or ''
        if isinstance(message, (list, tuple)):
            message = '; '.join(str(item) for item in message)
        return f"Erro da API OCR: {message}"
    return unexpected
def extract_images_and_text(pdf_document, page_num, temp_dir):
    """Save the images of one PDF page as PNG files and OCR each saved file.

    Args:
        pdf_document: Open PyMuPDF document; it is indexed by page number and
            its ``extract_image(xref)`` method is used to read image data.
        page_num: Zero-based index of the page to process.
        temp_dir: Directory that receives the PNG files.

    Returns:
        A list of ``(image_filename, text)`` tuples, in the order the images
        are listed on the page. An image that cannot be processed is skipped
        after a message is printed.
    """
    page = pdf_document[page_num]
    image_list = page.get_images(full=True)
    if not image_list:
        return []

    # Modes Pillow can write to PNG; anything else (e.g. CMYK) is converted.
    png_modes = ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA")

    def save_image(img):
        """Extract one image and write it to temp_dir; return (filename, path, xref)."""
        xref = img[0]
        try:
            base_image = pdf_document.extract_image(xref)
            image_filename = f"image_page{page_num+1}_{xref}.png"
            image_path = os.path.join(temp_dir, image_filename)
            with Image.open(io.BytesIO(base_image["image"])) as image:
                writable = image if image.mode in png_modes else image.convert("RGB")
                writable.save(image_path)
            return (image_filename, image_path, xref)
        except Exception as e:
            print(f"Erro ao processar imagem {xref} na página {page_num}: {str(e)}")
            return None

    def run_ocr(saved):
        """OCR one saved image; return (filename, text) or None on failure."""
        if saved is None:
            return None
        image_filename, image_path, xref = saved
        try:
            return (image_filename, ocr_space_api(image_path))
        except Exception as e:
            print(f"Erro ao processar imagem {xref} na página {page_num}: {str(e)}")
            return None

    # The document is only touched from this thread: PyMuPDF does not support
    # concurrent access, so extraction and saving stay sequential.
    saved_images = [save_image(img) for img in image_list]

    # Only the network-bound OCR requests are run in parallel.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(run_ocr, saved_images))
    return [r for r in results if r is not None]
| def is_heading(text): | |
| return text.isupper() or re.match(r'^[A-Z][\w\s]+:', text) | |
def pdf_to_markdown(pdf_file):
    """Convert a PDF into a ZIP archive with a Markdown file and its images.

    Page text is split into lines; lines accepted by ``is_heading`` become
    ``##`` headings, all other lines become paragraphs. Each image returned by
    ``extract_images_and_text`` is referenced from the Markdown and followed
    by its OCR text when there is any.

    Args:
        pdf_file: Either a path string, or an object with a ``name`` attribute
            whose content is obtained from ``read()`` (or from ``value`` when
            it has no ``read`` method).

    Returns:
        Path of the ZIP file written to the system temp directory. When
        anything fails, the path of an ``error.txt`` file containing the
        error message is returned instead; no exception propagates.
    """
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            if isinstance(pdf_file, str):
                temp_pdf_path = pdf_file
                source_name = pdf_file
            elif hasattr(pdf_file, 'name'):
                temp_pdf_path = os.path.join(temp_dir, "temp.pdf")
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.read() if hasattr(pdf_file, 'read') else pdf_file.value)
                source_name = pdf_file.name
            else:
                raise ValueError("Tipo de arquivo não suportado")

            # Keep only the base name: `name` may be a full path, and joining an
            # absolute path below would write outside temp_dir / the temp dir.
            original_filename = os.path.splitext(os.path.basename(source_name))[0]

            markdown_parts = []
            doc = fitz.open(temp_pdf_path)
            try:
                for page_num in range(len(doc)):
                    try:
                        text = doc[page_num].get_text()
                        images_with_text = extract_images_and_text(doc, page_num, temp_dir)
                        for line in text.split('\n'):
                            if is_heading(line):
                                markdown_parts.append(f"## {line.strip()}\n\n")
                            else:
                                markdown_parts.append(f"{line.strip()}\n\n")
                        for img_filename, img_text in images_with_text:
                            # The PNG is stored next to the .md file in the ZIP,
                            # so a bare file name is a valid relative link.
                            markdown_parts.append(f"![{img_filename}]({img_filename})\n\n")
                            if img_text.strip():
                                markdown_parts.append(
                                    f"**Texto extraído da imagem:**\n\n{img_text.strip()}\n\n"
                                )
                    except Exception as e:
                        # Best effort: a failing page is reported and skipped.
                        print(f"Erro ao processar a página {page_num}: {str(e)}")
                        continue
            finally:
                # Release the document even if something unexpected happens.
                doc.close()

            markdown_path = os.path.join(temp_dir, f"{original_filename}.md")
            with open(markdown_path, "w", encoding="utf-8") as f:
                f.write("".join(markdown_parts))

            # The ZIP must outlive temp_dir, so it goes to the system temp dir.
            zip_filename = f"{original_filename}_output.zip"
            zip_path = os.path.join(tempfile.gettempdir(), zip_filename)
            with zipfile.ZipFile(zip_path, 'w') as zipf:
                zipf.write(markdown_path, os.path.basename(markdown_path))
                for filename in os.listdir(temp_dir):
                    if filename.endswith('.png'):
                        zipf.write(os.path.join(temp_dir, filename), filename)
            return zip_path
    except Exception as e:
        error_path = os.path.join(tempfile.gettempdir(), "error.txt")
        # Explicit encoding: the messages contain non-ASCII characters.
        with open(error_path, "w", encoding="utf-8") as f:
            f.write(f"Erro: {str(e)}")
        return error_path
# Gradio interface: one PDF upload in, one downloadable file out.
iface = gr.Interface(
    title="Conversor de PDF para Markdown com OCR (compreensão de imagens)",
    description=(
        "Faça upload de um arquivo PDF para convertê-lo em Markdown, "
        "preservando códigos, imagens e extraindo texto das imagens. "
        "O resultado será um arquivo ZIP contendo o Markdown e as imagens extraídas."
    ),
    fn=pdf_to_markdown,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download ZIP (Markdown + Imagens)"),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()