File size: 5,772 Bytes
a97042a
 
 
 
 
 
 
4a3021f
a97042a
 
4a3021f
 
a97042a
4a3021f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a97042a
 
 
 
 
4a3021f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a97042a
 
4a3021f
 
a97042a
 
 
 
4a3021f
a97042a
 
4a3021f
a97042a
4a3021f
 
a97042a
 
 
 
 
 
 
 
4a3021f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a97042a
4a3021f
 
 
a97042a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a3021f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gradio as gr
import fitz  # PyMuPDF
import re
import os
import tempfile
from PIL import Image
import io
import requests
import zipfile
import shutil
from functools import lru_cache
import concurrent.futures

class _OcrResponseError(Exception):
    """Raised when the OCR.space reply carries no parsed result."""


# Cache for OCR results (successful ones only, see below)
@lru_cache(maxsize=100)
def _ocr_space_api_cached(image_path):
    """Upload *image_path* to OCR.space and return the recognised text.

    Every failure is raised instead of returned: ``lru_cache`` does not
    memoise calls that raise, so a transient network/API error is retried on
    the next call rather than being replayed from the cache.
    """
    # Prefer a key supplied through the environment; the in-source placeholder
    # is kept as a fallback so existing setups behave as before.
    api_key = os.environ.get('OCR_SPACE_API_KEY', 'YOUR_OCR_SPACE_API_KEY')
    payload = {
        'isOverlayRequired': False,
        'apikey': api_key,
        'language': 'eng',
    }
    with open(image_path, 'rb') as image_file:
        response = requests.post(
            'https://api.ocr.space/parse/image',
            files={image_path: image_file},
            data=payload,
            timeout=30,  # seconds
        )
    response.raise_for_status()
    result = response.json()

    if isinstance(result, dict) and 'ParsedResults' in result and len(result['ParsedResults']) > 0:
        return result['ParsedResults'][0].get('ParsedText', '')
    raise _OcrResponseError("Erro: Formato de resposta inesperado da API OCR")


def ocr_space_api(image_path):
    """Return the text OCR.space recognises in the image at *image_path*.

    Args:
        image_path: Path of the image file to upload.

    Returns:
        The ``ParsedText`` of the first parsed result on success. On failure
        a human-readable error string is returned instead of raising (the
        messages start with ``"Erro"``). Successful results are cached per
        path (up to 100 entries); failures are not cached.
    """
    try:
        return _ocr_space_api_cached(image_path)
    except _OcrResponseError as e:
        return str(e)
    except requests.RequestException as e:
        return f"Erro na requisição HTTP: {str(e)}"
    except Exception as e:
        return f"Erro inesperado: {str(e)}"


# Keep the cache-management helpers that the lru_cache-decorated function
# used to expose directly.
ocr_space_api.cache_info = _ocr_space_api_cached.cache_info
ocr_space_api.cache_clear = _ocr_space_api_cached.cache_clear

def extract_images_and_text(pdf_document, page_num, temp_dir):
    """Save every image of one PDF page as PNG and OCR each of them.

    Args:
        pdf_document: Opened PyMuPDF document.
        page_num: Zero-based index of the page to process.
        temp_dir: Directory that receives the extracted PNG files.

    Returns:
        A list of ``(image_filename, ocr_text)`` tuples in the order the
        images are listed on the page. Images that fail to extract, save or
        OCR are reported on stdout and left out of the list.
    """
    # Modes Pillow's PNG writer accepts; anything else (e.g. CMYK) is
    # converted to RGB first instead of failing on save.
    png_modes = {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}

    page = pdf_document[page_num]
    image_list = page.get_images(full=True)

    def save_image(img):
        """Extract one image and write it to temp_dir as PNG."""
        try:
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_filename = f"image_page{page_num+1}_{xref}.png"
            image_path = os.path.join(temp_dir, image_filename)
            with Image.open(io.BytesIO(base_image["image"])) as image:
                writable = image if image.mode in png_modes else image.convert("RGB")
                writable.save(image_path)
            return (xref, image_filename, image_path)
        except Exception as e:
            print(f"Erro ao processar imagem {img[0]} na página {page_num}: {str(e)}")
            return None

    def run_ocr(saved):
        """OCR one previously saved image."""
        xref, image_filename, image_path = saved
        try:
            return (image_filename, ocr_space_api(image_path))
        except Exception as e:
            print(f"Erro ao processar imagem {xref} na página {page_num}: {str(e)}")
            return None

    # PyMuPDF objects must not be used from several threads at once, so all
    # document access happens here, serially, in the calling thread.
    saved_images = [
        saved for saved in (save_image(img) for img in image_list)
        if saved is not None
    ]

    # Only the network-bound OCR requests are fanned out to worker threads.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(run_ocr, saved_images))

    return [r for r in results if r is not None]

# A capitalised label followed by a colon, e.g. "Summary: ...".
_HEADING_LABEL_RE = re.compile(r'^[A-Z][\w\s]+:')


def is_heading(text):
    """Tell whether a line of extracted PDF text looks like a heading.

    A line counts as a heading when it is written entirely in upper case, or
    when it starts with an upper-case ASCII letter followed by word/space
    characters and a colon. Surrounding whitespace is ignored so both rules
    treat indented lines the same way.

    Args:
        text: One line of text.

    Returns:
        ``True`` if the line looks like a heading, ``False`` otherwise
        (always a real ``bool``, never a match object or ``None``).
    """
    candidate = text.strip()
    return candidate.isupper() or _HEADING_LABEL_RE.match(candidate) is not None

def pdf_to_markdown(pdf_file):
    """Convert a PDF into a ZIP archive holding a Markdown file and its images.

    Each page's text is emitted line by line (lines recognised by
    ``is_heading`` become ``##`` headings), followed by the page's images and
    any text OCR extracted from them.

    Args:
        pdf_file: Either a filesystem path (``str``) or an upload object that
            has a ``name`` attribute and exposes its bytes through ``read()``
            or a ``value`` attribute.

    Returns:
        Path of the generated ZIP file in the system temp directory. If the
        conversion fails outside the per-page loop, the path of an
        ``error.txt`` file describing the problem is returned instead.
    """
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            if isinstance(pdf_file, str):
                temp_pdf_path = pdf_file
                source_name = pdf_file
            elif hasattr(pdf_file, 'name'):
                temp_pdf_path = os.path.join(temp_dir, "temp.pdf")
                with open(temp_pdf_path, "wb") as f:
                    f.write(pdf_file.read() if hasattr(pdf_file, 'read') else pdf_file.value)
                source_name = pdf_file.name
            else:
                raise ValueError("Tipo de arquivo não suportado")

            # Always reduce to the bare file name: an upload's ``name`` may be
            # a full path, and joining an absolute path below would discard
            # temp_dir / the temp root and write next to the uploaded file.
            original_filename = os.path.splitext(os.path.basename(source_name))[0]

            markdown_parts = []
            doc = fitz.open(temp_pdf_path)
            try:
                # Index-based loop on purpose: loading the page happens inside
                # the per-page try so one broken page does not abort the rest.
                for page_num in range(len(doc)):
                    try:
                        page = doc[page_num]
                        text = page.get_text()

                        images_with_text = extract_images_and_text(doc, page_num, temp_dir)

                        for line in text.split('\n'):
                            stripped = line.strip()
                            if not stripped:
                                # Blank lines would only add empty paragraphs.
                                continue
                            if is_heading(line):
                                markdown_parts.append(f"## {stripped}\n\n")
                            else:
                                markdown_parts.append(f"{stripped}\n\n")

                        for img_filename, img_text in images_with_text:
                            markdown_parts.append(f"![{img_filename}]({img_filename})\n\n")
                            if img_text.strip():
                                markdown_parts.append(
                                    f"**Texto extraído da imagem:**\n\n{img_text.strip()}\n\n"
                                )

                    except Exception as e:
                        print(f"Erro ao processar a página {page_num}: {str(e)}")
                        continue
            finally:
                # Release the document even if something unexpected escapes.
                doc.close()

            markdown_path = os.path.join(temp_dir, f"{original_filename}.md")
            with open(markdown_path, "w", encoding="utf-8") as f:
                f.write("".join(markdown_parts))

            # The ZIP lives outside temp_dir so it survives the cleanup that
            # happens when the ``with`` block exits.
            zip_filename = f"{original_filename}_output.zip"
            zip_path = os.path.join(tempfile.gettempdir(), zip_filename)
            with zipfile.ZipFile(zip_path, 'w') as zipf:
                zipf.write(markdown_path, os.path.basename(markdown_path))
                for filename in sorted(os.listdir(temp_dir)):
                    if filename.endswith('.png'):
                        zipf.write(os.path.join(temp_dir, filename), filename)

            return zip_path

    except Exception as e:
        # Report the failure as a downloadable file instead of raising.
        error_path = os.path.join(tempfile.gettempdir(), "error.txt")
        with open(error_path, "w", encoding="utf-8") as f:
            f.write(f"Erro: {str(e)}")
        return error_path

# Gradio interface: a single PDF upload in, a single downloadable ZIP out.
iface = gr.Interface(
    title="Conversor de PDF para Markdown com OCR (compreensão de imagens)",
    description=(
        "Faça upload de um arquivo PDF para convertê-lo em Markdown, "
        "preservando códigos, imagens e extraindo texto das imagens. "
        "O resultado será um arquivo ZIP contendo o Markdown e as imagens extraídas."
    ),
    fn=pdf_to_markdown,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.File(label="Download ZIP (Markdown + Imagens)"),
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()