"""Streamlit app that renders PDF pages to images and overlays heatmaps.

For every page of an uploaded PDF the app rasterises the page with PyMuPDF,
builds intensity-based heatmaps, runs EasyOCR, clusters the recognised text
with BERT embeddings, saves the annotated page as an image and finally offers
all images as a ZIP download.
"""

import concurrent.futures
import logging
import os
import shutil
import threading
import time
import zipfile
from io import BytesIO
from pathlib import Path

import easyocr
import fitz  # PyMuPDF
import matplotlib
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import psutil
import streamlit as st
import torch
from PIL import Image, ImageDraw
from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress
from rich.table import Table
from scipy.ndimage import gaussian_filter
from skimage import filters, morphology
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer

try:
    # Needed so that st.* calls issued from worker threads reach the session.
    from streamlit.runtime.scriptrunner import (
        add_script_run_ctx,
        get_script_run_ctx,
    )
except ImportError:  # Streamlit version without this module path
    add_script_run_ctx = None
    get_script_run_ctx = None

# Logging configuration.
# Streamlit re-executes this script on every interaction. The handlers are
# only created when the root logger has none yet; otherwise each rerun would
# open (and truncate, because of mode='w') the log file again and leak the
# handle, while basicConfig itself would ignore the new handlers anyway.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(
                'pdf_processor_streamlit.log', mode='w', encoding='utf-8'
            ),  # log to file
            logging.StreamHandler(),  # log to console
        ],
    )

# Configuration constants.
# NOTE(review): the original comment said these are loaded from .env, but no
# such loading is visible in this file; the values are hard-coded.
EXTRACTED_FOLDER = 'extracted'  # output directory
MAX_WORKERS = 4  # number of worker threads
IMAGE_FORMAT = 'jpeg'  # output image format
IMAGE_DPI = 300  # rasterisation DPI
LOG_PROCESS = True
REMOVE_EXTRACTED = True
HEATMAP_ALPHA = 0.3  # heatmap transparency
BLUR_RADIUS = 10  # sigma of the Gaussian blur
THRESHOLD = 0.1  # threshold value (not referenced elsewhere in this file)
HEATMAP_COLOR_SCHEME = 'viridis'  # default heatmap colour scheme
TEXT_HEATMAP_ALPHA = 0.5
SIMILARITY_THRESHOLD = 0.7
OCR_DETECTION_THRESHOLD = 0.1
JPEG_QUALITY = 90
MAX_IMAGE_SIZE = 1000

# Identity decorator on Streamlit versions without st.cache_resource.
_cache_resource = getattr(st, "cache_resource", None) or (lambda func: func)


@_cache_resource
def _load_ocr_reader():
    """Create the EasyOCR reader, trying the threshold keywords first.

    The keyword names are tried in order (`detect_threshold`, then
    `threshold`); if the constructor rejects both with TypeError the reader
    is built with default settings.
    """
    try:
        return easyocr.Reader(['en'], detect_threshold=OCR_DETECTION_THRESHOLD)
    except TypeError:
        try:
            return easyocr.Reader(['en'], threshold=OCR_DETECTION_THRESHOLD)
        except TypeError:
            return easyocr.Reader(['en'])


@_cache_resource
def _load_bert():
    """Load the BERT tokenizer and model used for text embeddings."""
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    return bert_tokenizer, bert_model


# OCR and BERT initialisation. The loaders are cached (when Streamlit
# supports it) so the models are not rebuilt on every script rerun.
ocr_reader = _load_ocr_reader()
tokenizer, model = _load_bert()

# Simplified colour palettes: name -> list of (R, G, B) tuples.
COLOR_SCHEMES = {
    'red': [(i, 0, 0) for i in range(256)],
    'green': [(0, i, 0) for i in range(256)],
    'blue': [(0, 0, i) for i in range(256)],
    'viridis': [
        (48, 7, 75), (53, 8, 87), (59, 8, 99), (65, 7, 112),
        (71, 7, 125), (77, 7, 139), (84, 9, 154), (90, 10, 170),
        (97, 12, 187), (105, 16, 203), (112, 22, 220), (120, 29, 237),
        (128, 38, 253), (136, 51, 255), (145, 65, 255), (154, 80, 254),
        (163, 97, 252), (173, 114, 249), (183, 132, 244), (193, 150, 238),
        (203, 167, 231), (213, 184, 223), (223, 199, 215), (232, 213, 206),
        (240, 227, 197), (247, 240, 188), (252, 252, 178), (255, 255, 168),
        (255, 252, 158), (255, 244, 147), (255, 233, 137), (255, 221, 126),
        (255, 208, 115), (255, 195, 104), (255, 181, 92), (255, 167, 81),
        (255, 153, 69), (255, 138, 58), (255, 123, 46), (255, 108, 35),
        (255, 92, 23), (255, 77, 12), (255, 61, 0),
    ],
    'magma': [
        (0, 0, 0), (7, 0, 9), (13, 2, 18), (19, 5, 28),
        (24, 9, 38), (29, 13, 48), (34, 18, 58), (39, 24, 68),
        (43, 31, 78), (48, 39, 88), (52, 48, 98), (57, 58, 108),
        (61, 69, 118), (65, 81, 127), (69, 94, 137), (73, 108, 146),
        (77, 122, 154), (81, 136, 162), (85, 150, 169), (89, 164, 176),
        (92, 178, 183), (96, 192, 189), (99, 205, 195), (102, 219, 201),
        (105, 232, 206), (108, 245, 212), (111, 255, 217),
    ],
    'inferno': [
        (0, 0, 0), (3, 1, 8), (7, 1, 16), (11, 2, 24),
        (15, 3, 32), (19, 4, 40), (23, 5, 48), (27, 7, 56),
        (32, 8, 64), (36, 9, 72), (40, 11, 80), (44, 13, 88),
        (49, 15, 96), (54, 17, 104), (59, 19, 112), (64, 22, 120),
        (69, 24, 128), (75, 26, 136), (80, 29, 143), (86, 32, 151),
        (92, 35, 159), (98, 38, 166), (104, 42, 174), (111, 45, 181),
        (118, 49, 188), (125, 53, 195), (132, 58, 202), (140, 62, 209),
        (148, 67, 215), (156, 72, 222), (164, 78, 228), (172, 84, 234),
        (181, 90, 239), (190, 97, 244), (199, 104, 248), (208, 111, 252),
        (217, 119, 255), (227, 127, 255), (236, 135, 255), (246, 145, 254),
        (255, 154, 252), (255, 164, 248), (255, 174, 243), (255, 185, 237),
        (255, 196, 230), (255, 207, 223), (255, 219, 214), (255, 231, 205),
        (255, 242, 195), (255, 253, 184), (255, 255, 171), (255, 255, 155),
    ],
    'plasma': [
        (0, 0, 0), (3, 0, 6), (6, 1, 13), (9, 2, 19),
        (11, 3, 26), (14, 5, 32), (17, 8, 39), (20, 11, 45),
        (22, 15, 51), (24, 19, 57), (26, 23, 63), (28, 28, 69),
        (30, 32, 75), (31, 37, 81), (33, 42, 87), (34, 47, 92),
        (35, 53, 98), (36, 59, 104), (36, 65, 109), (37, 71, 115),
        (38, 77, 120), (38, 83, 125), (39, 90, 130), (39, 96, 135),
        (40, 102, 140), (41, 109, 144), (42, 115, 149), (44, 122, 154),
        (46, 128, 158), (47, 135, 162), (49, 142, 166), (51, 148, 170),
        (54, 155, 173), (56, 161, 177), (59, 168, 180), (61, 174, 183),
        (64, 181, 186), (67, 187, 189), (70, 193, 192), (73, 199, 194),
        (76, 206, 197), (79, 212, 199), (83, 218, 201), (86, 224, 203),
        (89, 230, 205), (92, 236, 208), (96, 241, 208), (100, 246, 210),
        (103, 251, 211), (106, 255, 212),
    ],
}


def _get_colormap(name, lut=None):
    """Return a Matplotlib colormap, optionally resampled to `lut` entries.

    Prefers the `matplotlib.colormaps` registry and falls back to
    `cm.get_cmap` on Matplotlib versions that lack the registry or the
    `resampled` method.
    """
    registry = getattr(matplotlib, "colormaps", None)
    if registry is not None:
        cmap = registry[name]
        if lut is None:
            return cmap
        if hasattr(cmap, "resampled"):
            return cmap.resampled(lut)
    return cm.get_cmap(name, lut)


def _bbox_to_rect(bbox):
    """Convert a sequence of (x, y) corner points to an ordered rectangle.

    Using min/max over all points guarantees the first corner is the
    top-left one even when the detected box is rotated, which
    `ImageDraw.rectangle` requires.
    """
    xs = [int(point[0]) for point in bbox]
    ys = [int(point[1]) for point in bbox]
    return [(min(xs), min(ys)), (max(xs), max(ys))]


def list_pdf_files(root_dir='.'):
    """Return the PDF files found directly inside `root_dir` (non-recursive)."""
    logging.info(f"🔎 Iniciando a busca por arquivos PDF em: {root_dir}")
    pdf_files = list(Path(root_dir).glob('*.pdf'))
    logging.info(f"📚 Encontrados {len(pdf_files)} arquivos PDF.")
    return pdf_files


def create_dataframe(pdf_files):
    """Build a DataFrame describing the PDF files.

    Args:
        pdf_files: iterable of `Path` objects.

    Returns:
        DataFrame with columns `filepath`, `filename` and `pages_processed`
        (initialised to 0).
    """
    logging.info("📊 Criando DataFrame...")
    df = pd.DataFrame(pdf_files, columns=['filepath'])
    df['filename'] = df['filepath'].apply(lambda path: path.name)
    df['pages_processed'] = 0  # counter filled in after processing
    logging.info("✅ DataFrame criado.")
    return df


def create_output_folder(filename):
    """Create (if needed) and return the output folder for one PDF.

    The folder is `EXTRACTED_FOLDER/<filename without extension>`.
    """
    output_path = Path(EXTRACTED_FOLDER) / Path(filename).stem
    os.makedirs(output_path, exist_ok=True)
    return output_path


def create_image_heatmap(image_array, heatmap_type='object'):
    """Generate a heatmap from an image array.

    The image is averaged over its third axis, blurred, thresholded with
    Otsu's method, optionally cleaned with morphological filters and blurred
    again.

    Args:
        image_array: array indexed as (row, column, channel).
        heatmap_type: 'object' removes small objects and small holes,
            'logo' removes small objects only; any other value applies no
            morphological cleanup.

    Returns:
        2-D `uint8` array with the same height and width as the input.
    """
    height, width = image_array.shape[:2]
    # Guard up front: the filters below cannot work on an empty image.
    if height == 0 or width == 0:
        return np.zeros((height, width), dtype=np.uint8)

    gray_image = np.mean(image_array, axis=2)
    blurred_image = gaussian_filter(gray_image, sigma=BLUR_RADIUS)
    binary_mask = blurred_image > filters.threshold_otsu(blurred_image)

    if heatmap_type == 'object':
        binary_mask = morphology.remove_small_objects(binary_mask, min_size=100)
        binary_mask = morphology.remove_small_holes(binary_mask, area_threshold=50)
    elif heatmap_type == 'logo':
        binary_mask = morphology.remove_small_objects(binary_mask, min_size=50)

    # Blur the mask again to obtain smooth intensities, then scale to 0..255.
    filtered_image = gaussian_filter(binary_mask.astype(float), sigma=BLUR_RADIUS)
    return (filtered_image * 255).astype(np.uint8)


def _palette_lookup_table(color_scheme):
    """Return a (>=256, 3) uint8 lookup table for `color_scheme`.

    Unknown or empty schemes fall back to grayscale. Short palettes are
    padded with their last colour on a copy, so the shared `COLOR_SCHEMES`
    entries are never modified.
    """
    palette = COLOR_SCHEMES.get(color_scheme)
    if not palette:
        palette = [(i, i, i) for i in range(256)]
    elif len(palette) < 256:
        palette = list(palette) + [palette[-1]] * (256 - len(palette))
    return np.asarray(palette, dtype=np.uint8)


def apply_heatmap_overlay(image, heatmap, alpha=HEATMAP_ALPHA, color_scheme = HEATMAP_COLOR_SCHEME):
    """Paste a colourised heatmap onto `image`.

    Args:
        image: PIL image; it is modified in place.
        heatmap: 2-D integer array with values in 0..255.
        alpha: opacity (0..1) applied uniformly to every heatmap pixel.
        color_scheme: key of `COLOR_SCHEMES`; unknown names use grayscale.

    Returns:
        The same `image` object, after pasting.
    """
    lookup = _palette_lookup_table(color_scheme)
    heat = np.asarray(heatmap)

    # Vectorised palette lookup instead of a per-pixel Python loop.
    rgba = np.empty(heat.shape + (4,), dtype=np.uint8)
    rgba[..., :3] = lookup[heat]
    rgba[..., 3] = int(alpha * 255)

    overlay = Image.fromarray(rgba)
    image.paste(overlay, (0, 0), overlay)
    return image


def extract_text_from_page(image):
    """Run EasyOCR on a PIL image and return the raw `readtext` results."""
    return ocr_reader.readtext(np.array(image))


def get_text_embeddings(text):
    """Return the mean-pooled last hidden state of BERT for `text`."""
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


def create_text_heatmap(image, text_boxes, similarity_threshold = SIMILARITY_THRESHOLD, text_heatmap_alpha = TEXT_HEATMAP_ALPHA):
    """Highlight text boxes whose content is similar to the other boxes.

    Pairwise cosine similarities of the text embeddings are min-max
    normalised; a box is highlighted when its mean normalised similarity to
    all other boxes exceeds `similarity_threshold`.

    Args:
        image: PIL image (left untouched; a copy is annotated).
        text_boxes: sequence of `(bbox, text, _)` triples, where `bbox` is a
            sequence of (x, y) points.
        similarity_threshold: minimum mean normalised similarity to highlight.
        text_heatmap_alpha: opacity (0..1) of the highlight colour.

    Returns:
        RGBA copy of `image` with the highlights blended in.
    """
    base = image.copy().convert("RGBA")
    if not text_boxes or len(text_boxes) < 2:
        return base

    texts = [text for (_, text, _) in text_boxes]
    embeddings = np.vstack([get_text_embeddings(text) for text in texts])
    similarity_matrix = cosine_similarity(embeddings)

    # Min-max normalisation; when every similarity is equal there is no
    # spread to normalise, so use zeros instead of dividing by zero (NaN).
    lowest = np.min(similarity_matrix)
    spread = np.max(similarity_matrix) - lowest
    if spread > 0:
        normalized = (similarity_matrix - lowest) / spread
    else:
        normalized = np.zeros_like(similarity_matrix)

    # Mean similarity of each text to all *other* texts (diagonal excluded).
    count = len(texts)
    mean_similarity = (normalized.sum(axis=1) - normalized.diagonal()) / (count - 1)

    cmap = _get_colormap('viridis')
    fill_alpha = int(text_heatmap_alpha * 255)

    # Draw on a transparent layer and composite it: drawing directly on an
    # RGBA image would overwrite pixels instead of blending with the page.
    overlay = Image.new("RGBA", base.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(overlay)
    for (bbox, _, _), score in zip(text_boxes, mean_similarity):
        if score > similarity_threshold:
            rgb = tuple(int(channel * 255) for channel in cmap(score)[:3])
            draw.rectangle(_bbox_to_rect(bbox), fill=rgb + (fill_alpha,))

    return Image.alpha_composite(base, overlay)


def resize_image(image, max_size):
    """Shrink `image` so its longest side is `max_size`, keeping proportions.

    Images already within the limit are returned unchanged.
    """
    width, height = image.size
    longest = max(width, height)
    if longest <= max_size:
        return image

    scale = max_size / longest
    # max(1, ...) keeps very elongated images from collapsing to 0 pixels.
    if width > height:
        new_size = (max_size, max(1, int(height * scale)))
    else:
        new_size = (max(1, int(width * scale)), max_size)
    return image.resize(new_size, Image.LANCZOS)


def cluster_text_by_similarity(image, text_boxes, num_clusters=3):
    """Colour text boxes according to clusters of their BERT embeddings.

    Args:
        image: PIL image (left untouched; a copy is annotated).
        text_boxes: sequence of `(bbox, text, _)` triples.
        num_clusters: desired number of clusters; reduced to the number of
            text boxes when there are fewer boxes than clusters.

    Returns:
        `image` itself when fewer than two boxes are given, otherwise an
        RGBA copy with one filled rectangle per text box.
    """
    if not text_boxes or len(text_boxes) < 2:
        return image

    texts = [text for (_, text, _) in text_boxes]
    embeddings = np.vstack([get_text_embeddings(text) for text in texts])
    num_clusters = min(num_clusters, len(texts))

    clustering = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    labels = clustering.fit(embeddings).labels_

    # One discrete colour per cluster label.
    cmap = _get_colormap('viridis', num_clusters)
    fill_alpha = int(TEXT_HEATMAP_ALPHA * 255)

    image_with_text_clusters = image.copy().convert("RGBA")
    draw = ImageDraw.Draw(image_with_text_clusters)
    for (bbox, _, _), label in zip(text_boxes, labels):
        rgb = tuple(int(channel * 255) for channel in cmap(int(label))[:3])
        draw.rectangle(_bbox_to_rect(bbox), fill=rgb + (fill_alpha,))
    return image_with_text_clusters


def process_image(image, console, page_number):
    """Apply every overlay to one page image.

    Args:
        image: PIL image of the page.
        console: unused here; kept for interface compatibility.
        page_number: unused here; kept for interface compatibility.

    Returns:
        The resized page image with all overlays pasted onto it.
    """
    image = resize_image(image, MAX_IMAGE_SIZE)
    image_array = np.array(image)

    # All overlays are accumulated on a transparent layer first.
    overlay = Image.new("RGBA", image.size, (0, 0, 0, 0))

    # 1. Object heatmap (red).
    object_heatmap = create_image_heatmap(image_array, heatmap_type='object')
    overlay = apply_heatmap_overlay(overlay, object_heatmap, color_scheme='red')

    # 2. Text-block heatmap (blue).
    text_heatmap = create_image_heatmap(image_array, heatmap_type='text')
    overlay = apply_heatmap_overlay(overlay, text_heatmap, color_scheme='blue')

    # 3. OCR, then 4. colour the text boxes by semantic cluster.
    text_boxes = extract_text_from_page(image)
    overlay = cluster_text_by_similarity(overlay, text_boxes)

    # 5. Logo heatmap (green).
    logo_heatmap = create_image_heatmap(image_array, heatmap_type='logo')
    overlay = apply_heatmap_overlay(overlay, logo_heatmap, color_scheme='green')

    # Blend the accumulated overlay onto the page itself.
    image.paste(overlay, (0, 0), overlay)
    return image


def process_pdf_page(page, output_path, page_number, filename, console, image_placeholder):
    """Render, annotate, save and display a single PDF page.

    Args:
        page: PyMuPDF page object.
        output_path: `Path` of the folder receiving the image.
        page_number: zero-based page index.
        filename: name of the PDF, used in messages.
        console: forwarded to `process_image`.
        image_placeholder: unused here; kept for interface compatibility.

    Returns:
        True when the page was processed and saved, False on any error
        (the error is logged).
    """
    try:
        pix = page.get_pixmap(dpi=IMAGE_DPI)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        processed_image = process_image(image, console, page_number)

        image_path = output_path / f"page_{page_number + 1}.{IMAGE_FORMAT}"
        # JPEG has no alpha channel, so convert to RGB before saving.
        processed_image.convert('RGB').save(image_path, quality=JPEG_QUALITY, format=IMAGE_FORMAT)

        if LOG_PROCESS:
            st.success(f"🖼️ Página {page_number + 1} de '{filename}' salva em: {image_path}")

        st.image(processed_image, caption=f"Página {page_number + 1}", use_column_width=True)
        return True
    except Exception as e:
        logging.exception(f"❌ Erro ao processar a página {page_number+1} de '{filename}': {e}")
        return False


def process_pdf(row, image_placeholder):
    """Process every page of one PDF file.

    Args:
        row: mapping with the keys `filepath` and `filename`.
        image_placeholder: forwarded to `process_pdf_page`.

    Returns:
        Tuple `(pages_processed, output_path)`. Errors while opening or
        iterating the document are logged, not raised.
    """
    console = Console()  # one console per call/thread
    filepath = row['filepath']
    filename = row['filename']
    output_path = create_output_folder(filename)
    pages_processed = 0

    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(str(filepath)) as doc:
            total_pages = len(doc)
            progress_bar = st.progress(0)
            for page_number, page in enumerate(doc):
                if process_pdf_page(page, output_path, page_number, filename, console, image_placeholder):
                    pages_processed += 1
                progress_bar.progress((page_number + 1) / total_pages)

        if LOG_PROCESS:
            st.success(f"✅ '{filename}' concluído. {pages_processed} de {total_pages} páginas processadas.")
    except Exception as e:
        logging.exception(f"🚨 Erro ao processar o PDF '{filename}': {e}")

    return pages_processed, output_path


def parallel_pdf_processing(df, image_placeholder):
    """Process all PDFs of `df` in a thread pool and record the results.

    Args:
        df: DataFrame as produced by `create_dataframe`; its
            `pages_processed` column is updated in place.
        image_placeholder: forwarded to `process_pdf`.

    Returns:
        Tuple `(df, processed_paths)` where `processed_paths` maps each
        successfully processed filename to its output folder.
    """
    logging.info(f"🚀 Iniciando o processamento paralelo com {MAX_WORKERS} threads.")

    # Capture the Streamlit script context of the calling thread so worker
    # threads can attach to it; without it their st.* calls are discarded.
    ctx = get_script_run_ctx() if get_script_run_ctx is not None else None

    def _worker(row):
        if ctx is not None and add_script_run_ctx is not None:
            add_script_run_ctx(threading.current_thread(), ctx)
        return process_pdf(row, image_placeholder)

    processed_paths = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Remember which DataFrame row each future belongs to: futures
        # complete in arbitrary order, so a running counter would write
        # results to the wrong rows.
        future_to_index = {
            executor.submit(_worker, row): index for index, row in df.iterrows()
        }

        with st.spinner("Processando PDFs..."):
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    pages_processed, output_path = future.result()
                except Exception as e:
                    logging.exception(f"❌ Erro no processamento do PDF: {e}")
                    continue
                df.loc[index, 'pages_processed'] = pages_processed
                processed_paths[df.loc[index, 'filename']] = output_path

    logging.info("🏁 Processamento paralelo concluído.")
    return df, processed_paths


def display_summary(df):
    """Show a table with the number of pages processed per file."""
    st.markdown("### 📊 Resumo do Processamento")
    st.dataframe(df[['filename', 'pages_processed']])


def clean_extracted_folders():
    """Delete `EXTRACTED_FOLDER` when `REMOVE_EXTRACTED` is enabled."""
    if not REMOVE_EXTRACTED:
        logging.info("⚠️ Remoção da pasta de extração desabilitada")
        return
    logging.info(f"🧹 Limpando pasta de extração: {EXTRACTED_FOLDER}")
    shutil.rmtree(EXTRACTED_FOLDER, ignore_errors=True)
    logging.info("✅ Pasta de extração limpa.")


def get_resource_usage():
    """Return `(cpu_percent, memory_percent)` as reported by psutil."""
    return psutil.cpu_percent(), psutil.virtual_memory().percent


def display_initialization_info():
    """Render the start-up information (libraries, models, resources)."""
    st.markdown("## 🚀 Iniciando o Processamento de PDFs 🚀", unsafe_allow_html=True)

    st.markdown("### 📚 Bibliotecas carregadas:")
    st.markdown(" - `fitz` 📄 (PyMuPDF)")
    st.markdown(" - `Pillow` 🖼️ (PIL)")
    st.markdown(" - `rich` 🎨 (Console Ricos)")
    st.markdown(" - `tqdm` ⏳ (Barra de Progresso)")
    st.markdown(" - `scikit-image` 🔬 (Processamento de Imagem)")
    st.markdown(" - `numpy` 🔢 (Arrays Numéricos)")
    st.markdown(" - `scipy` 📈 (Filtro Gaussiano)")
    st.markdown(" - `easyocr` 👁️ (OCR)")
    st.markdown(" - `transformers` 🤖 (Modelos NLP)")
    st.markdown(" - `torch` 🔥 (Tensor Engine)")
    st.markdown(" - `psutil` ⚙️ (Monitor de Recursos)")

    st.markdown("### 🤖 Modelos inicializados:")
    st.markdown(" - `BERT` 🧠 (Modelo de Embedding de Texto)")
    st.markdown(" - `EasyOCR` 👁️ (Modelo de OCR)")

    cpu_percent, memory_usage = get_resource_usage()
    st.markdown("### ⚙️ Recursos:")
    st.markdown(f" - 🎛️ CPU: `{cpu_percent:.2f}%`")
    st.markdown(f" - 💾 Memória: `{memory_usage:.2f}%`")

    st.markdown("### 🔀 Pipelines Multi-Thread:")
    st.markdown(f" - 🧵 Máximo de Threads: `{MAX_WORKERS}`")
    st.markdown(" - 🔄 Processamento Paralelo Ativo")


def create_zip_archive(output_paths, zip_name = "processed_images.zip"):
    """Pack all processed images into an in-memory ZIP archive.

    Args:
        output_paths: mapping of PDF filename -> folder (`Path`) with images.
        zip_name: unused here; kept for interface compatibility.

    Returns:
        `BytesIO` positioned at the start of the archive. Inside the archive
        each image is stored as `<pdf stem>/<image name>`.
    """
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for filename, output_path in output_paths.items():
            folder = Path(filename).stem
            for image_file in output_path.glob(f"*.{IMAGE_FORMAT}"):
                zipf.write(image_file, arcname=f"{folder}/{image_file.name}")
    zip_buffer.seek(0)  # rewind so the caller can read from the beginning
    return zip_buffer


def main():
    """Coordinate the whole flow: upload, processing, summary and download."""
    start_time = time.time()
    display_initialization_info()

    uploaded_file = st.file_uploader("Carregue um arquivo PDF", type=['pdf'])
    image_placeholder = st.empty()  # placeholder handed to the page renderer

    if uploaded_file is None:
        return

    # Every upload is stored as temp.pdf and therefore written to the same
    # output folder; clear earlier results first so pages left over from a
    # previous, longer PDF do not end up in the new ZIP.
    clean_extracted_folders()

    # Store the upload on disk so PyMuPDF can open it by path.
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())

    try:
        df = create_dataframe([Path("temp.pdf")])
        df, processed_paths = parallel_pdf_processing(df, image_placeholder)

        display_summary(df)

        zip_buffer = create_zip_archive(processed_paths)
        st.download_button(
            label="Baixar Imagens Processadas (ZIP)",
            data=zip_buffer,
            file_name="processed_images.zip",
            mime="application/zip",
        )
    finally:
        # Remove the temporary file even when processing fails.
        if os.path.exists("temp.pdf"):
            os.remove("temp.pdf")

    duration = time.time() - start_time
    st.success(f"🎉 Processo Concluído em {duration:.2f} segundos!", icon="🎉")

    cpu_percent, memory_usage = get_resource_usage()
    st.markdown("### ⚙️ Consumo Final de Recursos:")
    st.markdown(f" - 🎛️ CPU: `{cpu_percent:.2f}%`")
    st.markdown(f" - 💾 Memória: `{memory_usage:.2f}%`")


if __name__ == "__main__":
    main()