Spaces:

caarleexx
/

ToM

Sleeping

App Files Files Community

caarleexx committed on Dec 6, 2025

Commit

1dacd2d

verified ·

1 Parent(s): 3ba9006

Update app.py

Browse files

Files changed (1) hide show

app.py +158 -120

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # ╔════════════════════════════════════════════════════════════════════════════╗
-# ║  PIPELINE V44: FRAG + VISÃO PAGINADA + PARALELISMO + CACHE + AUDITORIA     ║
 # ╚════════════════════════════════════════════════════════════════════════════╝
 import os
@@ -11,7 +11,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 import gradio as gr
 import google.generativeai as genai
-import pypdf  # pip install pypdf
 # ==================== 1. CONFIGURAÇÃO ====================
@@ -19,12 +20,15 @@ api_key = os.getenv("GOOGLE_API_KEY", "SUA_API_KEY_AQUI")
 if api_key and api_key != "SUA_API_KEY_AQUI":
     genai.configure(api_key=api_key)
 model_flash = genai.GenerativeModel("gemini-flash-latest")
 model_pro   = genai.GenerativeModel("gemini-pro-latest")
 ARQUIVO_CONFIG = "protocolo_fragmentacao_visao-3.json"
 PASTA_CACHE = "cache_processamento"
-MAX_WORKERS = 5  # Paralelismo
 os.makedirs(PASTA_CACHE, exist_ok=True)
@@ -41,14 +45,13 @@ def carregar_protocolo():
     except:
         proto = [
             {
-                "nome": "PAGINADOR_VISUAL",
                 "missao": (
-                    "Transcreva"
-                    "Separe por página e devolva uma lista JSON com objetos "
-                    "{'pagina','transcricao'}."
-                    "Retorne APENAS essa lista JSON, sem texto extra."
                 ),
-                "tipo_saida": "json",
                 "modelo": "pro",
             }
         ]
@@ -78,10 +81,27 @@ def carregar_cache(hash_id):
             return json.load(f)
     return None
-# --------- DIVISÃO PDF ---------
 def ler_anexo_e_fragmentar(arquivo, paginas_por_fragmento=5, logs=""):
-    logs = log_point("ler_anexo_e_fragmentar() chamado", logs)
     if arquivo is None:
         return [], "", logs
@@ -91,50 +111,72 @@ def ler_anexo_e_fragmentar(arquivo, paginas_por_fragmento=5, logs=""):
     if not os.path.exists(filename):
         return [], f"[ERRO: Arquivo não encontrado]", logs
-    anexo_info = f"[PDF: {os.path.basename(filename)}]"
     if not filename.lower().endswith(".pdf"):
         logs = log_point("Arquivo texto simples detectado", logs)
         try:
             with open(filename, "r", encoding="utf-8") as f:
                 texto = f.read()
-            # Retorna como um único fragmento de texto
             return [texto], f"[TXT: {os.path.basename(filename)}]", logs
         except:
             return [], "[ERRO LEITURA TXT]", logs
     try:
-        reader = pypdf.PdfReader(filename)
-        total_pages = len(reader.pages)
-        logs = log_point(f"PDF carregado: {total_pages} páginas", logs)
         fragments = []
         for i in range(0, total_pages, paginas_por_fragmento):
-            start = i + 1
             end = min(i + paginas_por_fragmento, total_pages)
-            bloco_texto = ""
-            for p in range(i, end):
-                try:
-                    t = reader.pages[p].extract_text() or ""
-                except Exception as e:
-                    t = f"\n[ERRO_EXTRACT_PAG_{p+1}: {e}]\n"
-                bloco_texto += f"\n=== PAGINA {p+1}/{total_pages} ===\n{t}\n"
             fragment = (
                 f"=== FRAG {i//paginas_por_fragmento + 1} "
-                f"(PÁGS {start}-{end}/{total_pages}) ===\n"
-                f"{bloco_texto.strip()}"
             )
             fragments.append(fragment)
         logs = log_point(f"Total de fragmentos criados: {len(fragments)}", logs)
         return fragments, anexo_info, logs
     except Exception as e:
-        logs = log_point(f"ERRO PDF: {e}", logs)
-        return [], f"[ERRO PDF: {str(e)}]", logs
-# ==================== 3. ENGINE DE EXECUÇÃO ====================
 def _extrair_json_possivel(out_raw: str) -> str:
     cleaned = out_raw.strip()
@@ -151,7 +193,7 @@ def _extrair_json_possivel(out_raw: str) -> str:
 def executar_no(timeline, config, fragmento_input=None):
     """
-    Função Worker que será chamada tanto sequencialmente quanto em paralelo.
     """
     modelo = model_pro if config.get("modelo") == "pro" else model_flash
@@ -176,56 +218,67 @@ def executar_no(timeline, config, fragmento_input=None):
                 out = resp.text or ""
                 break
             except Exception as e:
-                if "429" in str(e):
                     time.sleep(2 * (tentativa + 1))
                     continue
                 raise e
         content = out
-        if config["tipo_saida"] == "json":
             cleaned = _extrair_json_possivel(out)
             try:
                 content = json.loads(cleaned)
             except:
-                content = [] # Fallback em caso de erro de parse
         return {"role": "assistant", "agent": config["nome"], "content": content}, None
     except Exception as e:
         return {"role": "system", "error": str(e)}, str(e)
-# ==================== 4. ORQUESTRADOR ====================
 def orquestrador(texto, arquivo, history, json_config, confext_state):
-    logs = f"🚀 START: {datetime.now().strftime('%H:%M:%S')}\n"
-    logs = log_point("Orquestrador V44 iniciado", logs)
     # 1. Preparação
     if history is None: history = []
     nome_arquivo = os.path.basename(getattr(arquivo, "name", "sem_arquivo")) if arquivo else "sem_arquivo"
-    hash_op = gerar_hash_arquivo(nome_arquivo + json_config) # Hash baseado no arquivo + protocolo
     # 2. Verifica Cache
     cache_existente = carregar_cache(hash_op) if arquivo else None
     if cache_existente:
         logs = log_point(f"♻️ Cache encontrado para {nome_arquivo}", logs)
         confext_upload = cache_existente["confext_upload"]
         timeline = cache_existente.get("timeline", [])
-        history.append([texto, "✅ Arquivo carregado do cache! Análise pronta."])
         yield history, timeline, logs, confext_upload
-        # Se houver texto novo do usuário, seguimos para análise final, senão paramos
         if not texto:
             return
     else:
-        # 3. Processamento Normal
         fragmentos, anexo_info, logs = ler_anexo_e_fragmentar(
             arquivo, paginas_por_fragmento=5, logs=logs
         )
-        history.append([texto + (" 📎" if arquivo else ""), None])
-        yield history, {}, logs, confext_state
         try:
             protocolo = json.loads(json_config)
@@ -235,87 +288,73 @@ def orquestrador(texto, arquivo, history, json_config, confext_state):
             return
         timeline = [{"role": "user", "content": texto}]
         confext_upload = {
             "arquivo": nome_arquivo,
             "meta": anexo_info,
-            "paginas": []
         }
-        # 4. Execução Paginador (Paralela)
-        if protocolo and fragmentos:
-            cfg_visao = protocolo[0] # Assume que o primeiro é o leitor
-            logs = log_point(f"Iniciando Leitura Paralela ({MAX_WORKERS} workers) com {cfg_visao['nome']}", logs)
-            history[-1][1] = f"⏳ Fragmentando e lendo {len(fragmentos)} partes em paralelo..."
-            yield history, timeline, logs, confext_upload
-            resultados_ordenados = [None] * len(fragmentos)
-            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-                futures_map = {executor.submit(executar_no, [], cfg_visao, frag): i for i, frag in enumerate(fragmentos)}
-                concluidos = 0
-                for future in as_completed(futures_map):
-                    idx = futures_map[future]
-                    res, erro = future.result()
-                    if erro:
-                        logs = log_point(f"Erro no frag {idx}: {erro}", logs)
-                    else:
-                        resultados_ordenados[idx] = res["content"]
-                    concluidos += 1
-                    history[-1][1] = f"⏳ Leitura: {concluidos}/{len(fragmentos)} partes processadas..."
-                    yield history, timeline, logs, confext_upload
-            # Consolidar resultados ordenados
-            for pags in resultados_ordenados:
-                if pags:
-                    if isinstance(pags, list):
-                        confext_upload["paginas"].extend(pags)
-                    elif isinstance(pags, dict):
-                        confext_upload["paginas"].append(pags)
-            logs = log_point(f"Leitura concluída. Total páginas extraídas: {len(confext_upload['paginas'])}", logs)
-            # Salvar Cache após a leitura pesada
-            if arquivo:
-                salvar_cache(hash_op, {"confext_upload": confext_upload, "timeline": timeline})
-                logs = log_point("Estado salvo em Cache", logs)
-        # Injeta contexto no timeline
-        timeline.append({
-            "role": "system",
-            "agent": "CONFEXT_UPLOAD",
-            "content": confext_upload
-        })
-        # 5. Execução dos Agentes de Análise (Sequencial)
-        restante = protocolo[1:] if protocolo else []
-        for cfg in restante:
-            history[-1][1] = f"⚙️ {cfg['nome']} analisando..."
-            logs = log_point(f"Iniciando agente: {cfg['nome']}", logs)
-            yield history, timeline, logs, confext_upload
-            # Passa timeline atualizada
-            res, erro = executar_no(timeline, cfg, fragmento_input=None)
-            if erro:
-                logs = log_point(f"Erro agente {cfg['nome']}: {erro}", logs)
-            else:
-                timeline.append(res)
-                if cfg.get("tipo_saida") == "texto":
-                    history[-1][1] = res["content"]
-            yield history, timeline, logs, confext_upload
     if not texto and arquivo:
-        history[-1][1] = "✅ Documento processado e indexado. Pode fazer perguntas."
     logs = log_point("Processo Finalizado", logs)
     yield history, timeline, logs, confext_upload
-# ==================== 5. UI ====================
 def ui_clean():
     css = """
@@ -325,7 +364,7 @@ def ui_clean():
     config_init = carregar_protocolo()
-    with gr.Blocks(title="AI Forensics Auto V44", css=css, theme=gr.themes.Soft()) as app:
         confext_state = gr.State(value=None)
         with gr.Tabs():
@@ -360,18 +399,17 @@ def ui_clean():
                 def _on_upload(x):
                     nome = os.path.basename(getattr(x, "name", x))
-                    return f"📎 Anexo pronto para análise: {nome}"
                 file_in.upload(_on_upload, inputs=file_in, outputs=file_status)
-            # --- AQUI ESTÁ A ABA SOLICITADA ---
             with gr.Tab("🕵️ Auditoria & Debug"):
-                gr.Markdown("### 🧠 Processo Interno de Pensamento")
                 with gr.Row():
-                    out_dna = gr.JSON(label="Timeline da IA (Contexto)")
                     out_logs = gr.Textbox(label="Logs do Sistema", lines=20)
-                gr.Markdown("### 📂 Dados Estruturados (Confext)")
                 confext_view = gr.JSON(label="Conteúdo Extraído")
             with gr.Tab("⚙️ Config"):
@@ -393,7 +431,7 @@ def ui_clean():
             trig(
                 _orq_wrapper,
                 inputs=[txt_in, file_in, chatbot, code_json, confext_state],
-                outputs=[chatbot, out_dna, out_logs, confext_state], # Atualiza aba Debug
             ).then(
                 lambda c: (None, None, "", c)[1:],
                 inputs=confext_state,
@@ -401,7 +439,7 @@ def ui_clean():
             ).then(
                 lambda c: c,
                 inputs=confext_state,
-                outputs=confext_view, # Atualiza visualizador JSON
             )
     return app

 # ╔════════════════════════════════════════════════════════════════════════════╗
+# ║  PIPELINE V45: FRAG + OCR VISUAL (GEMINI) + AUDITORIA (HF READY)           ║
 # ╚════════════════════════════════════════════════════════════════════════════╝
 import os
 import gradio as gr
 import google.generativeai as genai
+from pdf2image import convert_from_path  # Requer poppler-utils no packages.txt
+from PIL import Image
 # ==================== 1. CONFIGURAÇÃO ====================
 if api_key and api_key != "SUA_API_KEY_AQUI":
     genai.configure(api_key=api_key)
+# Modelos
+# Flash: Usado para OCR rápido (Visual) e tarefas simples
 model_flash = genai.GenerativeModel("gemini-flash-latest")
+# Pro: Usado para raciocínio complexo no pipeline
 model_pro   = genai.GenerativeModel("gemini-pro-latest")
 ARQUIVO_CONFIG = "protocolo_fragmentacao_visao-3.json"
 PASTA_CACHE = "cache_processamento"
+MAX_WORKERS = 4  # Ajustado para evitar Rate Limit do Gemini no OCR
 os.makedirs(PASTA_CACHE, exist_ok=True)
     except:
         proto = [
             {
+                "nome": "ANALISTA_PRINCIPAL",
                 "missao": (
+                    "Analise o conteúdo transcrito. "
+                    "Identifique datas, nomes e o objetivo do documento. "
+                    "Retorne um resumo estruturado."
                 ),
+                "tipo_saida": "texto",
                 "modelo": "pro",
             }
         ]
             return json.load(f)
     return None
+# ==================== 3. ENGINE OCR (SUBSTITUI PYPDF) ====================
+def transcrever_pagina_imagem(imagem, indice):
+    """Função auxiliar para transcrever uma única imagem via Gemini Vision"""
+    try:
+        prompt_ocr = (
+            "Atue como um sistema OCR. Transcreva fielmente todo o texto desta imagem. "
+            "Se houver tabelas, represente-as em Markdown. "
+            "Não faça comentários, apenas retorne o texto."
+        )
+        response = model_flash.generate_content([prompt_ocr, imagem])
+        texto = response.text if response.text else "[Página vazia ou ilegível]"
+        return indice, f"=== PÁGINA {indice} ===\n{texto}\n"
+    except Exception as e:
+        return indice, f"=== PÁGINA {indice} (ERRO OCR) ===\nErro: {str(e)}\n"
 def ler_anexo_e_fragmentar(arquivo, paginas_por_fragmento=5, logs=""):
+    """
+    V45: Converte PDF em Imagens -> Gemini Vision OCR -> Fragmentos de Texto.
+    """
+    logs = log_point("ler_anexo_e_fragmentar (OCR V45) chamado", logs)
     if arquivo is None:
         return [], "", logs
     if not os.path.exists(filename):
         return [], f"[ERRO: Arquivo não encontrado]", logs
+    anexo_info = f"[DOC: {os.path.basename(filename)}]"
+    # 1. Se for TXT/MD simples
     if not filename.lower().endswith(".pdf"):
         logs = log_point("Arquivo texto simples detectado", logs)
         try:
             with open(filename, "r", encoding="utf-8") as f:
                 texto = f.read()
             return [texto], f"[TXT: {os.path.basename(filename)}]", logs
         except:
             return [], "[ERRO LEITURA TXT]", logs
+    # 2. Processamento PDF com OCR (Vision)
     try:
+        logs = log_point("Convertendo PDF em imagens (pdf2image)...", logs)
+        # Importante: No HuggingFace, o poppler deve estar instalado via packages.txt
+        imagens = convert_from_path(filename)
+        total_pages = len(imagens)
+        logs = log_point(f"PDF convertido: {total_pages} páginas (imagens). Iniciando OCR...", logs)
+        # Transcrição Paralela das Imagens
+        textos_paginas = [""] * total_pages
+        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            # Submete tarefas
+            futuros = {executor.submit(transcrever_pagina_imagem, img, i+1): i for i, img in enumerate(imagens)}
+            completed_count = 0
+            for future in as_completed(futuros):
+                idx, texto_transcrito = future.result()
+                # O índice retornado é baseado em 1, ajustamos para lista 0-based
+                textos_paginas[idx-1] = texto_transcrito
+                completed_count += 1
+                if completed_count % 2 == 0:
+                    print(f"OCR Progresso: {completed_count}/{total_pages}")
+        logs = log_point("OCR concluído. Agrupando fragmentos...", logs)
+        # 3. Agrupar páginas transcritas em fragmentos
         fragments = []
+        full_ocr_text = "" # Opcional: manter tudo junto se precisar
         for i in range(0, total_pages, paginas_por_fragmento):
+            start = i
             end = min(i + paginas_por_fragmento, total_pages)
+            bloco_texto = "\n".join(textos_paginas[start:end])
             fragment = (
                 f"=== FRAG {i//paginas_por_fragmento + 1} "
+                f"(PÁGS {start+1}-{end}/{total_pages}) ===\n"
+                f"{bloco_texto}"
             )
             fragments.append(fragment)
         logs = log_point(f"Total de fragmentos criados: {len(fragments)}", logs)
         return fragments, anexo_info, logs
     except Exception as e:
+        err_msg = f"ERRO CRÍTICO OCR: {str(e)}"
+        if "poppler" in str(e).lower():
+            err_msg += " (DICA: Verifique se poppler-utils está instalado no sistema/packages.txt)"
+        logs = log_point(err_msg, logs)
+        return [], f"[ERRO: {err_msg}]", logs
+# ==================== 4. ENGINE DE EXECUÇÃO (PIPELINE) ====================
 def _extrair_json_possivel(out_raw: str) -> str:
     cleaned = out_raw.strip()
 def executar_no(timeline, config, fragmento_input=None):
     """
+    Função Worker que executa a análise lógica sobre o texto já extraído.
     """
     modelo = model_pro if config.get("modelo") == "pro" else model_flash
                 out = resp.text or ""
                 break
             except Exception as e:
+                if "429" in str(e): # Rate limit
                     time.sleep(2 * (tentativa + 1))
                     continue
                 raise e
         content = out
+        if config.get("tipo_saida") == "json":
             cleaned = _extrair_json_possivel(out)
             try:
                 content = json.loads(cleaned)
             except:
+                content = [] # Fallback
         return {"role": "assistant", "agent": config["nome"], "content": content}, None
     except Exception as e:
         return {"role": "system", "error": str(e)}, str(e)
+# ==================== 5. ORQUESTRADOR ====================
 def orquestrador(texto, arquivo, history, json_config, confext_state):
+    logs = f"🚀 START V45 (OCR): {datetime.now().strftime('%H:%M:%S')}\n"
+    logs = log_point("Orquestrador iniciado", logs)
     # 1. Preparação
     if history is None: history = []
     nome_arquivo = os.path.basename(getattr(arquivo, "name", "sem_arquivo")) if arquivo else "sem_arquivo"
+    # O hash agora considera o arquivo físico para evitar refazer OCR caro
+    hash_op = gerar_hash_arquivo(nome_arquivo + "V45_OCR")
     # 2. Verifica Cache
     cache_existente = carregar_cache(hash_op) if arquivo else None
+    fragmentos = []
     if cache_existente:
         logs = log_point(f"♻️ Cache encontrado para {nome_arquivo}", logs)
         confext_upload = cache_existente["confext_upload"]
         timeline = cache_existente.get("timeline", [])
+        fragmentos = cache_existente.get("fragmentos_cached", [])
+        history.append([texto, "✅ Arquivo carregado do cache! OCR já realizado."])
         yield history, timeline, logs, confext_upload
         if not texto:
             return
     else:
+        # 3. Processamento: OCR via Gemini Vision
+        history.append([texto + (" 📎" if arquivo else ""), "⏳ Lendo documento (OCR com Gemini Vision)... isso pode levar alguns segundos."])
+        yield history, {}, logs, confext_state
         fragmentos, anexo_info, logs = ler_anexo_e_fragmentar(
             arquivo, paginas_por_fragmento=5, logs=logs
         )
+        if not fragmentos and arquivo:
+            history[-1][1] = "❌ Falha ao ler o arquivo. Verifique se é um PDF válido."
+            yield history, {}, logs, confext_state
+            return
+        logs = log_point("Texto extraído via OCR com sucesso.", logs)
         try:
             protocolo = json.loads(json_config)
             return
         timeline = [{"role": "user", "content": texto}]
+        # Cria estrutura inicial de dados
         confext_upload = {
             "arquivo": nome_arquivo,
             "meta": anexo_info,
+            "conteudo_ocr": fragmentos # Salva o texto bruto aqui
         }
+        # Salva Cache logo após o OCR (que é a parte cara/demorada)
+        if arquivo:
+            salvar_cache(hash_op, {
+                "confext_upload": confext_upload,
+                "timeline": timeline,
+                "fragmentos_cached": fragmentos
+            })
+            logs = log_point("OCR salvo em Cache", logs)
+        history[-1][1] = f"✅ OCR Concluído. Texto extraído. Iniciando análise..."
+        yield history, timeline, logs, confext_upload
+    # 4. Injeta contexto extraído no timeline para os agentes lerem
+    timeline_context = timeline.copy()
+    timeline_context.append({
+        "role": "system",
+        "agent": "SYSTEM_OCR",
+        "content": f"Conteúdo do Documento (Extraído via OCR):\n{json.dumps(fragmentos, ensure_ascii=False)}"
+    })
+    # 5. Execução dos Agentes de Análise (Baseado no Protocolo)
+    if not json_config: return
+    try:
+        protocolo = json.loads(json_config)
+    except:
+        return
+    for cfg in protocolo:
+        history[-1][1] = f"⚙️ {cfg['nome']} analisando..."
+        logs = log_point(f"Iniciando agente: {cfg['nome']}", logs)
+        yield history, timeline, logs, confext_upload
+        # O agente recebe o timeline com o contexto do documento
+        res, erro = executar_no(timeline_context, cfg, fragmento_input=None)
+        if erro:
+            logs = log_point(f"Erro agente {cfg['nome']}: {erro}", logs)
+            history[-1][1] = f"❌ Erro em {cfg['nome']}: {erro}"
+        else:
+            timeline.append(res)
+            # Atualiza contexto para o próximo agente
+            timeline_context.append(res)
+            if cfg.get("tipo_saida") == "texto":
+                history[-1][1] = res["content"]
+            elif cfg.get("tipo_saida") == "json":
+                 # Se for JSON, mostra bonitinho ou apenas avisa
+                history[-1][1] = f"✅ {cfg['nome']} finalizou análise estruturada."
+        yield history, timeline, logs, confext_upload
     if not texto and arquivo:
+        history[-1][1] = "✅ Documento digitalizado e analisado."
     logs = log_point("Processo Finalizado", logs)
     yield history, timeline, logs, confext_upload
+# ==================== 6. UI ====================
 def ui_clean():
     css = """
     config_init = carregar_protocolo()
+    with gr.Blocks(title="AI Forensics V45 (OCR Edition)", css=css, theme=gr.themes.Soft()) as app:
         confext_state = gr.State(value=None)
         with gr.Tabs():
                 def _on_upload(x):
                     nome = os.path.basename(getattr(x, "name", x))
+                    return f"📎 Anexo pronto para OCR: {nome}"
                 file_in.upload(_on_upload, inputs=file_in, outputs=file_status)
             with gr.Tab("🕵️ Auditoria & Debug"):
+                gr.Markdown("### 🧠 Processo Interno")
                 with gr.Row():
+                    out_dna = gr.JSON(label="Timeline da IA")
                     out_logs = gr.Textbox(label="Logs do Sistema", lines=20)
+                gr.Markdown("### 📂 Dados Estruturados")
                 confext_view = gr.JSON(label="Conteúdo Extraído")
             with gr.Tab("⚙️ Config"):
             trig(
                 _orq_wrapper,
                 inputs=[txt_in, file_in, chatbot, code_json, confext_state],
+                outputs=[chatbot, out_dna, out_logs, confext_state],
             ).then(
                 lambda c: (None, None, "", c)[1:],
                 inputs=confext_state,
             ).then(
                 lambda c: c,
                 inputs=confext_state,
+                outputs=confext_view,
             )
     return app