import json
import os
import re
import gradio as gr
import unicodedata
import torch
import shutil
import tempfile
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain_community.document_loaders import PyMuPDFLoader
# Model configuration: load the NuExtract-1.5 tokenizer and weights once at import time.
MODEL_PATH = "numind/NuExtract-1.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=torch.float16  # FP16 halves VRAM usage versus FP32
).to(device)
torch.cuda.empty_cache()
# Constants.
# NOTE(review): MAX_INPUT_SIZE is used as a *token* cap in generate_text but as a
# *word-count* window in split_document — different units; confirm this is intended.
MAX_INPUT_SIZE = 4000
OVERLAP = 128
TEMPLATE_PATH = "template.json"
def load_template():
    """Read the JSON extraction template from TEMPLATE_PATH and return it parsed."""
    with open(TEMPLATE_PATH, encoding="utf-8") as handle:
        return json.load(handle)
def extract_text_from_pdf(pdf_file_path):
    """Load a PDF via PyMuPDF and return the stripped text of all pages joined by newlines.

    Raises FileNotFoundError when *pdf_file_path* does not exist.
    """
    if not os.path.exists(pdf_file_path):
        raise FileNotFoundError(f"Arquivo PDF não encontrado: {pdf_file_path}")
    documents = PyMuPDFLoader(pdf_file_path).load()
    pages = [doc.page_content.strip() for doc in documents]
    return "\n".join(pages)
def split_document(document, window_size=MAX_INPUT_SIZE, overlap=OVERLAP):
    """Split *document* into word-based windows of at most *window_size* words.

    Consecutive windows share *overlap* words. A document that fits inside one
    window is returned unchanged as a single-element list.
    """
    words = document.split()
    if len(words) <= window_size:
        return [document]
    chunks = []
    step = window_size - overlap
    for start in range(0, len(words), step):
        window = words[start:start + window_size]
        chunks.append(" ".join(window))
        # Stop once the current window reaches the end of the document.
        if start + len(window) >= len(words):
            break
    return chunks
def structure_text(text):
    """Heuristically organize raw text into a title, an author, and sections.

    Returns a dict with keys "titulo", "autor" and "conteudo" (a list of
    {"secao": ..., "conteudo": [...]} entries). The title is the first
    title-cased line longer than 5 characters; the author is the first line
    matching a "Firstname Lastname" pattern; section headers are short
    all-uppercase lines. Lines before the first section header are dropped.
    """
    result = {"titulo": "", "autor": "", "conteudo": []}
    section = None
    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        if not result["titulo"] and len(stripped) > 5 and stripped.istitle():
            result["titulo"] = stripped
        elif not result["autor"] and re.search(r"\b[A-Z][a-z]+ [A-Z][a-z]+", stripped):
            result["autor"] = stripped
        elif len(stripped) < 60 and stripped.isupper():
            section = {"secao": stripped, "conteudo": []}
            result["conteudo"].append(section)
        elif section is not None:
            section["conteudo"].append(stripped)
    return result
def generate_text(prompt):
    """Tokenize *prompt*, generate up to 512 new tokens with the local model,
    and return the full decoded sequence (the prompt echo is included)."""
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_SIZE,
    ).to(device)
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=512,  # cap the response length
            pad_token_id=tokenizer.eos_token_id,  # avoid errors when no pad token is set
        )
    torch.cuda.empty_cache()  # release VRAM held by the generation pass
    return tokenizer.decode(generated[0], skip_special_tokens=True)
def process_chunk(text, template, current):
    """Run one fully-formatted NuExtract prompt through the model and parse its JSON answer.

    *text* is already the complete prompt built by send_chunk_to_model
    (Template/Current/Text sections plus the "<|output|>{" suffix). The
    original re-wrapped it in a second Template/Current/Text envelope,
    nesting the prompt inside itself, and then tried to json-parse the full
    decoded sequence — which still echoes the prompt and therefore never
    parsed. Fix: send the prompt as-is and parse only the text after the
    final "<|output|>" marker (the "{" appended to the prompt makes the
    answer start as valid JSON).

    *template* and *current* are kept in the signature for caller compatibility.
    Returns pretty-printed JSON, or an error payload when parsing fails.
    """
    raw_output = generate_text(text)
    # The decoded sequence echoes the prompt; the model's answer begins after
    # the last "<|output|>" marker, continuing the "{" already in the prompt.
    output_text = "{" + raw_output.split("<|output|>")[-1].lstrip().removeprefix("{")
    try:
        parsed_output = json.loads(output_text)
        return json.dumps(parsed_output, indent=2, ensure_ascii=False)
    except json.JSONDecodeError as e:
        print(f"[Erro JSON] {e}: {output_text}")  # Log do erro para depuração
        return json.dumps({"erro": "Saída inválida do modelo", "output_bruto": output_text}, indent=2, ensure_ascii=False)
def handle_broken_outputs(pred, prev):
    """Fall back to *prev* when *pred* is empty or not valid JSON.

    A prediction counts as empty when every top-level value is "" or [].
    """
    try:
        values = json.loads(pred).values()
    except json.JSONDecodeError:
        return prev  # invalid JSON: keep the previous state
    if all(v in ("", []) for v in values):
        return prev  # all-empty output: keep the previous state
    return pred
def send_chunk_to_model(text, template, current):
    """Build the NuExtract prompt for one chunk, run it, and keep the previous
    state when the model's output is empty or malformed."""
    prompt = (
        "<|input|>\n"
        f"### Template:\n{template}\n"
        f"### Current:\n{current}\n"
        f"### Text:\n{text}\n\n"
        "<|output|>{"  # trailing "{" forces the answer to start as JSON
    )
    prediction = process_chunk(prompt, template, current)
    return handle_broken_outputs(prediction, current)
def process_and_generate(pdf_file):
    """Gradio handler: copy the uploaded PDF to a temp file, extract and
    structure its text, then refine the extraction JSON chunk by chunk.

    Returns the final pretty-printed JSON string, or an error message string.
    The original read the whole upload into memory to copy it; shutil.copyfile
    (shutil is already imported at the top of this file) streams it instead.
    """
    if not pdf_file:
        return "Nenhum arquivo enviado."
    # Work on a private copy so the upload can be cleaned up independently.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        pdf_path = tmp_file.name
    shutil.copyfile(pdf_file.name, pdf_path)  # streamed copy, constant memory
    try:
        extracted_text = extract_text_from_pdf(pdf_path)
        if not extracted_text:
            return "Falha ao extrair texto do PDF."
        structured_data = structure_text(extracted_text)
        template = json.dumps(load_template(), ensure_ascii=False)
        current = json.dumps(structured_data, ensure_ascii=False)
        # Each chunk refines "current", carrying extraction state across windows.
        for chunk in split_document(extracted_text):
            current = send_chunk_to_model(chunk, template, current)
        return json.dumps(json.loads(current), indent=2, ensure_ascii=False)
    except Exception as e:
        return f"Erro durante o processamento: {e}"
    finally:
        os.remove(pdf_path)  # always remove the temporary copy
# Gradio UI: one PDF upload in, extracted JSON out.
interface = gr.Interface(
fn=process_and_generate,
inputs=gr.File(label="Upload PDF"),
outputs=gr.JSON(label="Dados Extraídos"),
title="Extração de Dados com Modelo Local",
description="Envie um PDF para extrair e processar informações automaticamente.",
)
# NOTE(review): process_and_generate returns plain strings on error, but the
# output component is gr.JSON — error messages will render as a bare JSON string.
interface.launch()