alsojulha committed
Commit 5b17ef7 · verified · 1 Parent(s): 74305bb

Update app.py

Files changed (1): app.py (+37 -35)
app.py CHANGED
@@ -5,13 +5,14 @@ import gradio as gr
 import unicodedata
 import torch
 import shutil
+import tempfile
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from langchain_community.document_loaders import PyMuPDFLoader
 
 # Model configuration
 MODEL_PATH = "numind/NuExtract-1.5"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
-device = "cuda"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH,
     torch_dtype=torch.float16  # Use FP16 to reduce VRAM usage
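
Note: the new device line makes the module safe on CPU-only hosts, but this hunk still loads the model in float16 unconditionally and never moves it with .to(device). A minimal sketch of a device-aware load (the dtype switch is a suggestion, not part of this commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "numind/NuExtract-1.5"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# FP16 halves weight memory on GPU; on CPU, float32 is the safer default.
dtype = torch.float16 if device.type == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=dtype).to(device)
model.eval()  # inference only: disables dropout and other train-time behavior
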
@@ -68,14 +69,21 @@ def structure_text(text):
     return structured_data
 
 def generate_text(prompt):
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect GPU or CPU
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_SIZE).to(device)  # Move inputs to the GPU
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_SIZE).to(device)
 
     with torch.no_grad():
-        output = model.generate(**inputs, max_new_tokens=512)
+        output = model.generate(
+            **inputs,
+            max_new_tokens=512,  # Limit the response length
+            pad_token_id=tokenizer.eos_token_id  # Avoid generation errors
+        )
+
+    torch.cuda.empty_cache()  # Free VRAM after generation
 
     return tokenizer.decode(output[0], skip_special_tokens=True)
 
+
+
 def process_chunk(text, template, current):
     input_text = f"### Template:\n{template}\n### Current:\n{current}\n### Text:\n{text}\n"
     output_text = generate_text(input_text)
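
Note: pad_token_id=tokenizer.eos_token_id silences the missing-pad-token warning many causal LMs emit, and torch.cuda.empty_cache() releases cached allocator blocks between requests. One detail the hunk keeps: tokenizer.decode(output[0], ...) returns the prompt plus the completion. If only the completion is wanted, a sketch (not part of this commit) that also guards the CUDA call on CPU:

def generate_text(prompt):
    # tokenizer, model, device and MAX_INPUT_SIZE are module-level names in app.py
    inputs = tokenizer(prompt, return_tensors="pt",
                       truncation=True, max_length=MAX_INPUT_SIZE).to(device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,  # deterministic decoding suits structured extraction
            pad_token_id=tokenizer.eos_token_id,
        )
    if device.type == "cuda":
        torch.cuda.empty_cache()  # only touch the CUDA allocator on GPU
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
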
@@ -84,8 +92,8 @@ def process_chunk(text, template, current):
         parsed_output = json.loads(output_text)
         return json.dumps(parsed_output, indent=2, ensure_ascii=False)
     except json.JSONDecodeError as e:
-        print("Erro ao decodificar JSON:", e)
-        return output_text
+        print(f"[Erro JSON] {e}: {output_text}")  # Log the error for debugging
+        return json.dumps({"erro": "Saída inválida do modelo", "output_bruto": output_text}, indent=2, ensure_ascii=False)
 
 def handle_broken_outputs(pred, prev):
     try:
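
Note: returning a JSON error envelope instead of the raw model output keeps downstream json.loads calls from crashing, though the envelope then becomes the `current` state for the next chunk. A best-effort recovery pass before giving up is another option; a sketch (the helper name is illustrative, not from this commit):

import json

def try_parse_model_json(output_text):
    """Strict parse first, then fall back to the outermost {...} span."""
    try:
        return json.loads(output_text)
    except json.JSONDecodeError:
        start, end = output_text.find("{"), output_text.rfind("}")
        if 0 <= start < end:
            try:
                return json.loads(output_text[start:end + 1])
            except json.JSONDecodeError:
                pass
    return None  # caller decides how to handle an unrecoverable output
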
@@ -96,56 +104,50 @@ def handle_broken_outputs(pred, prev):
     return pred
 
 
-
 def send_chunk_to_model(text, template, current):
     """Send a chunk of text to the local model and process the output."""
     input_text = f"<|input|>\n### Template:\n{template}\n### Current:\n{current}\n### Text:\n{text}\n\n<|output|>" + "{"
     output_text = process_chunk(input_text, template, current)
 
-    return handle_broken_output(output_text, current)
+    return handle_broken_outputs(output_text, current)
 
 
 def process_and_generate(pdf_file):
     if not pdf_file:
         return "Nenhum arquivo enviado."
 
-    # Temporary path for saving the file
-    pdf_path = "/tmp/uploaded_file.pdf"
-
-    # Copy the file contents to that path
-    shutil.copy(pdf_file, pdf_path)
-
-    # Extract the text from the PDF
-    extracted_text = extract_text_from_pdf(pdf_path)
-    if not extracted_text:
-        return "Falha ao extrair texto do PDF."
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+        tmp_file.write(pdf_file.read())  # Write the upload directly
+        tmp_file.flush()  # Make sure the data hits disk
+        pdf_path = tmp_file.name
 
-    # Structure the extracted data
-    structured_data = structure_text(extracted_text)
-
-    # Load the template
-    template = json.dumps(load_template(), ensure_ascii=False)
-    current = json.dumps(structured_data, ensure_ascii=False)
+    try:
+        extracted_text = extract_text_from_pdf(pdf_path)
+        if not extracted_text:
+            return "Falha ao extrair texto do PDF."
 
-    # Split the extracted text into smaller chunks
-    chunks = split_document(extracted_text)
+        structured_data = structure_text(extracted_text)
+        template = json.dumps(load_template(), ensure_ascii=False)
+        current = json.dumps(structured_data, ensure_ascii=False)
+        chunks = split_document(extracted_text)
 
-    # Process each chunk with the model
-    for chunk in chunks:
-        current = send_chunk_to_model(chunk, template, current)
+        for chunk in chunks:
+            current = send_chunk_to_model(chunk, template, current)
 
-    # Return the formatted JSON
-    try:
         return json.dumps(json.loads(current), indent=2, ensure_ascii=False)
-    except json.JSONDecodeError:
-        return "Erro ao processar os dados gerados pelo modelo."
+    except Exception as e:
+        return f"Erro durante o processamento: {e}"
+    finally:
+        os.remove(pdf_path)  # Remove the temporary file after use
+
 
 
 interface = gr.Interface(
     fn=process_and_generate,
     inputs=gr.File(label="Upload PDF"),
-    outputs="text",
+    outputs=gr.JSON(label="Dados Extraídos"),
     title="Extração de Dados com Modelo Local",
-    description="Extrai texto de PDFs e processa utilizando um modelo local de linguagem."
+    description="Envie um PDF para extrair e processar informações automaticamente.",
 )
+
 interface.launch()
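
Note: two things worth verifying in this hunk. First, os.remove(pdf_path) in the finally block needs `import os`; the diff starts at line 5, so this only works if `os` is already imported above the first hunk. Second, pdf_file.read() assumes Gradio passes a file-like object, but depending on the Gradio version gr.File may instead deliver a filepath string or a wrapper exposing .name. Also visible in the context lines: send_chunk_to_model builds the <|input|>...<|output|> prompt and then process_chunk wraps it in a second "### Template:" block, so the model receives the template twice; that predates this commit but is worth a follow-up. A defensive sketch covering the three upload cases (the helper name is illustrative):

import os
import shutil
import tempfile

def save_upload_to_tmp(pdf_file):
    """Persist a Gradio upload to a temporary .pdf and return its path."""
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        if isinstance(pdf_file, str):        # filepath string (newer Gradio default)
            shutil.copyfile(pdf_file, tmp.name)
        elif hasattr(pdf_file, "read"):      # file-like object, as this commit assumes
            tmp.write(pdf_file.read())
        else:                                # tempfile wrapper exposing .name
            shutil.copyfile(pdf_file.name, tmp.name)
    finally:
        tmp.close()
    return tmp.name  # caller is responsible for os.remove() when done
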
 
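Note: the output component changes to gr.JSON, but the callback still returns pre-serialized strings (json.dumps(...)) and bare messages such as "Nenhum arquivo enviado.". Depending on the Gradio version, gr.JSON may try to parse a returned string, so a bare message can itself fail to render. A sketch that normalizes every return value into something JSON-renderable (the "mensagem" key is illustrative, not from this commit):

import json

def as_json_result(value):
    """Normalize callback results for a gr.JSON output component."""
    if isinstance(value, (dict, list)):
        return value                     # already JSON-renderable
    try:
        return json.loads(value)         # pre-serialized model output
    except (TypeError, ValueError):
        return {"mensagem": str(value)}  # plain status or error message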