alsojulha committed on
Commit
7a4df85
·
verified ·
1 Parent(s): e95a76a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -101
app.py CHANGED
@@ -1,36 +1,36 @@
1
  import json
2
- import ollama
3
  import os
4
  import re
5
  import gradio as gr
6
- from langchain_community.document_loaders import PyMuPDFLoader
7
  import unicodedata
 
 
 
 
 
 
 
 
8
 
9
  # Constantes
10
  MAX_INPUT_SIZE = 4000
11
  OVERLAP = 128
12
- TEMPLATE_PATH = "E:/TCC/Local-Gradio-APP-for-RAG/Local-Gradio-App-for-RAG/json/template.json"
13
- MODEL_NAME = "nuextract" # Certifique-se de que o modelo está carregado no Ollama
14
 
15
- # Função para carregar o template JSON
16
  def load_template():
17
  with open(TEMPLATE_PATH, "r", encoding="utf-8") as file:
18
  return json.load(file)
19
 
20
- # Função para extrair texto puro do PDF
21
  def extract_text_from_pdf(pdf_file_path):
22
  if not os.path.exists(pdf_file_path):
23
  raise FileNotFoundError(f"Arquivo PDF não encontrado: {pdf_file_path}")
24
-
25
  loader = PyMuPDFLoader(pdf_file_path)
26
  data = loader.load()
27
  return "\n".join([doc.page_content.strip() for doc in data])
28
 
29
- # Função para dividir o documento em chunks de forma otimizada
30
  def split_document(document, window_size=MAX_INPUT_SIZE, overlap=OVERLAP):
31
  words = document.split()
32
  chunks = []
33
-
34
  if len(words) > window_size:
35
  for i in range(0, len(words), window_size - overlap):
36
  chunk = " ".join(words[i:i + window_size])
@@ -39,135 +39,65 @@ def split_document(document, window_size=MAX_INPUT_SIZE, overlap=OVERLAP):
39
  break
40
  else:
41
  chunks.append(document)
42
-
43
  return chunks
44
 
45
- # Função para estruturar qualquer texto extraído
46
  def structure_text(text):
47
  lines = text.split("\n")
48
- structured_data = {
49
- "titulo": "",
50
- "autor": "",
51
- "conteudo": []
52
- }
53
-
54
  current_section = None
55
-
56
  for line in lines:
57
  line = line.strip()
58
-
59
- # Detecta títulos prováveis (primeiras linhas, letras maiúsculas, tamanho grande)
60
  if not structured_data["titulo"] and len(line) > 5 and line.istitle():
61
  structured_data["titulo"] = line
62
  continue
63
-
64
- # Detecta possíveis autores (normalmente perto do título, contém nome e sobrenome)
65
  if not structured_data["autor"] and re.search(r"\b[A-Z][a-z]+ [A-Z][a-z]+", line):
66
  structured_data["autor"] = line
67
  continue
68
-
69
- # Detecta cabeçalhos de seção (geralmente curtos, sem pontuação, em negrito ou maiúsculas)
70
  if len(line) < 60 and line.isupper():
71
- current_section = {
72
- "secao": line,
73
- "conteudo": []
74
- }
75
  structured_data["conteudo"].append(current_section)
76
  continue
77
-
78
- # Adiciona conteúdo dentro da seção correta
79
  if current_section:
80
  current_section["conteudo"].append(line)
81
-
82
  return structured_data
83
 
84
- # Função para limpar JSON mal formatado
85
- def clean_json_text(text):
86
- text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove caracteres não ASCII
87
- text = re.sub(r'\s+', ' ', text) # Remove múltiplos espaços
88
- return text.strip()
89
-
90
- def normalize_text(text):
91
- """Normaliza caracteres acentuados malformados no texto."""
92
- return unicodedata.normalize("NFC", text)
93
-
94
- def fix_json(output_text):
95
- """Corrige erros no JSON gerado pelo modelo."""
96
- # Remover qualquer coisa após <|end-output|>
97
- output_text = re.split(r"<\|end-output\|>", output_text)[0].strip()
98
-
99
- # Normalizar caracteres acentuados
100
- output_text = normalize_text(output_text)
101
 
 
 
 
102
  try:
103
- # Tentar carregar JSON diretamente
104
- parsed_json = json.loads(output_text)
105
- return json.dumps(parsed_json, indent=2, ensure_ascii=False)
106
- except json.JSONDecodeError as e:
107
- print("⚠️ Erro ao corrigir JSON:", e)
108
  return output_text
109
 
110
- # Função para enviar chunk para o Ollama com template estruturado
111
- def predict_chunk(text, template, current):
112
- current = clean_json_text(current)
113
-
114
- input_llm = (
115
- f"<|input|>\n### Template:\n{template}\n### Current:\n{current}\n### Text:\n{text}\n\n<|output|>" + "{"
116
- )
117
-
118
- response = ollama.chat(
119
- model=MODEL_NAME,
120
- messages=[{"role": "user", "content": input_llm}],
121
- options={"num_ctx": 4000}
122
- )
123
-
124
- output_text = response["message"]["content"]
125
-
126
- # 🔍 Debug: Mostrar saída bruta do modelo
127
- print("======= RAW OUTPUT FROM OLLAMA =======")
128
- print(output_text)
129
- print("======================================")
130
-
131
- # Remove o marcador "<|end-output|>" se ele existir
132
- output_text_cleaned = output_text.replace("<|end-output|>", "").strip()
133
-
134
- try:
135
- # Tenta carregar diretamente como JSON
136
- return json.dumps(json.loads(output_text_cleaned), indent=2, ensure_ascii=False)
137
- except json.JSONDecodeError:
138
- print("⚠️ WARNING: Invalid JSON output. Returning raw text.")
139
- return clean_json_text(output_text_cleaned)
140
 
141
- # Função principal para processar PDF e enviar ao modelo
142
  def process_and_generate(pdf_file):
143
  pdf_path = pdf_file.name
144
  extracted_text = extract_text_from_pdf(pdf_path)
145
-
146
  if not extracted_text:
147
  return "Falha ao extrair texto do PDF."
148
-
149
- # Aplica a estruturação antes de enviar para o modelo
150
  structured_data = structure_text(extracted_text)
151
-
152
- # Serializa para JSON e envia para o modelo
153
- template = json.dumps(load_template(), ensure_ascii=False) # Template inicial
154
- current = json.dumps(structured_data, ensure_ascii=False) # Estado inicial estruturado
155
-
156
  chunks = split_document(extracted_text)
157
-
158
- for i, chunk in enumerate(chunks):
159
- print(f"Processando chunk {i}...")
160
- current = predict_chunk(chunk, template, current)
161
-
162
  return json.dumps(json.loads(current), indent=2, ensure_ascii=False)
163
 
164
- # Interface Gradio
165
  interface = gr.Interface(
166
  fn=process_and_generate,
167
  inputs=gr.File(label="Upload PDF"),
168
  outputs="text",
169
- title="Extração de Dados com NuExtract via Ollama",
170
- description="Extrai texto de PDFs e envia ao NuExtract via Ollama para processamento estruturado."
171
  )
172
-
173
  interface.launch()
 
1
  import json
 
2
  import os
3
  import re
4
  import gradio as gr
 
5
  import unicodedata
6
+ import torch
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer
8
+ from langchain_community.document_loaders import PyMuPDFLoader
9
+
10
+ # Model configuration (the tokenizer and model below are loaded at import time)
11
+ MODEL_PATH = "numind/NuExtract-v1.5" # Replace with the path to a local copy of the model
12
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
13
+ model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float32)
14
 
# Constants
# Default window size (in words) for split_document; also used as the token
# limit when generate_text tokenizes a prompt.
MAX_INPUT_SIZE = 4000
# Default number of words shared between consecutive windows in split_document.
OVERLAP = 128
# Path of the JSON extraction template read by load_template. The original
# line had no value at all (a syntax error), so the path is taken from the
# TEMPLATE_PATH environment variable, falling back to a file next to the app.
TEMPLATE_PATH = os.environ.get("TEMPLATE_PATH", "template.json")
 
19
 
 
def load_template():
    """Read the file at TEMPLATE_PATH as UTF-8 and return its parsed JSON content."""
    with open(TEMPLATE_PATH, encoding="utf-8") as template_file:
        raw_template = template_file.read()
    return json.loads(raw_template)
23
 
 
def extract_text_from_pdf(pdf_file_path):
    """Return the plain text of a PDF file.

    Args:
        pdf_file_path: Filesystem path of the PDF to read.

    Returns:
        The stripped ``page_content`` of every document returned by
        ``PyMuPDFLoader``, joined with newlines.

    Raises:
        FileNotFoundError: If the path is not an existing regular file.
    """
    # isfile (rather than exists) also rejects directories, which would
    # otherwise reach the loader and fail with a less helpful error.
    if not os.path.isfile(pdf_file_path):
        raise FileNotFoundError(f"Arquivo PDF não encontrado: {pdf_file_path}")

    documents = PyMuPDFLoader(pdf_file_path).load()
    return "\n".join(doc.page_content.strip() for doc in documents)
30
 
 
31
  def split_document(document, window_size=MAX_INPUT_SIZE, overlap=OVERLAP):
32
  words = document.split()
33
  chunks = []
 
34
  if len(words) > window_size:
35
  for i in range(0, len(words), window_size - overlap):
36
  chunk = " ".join(words[i:i + window_size])
 
39
  break
40
  else:
41
  chunks.append(document)
 
42
  return chunks
43
 
 
def structure_text(text):
    """Build a rough title/author/sections structure from extracted text.

    The text is scanned line by line using simple heuristics:

    * the first title-cased line longer than 5 characters becomes the title;
    * the first later line containing two capitalized words in a row
      becomes the author;
    * every all-uppercase line shorter than 60 characters starts a section;
    * any other non-blank line is appended to the current section. Lines
      seen before the first section header are not stored.

    Args:
        text: Document text with lines separated by ``"\\n"``.

    Returns:
        A dict with keys ``"titulo"`` (str), ``"autor"`` (str) and
        ``"conteudo"`` (list of ``{"secao": str, "conteudo": [str, ...]}``).
    """
    author_pattern = re.compile(r"\b[A-Z][a-z]+ [A-Z][a-z]+")
    structured_data = {"titulo": "", "autor": "", "conteudo": []}
    current_section = None

    for raw_line in text.split("\n"):
        line = raw_line.strip()

        # Blank lines carry no information; previously they ended up as
        # empty strings inside the section content.
        if not line:
            continue

        if not structured_data["titulo"] and len(line) > 5 and line.istitle():
            structured_data["titulo"] = line
            continue

        if not structured_data["autor"] and author_pattern.search(line):
            structured_data["autor"] = line
            continue

        if len(line) < 60 and line.isupper():
            current_section = {"secao": line, "conteudo": []}
            structured_data["conteudo"].append(current_section)
            continue

        if current_section:
            current_section["conteudo"].append(line)

    return structured_data
63
 
def generate_text(prompt):
    """Generate a completion for ``prompt`` with the module-level model.

    Args:
        prompt: Full prompt text. It is tokenized with ``truncation=True``
            and ``max_length=MAX_INPUT_SIZE``, so longer prompts are cut.
            NOTE(review): with the usual right-side truncation this drops the
            end of the prompt, including any trailing marker - confirm.

    Returns:
        The decoded text of the newly generated tokens only (at most 512),
        with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_SIZE)
    # A causal LM's generate() returns the prompt tokens followed by the new
    # ones; remember the prompt length so only the completion is decoded.
    # Decoding the whole sequence returned the prompt as part of the "output".
    prompt_length = inputs["input_ids"].shape[1]
    with torch.no_grad():  # inference only, no gradients needed
        output = model.generate(**inputs, max_new_tokens=512)
    return tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
69
 
def process_chunk(text, template, current):
    """Run the model on one chunk of text and return the updated extraction.

    Builds a single prompt in the ``<|input|> ... <|output|>`` layout from the
    template, the current extraction state and the chunk, passes it to
    ``generate_text`` and tries to parse the answer as JSON.

    Args:
        text: Raw text of the chunk to extract from.
        template: JSON template, already serialized to a string.
        current: Current extraction state, serialized to a string.

    Returns:
        The model answer re-serialized as indented JSON when it parses;
        otherwise the cleaned raw answer text.
    """
    prompt = (
        f"<|input|>\n### Template:\n{template}\n### Current:\n{current}\n"
        f"### Text:\n{text}\n\n<|output|>"
    )
    output_text = generate_text(prompt)

    # If the generator echoes the prompt, keep only what follows the last
    # output marker; rpartition returns the whole string when it is absent.
    output_text = output_text.rpartition("<|output|>")[2]
    # Drop an explicit end marker and anything after it, if the model emits one.
    output_text = output_text.split("<|end-output|>")[0].strip()

    try:
        return json.dumps(json.loads(output_text), indent=2, ensure_ascii=False)
    except json.JSONDecodeError:
        # Best effort: hand back the raw text rather than failing the run.
        return output_text


def send_chunk_to_model(text, template, current):
    """Send one chunk of text to the local model and return the processed output.

    Previously this wrapped the chunk in a full prompt and then passed that
    prompt as the ``text`` of ``process_chunk``, which wrapped it again. The
    prompt is now built once, inside ``process_chunk``.

    Args:
        text: Raw text of the chunk.
        template: JSON template, already serialized to a string.
        current: Current extraction state, serialized to a string.

    Returns:
        Whatever ``process_chunk`` returns for these arguments.
    """
    return process_chunk(text, template, current)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
 
def process_and_generate(pdf_file):
    """Extract the text of an uploaded PDF and run it through the model.

    The text is structured heuristically to seed the extraction state, split
    into chunks, and each chunk is sent to the model together with the
    template and the state produced so far.

    Args:
        pdf_file: Value delivered by the Gradio file input: either an object
            exposing the file path as ``.name`` or the path string itself.
            ``None`` (nothing uploaded) is reported as a message.

    Returns:
        The final extraction as indented JSON when the last model answer
        parses as JSON, otherwise that answer as raw text; or a short message
        when no file was given or no text could be extracted.

    Raises:
        FileNotFoundError: Propagated from ``extract_text_from_pdf``.
    """
    if pdf_file is None:
        return "Nenhum arquivo PDF enviado."

    # Accept both upload objects (path in .name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    extracted_text = extract_text_from_pdf(pdf_path)
    if not extracted_text:
        return "Falha ao extrair texto do PDF."

    structured_data = structure_text(extracted_text)
    template = json.dumps(load_template(), ensure_ascii=False)
    current = json.dumps(structured_data, ensure_ascii=False)

    for chunk in split_document(extracted_text):
        current = send_chunk_to_model(chunk, template, current)

    # The model may return text that is not valid JSON (it is passed through
    # unchanged upstream); show it as-is instead of raising.
    try:
        return json.dumps(json.loads(current), indent=2, ensure_ascii=False)
    except json.JSONDecodeError:
        return current
95
 
 
# Gradio UI: a single PDF upload whose result is shown as plain text.
interface = gr.Interface(
    title="Extração de Dados com Modelo Local",
    description="Extrai texto de PDFs e processa utilizando um modelo local de linguagem.",
    fn=process_and_generate,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
)

# Start the web server as soon as the script runs.
interface.launch()