# energee / app.py
# Guilherme Favaron
# Add application file
# 998fd78
import gradio as gr
import fitz # PyMuPDF
import re
import os
import tempfile
from typing import List, Dict, Any
import json
from openai import OpenAI
import logging
from datetime import datetime
import time
from dotenv import load_dotenv
from queue import Queue
from threading import Thread
# Pull settings (such as OPENAI_API_KEY) from a local .env file into the
# process environment before anything reads them.
load_dotenv()

# Logging setup: every record goes both to pdf_processor.log and to a
# stream handler, using a "timestamp - level - message" layout.
logging.basicConfig(
    handlers=[
        logging.FileHandler("pdf_processor.log"),
        logging.StreamHandler(),
    ],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class PDFExtractor:
    """Extract plain text from PDF files using PyMuPDF."""

    @staticmethod
    def extract_text_from_pdf(pdf_path: str) -> str:
        """Return the text of every page of a PDF, joined by newlines.

        Args:
            pdf_path: Filesystem path of the PDF to read.

        Returns:
            The "text"-mode output of each page, in page order, with a
            newline between consecutive pages.

        Raises:
            Exception: Any error raised while opening or reading the
                document is logged and then re-raised unchanged.
        """
        try:
            # The context manager closes the document even when a page
            # fails to extract, so the file handle is never leaked.
            with fitz.open(pdf_path) as doc:
                return "\n".join(page.get_text("text") for page in doc)
        except Exception as e:
            logger.error(f"Erro na extração do PDF: {e}")
            raise
class BillDataExtractor:
    """Regex-based first pass over the raw text of an energy bill."""

    # A number as printed in pt-BR formatting: digits followed by any number
    # of "."/"," separated digit groups, so "1.234,56" is captured whole
    # instead of being cut off at "1.234".
    _NUMBER = r'\d+(?:[.,]\d+)*'

    # Compiled once at class creation; insertion order defines the key order
    # of the dictionary returned by extract_initial_data().
    _PATTERNS = {
        'unidade_consumidora': re.compile(r'Unidade Consumidora:?\s*(\d+)'),
        # [^\W\d_] matches any Unicode letter, so accented month names such
        # as "Março" are accepted alongside plain ASCII ones.
        'mes_referencia': re.compile(r'Referente a:?\s*([^\W\d_]+/\d{4})'),
        'medidor': re.compile(r'Medidor:?\s*(\d+)'),
        'energia_injetada': re.compile(r'Energia Injetada:?\s*(' + _NUMBER + r')'),
        'valores': re.compile(r'R\$\s*(' + _NUMBER + r')'),
        'quantidades': re.compile(r'(' + _NUMBER + r')\s*kWh'),
        # Added to capture the lines printed in blue on the bill.
        'geradores': re.compile(r'Geradores:?\s*([\s\S]+?)(?=Unidade Consumidora|$)'),
    }

    @staticmethod
    def extract_initial_data(text: str) -> Dict[str, Any]:
        """Collect every regex match for each known bill field.

        Args:
            text: Raw text of the bill.

        Returns:
            A dictionary with one entry per field name; each value is the
            list of captured strings for that field, empty when nothing
            matched. Numbers are returned as the matched text, not parsed.
        """
        # findall() already yields an empty list when there is no match.
        return {
            key: pattern.findall(text)
            for key, pattern in BillDataExtractor._PATTERNS.items()
        }
class AIProcessor:
    """Turn raw bill text into structured line items via the OpenAI chat API."""

    # System message sent with every request.
    _SYSTEM_PROMPT = "Você é um especialista em análise de faturas de energia. Extraia todos os Itens da fatura de energia. Use o seguinte formato: {\"items\": [...]}"

    # JSON skeleton appended to the user prompt to show the expected shape.
    _RESPONSE_TEMPLATE = "\n".join([
        '{',
        '"items": [',
        '{',
        '"unidade_consumidora": "string",',
        '"mes_referencia": "string",',
        '"item_fatura": "string",',
        '"quantidade": number,',
        '"valor": number,',
        '"energia_injetada": number,',
        '"medidor": "string",',
        '"unidade": "string",',
        '"tarifa_unitaria": number',
        '}',
        ']',
        '}',
    ])

    def __init__(self):
        """Create the OpenAI client from the OPENAI_API_KEY environment variable."""
        self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        # Not read anywhere in this class; kept for backward compatibility.
        self.rate_limit_delay = 1

    @staticmethod
    def _build_prompt(text: str, initial_data: Dict[str, Any]) -> str:
        """Assemble the user prompt: bill text, regex pre-extraction, JSON skeleton."""
        return (
            "Analise esta fatura de energia e retorne um JSON com os itens encontrados.\n"
            f"Texto da fatura: {text}\n"
            f"Dados pré-extraídos: {json.dumps(initial_data)}\n"
            "Retorne apenas o JSON no seguinte formato:\n"
            f"{AIProcessor._RESPONSE_TEMPLATE}"
        )

    @staticmethod
    def _parse_items(content) -> List[Dict[str, Any]]:
        """Pull the "items" list out of the model reply; [] when there is none."""
        # The reply text can be None; treat that the same as an empty reply
        # instead of failing on None.strip().
        reply = (content or "").strip()
        # Greedy match from the first "{" to the last "}" so that any text
        # surrounding the JSON object (e.g. Markdown fences) is ignored.
        json_match = re.search(r'\{[\s\S]*\}', reply)
        if not json_match:
            return []
        # A missing or null "items" entry yields the same empty result as a
        # reply that contains no JSON at all.
        return json.loads(json_match.group(0)).get('items') or []

    def process_with_ai(self, text: str, initial_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Ask the model to extract the bill's line items as structured data.

        Args:
            text: Full text of the bill.
            initial_data: Regex pre-extraction results; must be JSON
                serializable because it is embedded in the prompt.

        Returns:
            The list stored under "items" in the JSON object found in the
            reply, or an empty list when the reply is empty, contains no
            JSON object, or has no "items".

        Raises:
            Exception: Any API or JSON decoding error is logged and then
                re-raised unchanged.
        """
        try:
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": self._SYSTEM_PROMPT},
                    {"role": "user", "content": self._build_prompt(text, initial_data)},
                ],
                max_tokens=4000,
                temperature=0.1,
            )
            return self._parse_items(response.choices[0].message.content)
        except Exception as e:
            logger.error(f"Erro no processamento com IA: {e}")
            raise
def process_pdf(pdf_file) -> tuple:
    """Run the full extraction pipeline on one uploaded PDF.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``), or an
            upload object exposing the path as ``.name``, or ``None`` when
            nothing was uploaded.

    Returns:
        A ``(status, items, json_path)`` tuple. On success ``items`` is the
        list of extracted bill items and ``json_path`` is a temporary
        ``.json`` file holding the same data. When no file was given, or
        when any step fails, the status carries the message and the other
        two entries are ``None``; errors are logged and never raised.
    """
    try:
        if pdf_file is None:
            return "Nenhum arquivo foi enviado.", None, None

        # gr.File(type="filepath") hands over plain path strings, which have
        # no ``.name`` attribute; only upload objects expose the path there.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        # Pipeline: raw text -> regex pre-extraction -> AI structuring.
        extracted_text = PDFExtractor.extract_text_from_pdf(pdf_path)
        initial_data = BillDataExtractor.extract_initial_data(extracted_text)
        processed_items = AIProcessor().process_with_ai(extracted_text, initial_data)

        # delete=False keeps the file on disk after the ``with`` block so the
        # returned path can still be read afterwards.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.json', mode='w', encoding='utf-8') as json_file:
            json.dump(processed_items, json_file, indent=2, ensure_ascii=False)
            json_path = json_file.name

        return (
            f"Processamento concluído! {len(processed_items)} itens encontrados.",
            processed_items,
            json_path
        )
    except Exception as e:
        logger.error(f"Erro durante o processamento: {e}")
        return (
            f"Erro durante o processamento: {str(e)}",
            None,
            None
        )
def process_queue(queue: Queue, status_output, json_output, download_output):
    """Drain *queue* through process_pdf and publish the combined results.

    Each queued entry is passed to process_pdf and marked done afterwards.
    Once the queue is empty, the ``value`` attribute of the three output
    objects is overwritten: the status messages joined by newlines, the list
    of per-file item data, and the list of per-file JSON paths.
    """
    messages = []
    payloads = []
    paths = []

    while not queue.empty():
        message, payload, path = process_pdf(queue.get())
        messages.append(message)
        payloads.append(payload)
        paths.append(path)
        queue.task_done()

    # Publish everything gathered above in one go.
    status_output.value = "\n".join(messages)
    json_output.value = payloads
    download_output.value = paths
def _process_uploaded_files(files):
    """Run process_pdf on every uploaded file and merge the per-file results.

    Returns a ``(status_text, json_data, download_paths)`` tuple for the
    three output components: the status lines joined by newlines, one JSON
    entry per file (``None`` for a file that failed), and the JSON file
    paths of the files that succeeded (``None`` when there are none).
    """
    if not files:
        # Button clicked with nothing uploaded: reuse process_pdf's own
        # "no file" answer instead of iterating over None.
        return process_pdf(None)

    # Results are kept in locals rather than on the shared component objects,
    # so simultaneous requests cannot overwrite each other's output.
    results = [process_pdf(pdf_file) for pdf_file in files]

    status_text = "\n".join(status for status, _, _ in results)
    json_data = [data for _, data, _ in results]
    # Failed files have no JSON path; leave them out of the download list.
    download_paths = [path for _, _, path in results if path is not None]
    return status_text, json_data, download_paths or None


def create_interface():
    """Build the Gradio Blocks UI: multi-PDF upload, a process button and outputs.

    Returns:
        The ``gr.Blocks`` interface, not yet launched.
    """
    with gr.Blocks(title="Processador de Faturas de Energia") as interface:
        gr.Markdown("# Processador de Faturas de Energia")
        gr.Markdown("Faça upload de arquivos PDF de fatura de energia para processamento.")
        with gr.Row():
            file_input = gr.File(
                label="Upload dos PDFs",
                file_types=[".pdf"],
                file_count="multiple",  # Allow multiple files
                type="filepath"  # This is important! Tells Gradio to pass the file path
            )
        with gr.Row():
            process_button = gr.Button("Processar Faturas")
        with gr.Row():
            status_output = gr.Textbox(label="Status")
        with gr.Row():
            json_output = gr.JSON(label="Resultados")
        with gr.Row():
            download_output = gr.File(label="Download JSON")

        process_button.click(
            fn=_process_uploaded_files,
            inputs=[file_input],
            outputs=[status_output, json_output, download_output]
        )
    return interface
if __name__ == "__main__":
    # Fail fast: the OpenAI client is built from this key, so refuse to
    # start the UI without it.
    if not os.getenv('OPENAI_API_KEY'):
        print("Error: OPENAI_API_KEY not found in environment variables")
        print("Please set your OpenAI API key in the .env file")
        # exit() is a helper injected by the site module for interactive
        # sessions and is not guaranteed to exist; SystemExit always is.
        raise SystemExit(1)
    print("Starting Gradio interface...")
    interface = create_interface()
    interface.launch(share=False)