# NOTE: This file was scraped from a Hugging Face Space page ("app.py",
# revision dc2efd2, ~7.08 kB); the page chrome has been removed so the
# module parses as valid Python.
# -*- coding: utf-8 -*-
"""app
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1PhcQoTZvxdPQe6E1HMx_Nl4Zs_tY7J_y
"""
import gradio as gr
import os
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import tempfile
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Load environment variables from a local .env file (no-op if absent).
load_dotenv()
# NOTE: the env var is named ROUTER_API_KEY even though the local constant
# says OPENROUTER — keep both names as-is; other code reads this constant.
OPENROUTER_API_KEY = os.getenv("ROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    # Fail fast at import time: the app cannot answer questions without a key.
    raise ValueError("❌ A variável de ambiente ROUTER_API_KEY não está definida. Verifique o arquivo .env.")

# Shared sentence-embedding model used both for the FAISS index and for the
# RAG metrics in calculate_rag_metrics.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# Module-level mutable state, set by load_default_pdf()/process_pdf() and
# read by ask_question().
qa_chain = None
processed_file = None
# 🔁 Automatically load a bundled PDF at startup.
def load_default_pdf(pdf_path="LegisMiner.pdf"):
    """Build the global RetrievalQA chain from a local PDF.

    Generalized: the path is now a parameter defaulting to "LegisMiner.pdf",
    so the same loader can be reused for another bundled document. With the
    default argument the behavior (including printed messages) is unchanged.

    Side effects: sets the module globals ``qa_chain`` and ``processed_file``.
    Errors are reported to stdout (best-effort startup), never raised.
    """
    global qa_chain, processed_file
    try:
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        # Chunking: 1000-char windows with 200-char overlap, matching process_pdf.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        docs = text_splitter.split_documents(documents)
        vectorstore = FAISS.from_documents(docs, embeddings)
        llm = ChatOpenAI(
            openai_api_key=OPENROUTER_API_KEY,
            openai_api_base="https://openrouter.ai/api/v1",
            model="deepseek/deepseek-r1-0528:free",
            temperature=0.7
        )
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            return_source_documents=True
        )
        processed_file = pdf_path
        print(f"✅ {pdf_path} carregado automaticamente.")
    except Exception as e:
        # Best-effort: the UI still works, the user can upload a PDF manually.
        print(f"❌ Erro ao carregar {pdf_path} automaticamente: {e}")
def calculate_rag_metrics(query, response, source_docs):
    """Compute embedding-similarity metrics for a RAG answer.

    Returns a dict with:
      - query_response_similarity: cosine similarity between query and answer.
      - avg/max_response_source_similarity: how close the answer is to the
        retrieved chunks (first 1000 chars of each).
      - num_source_documents: number of retrieved chunks.
    On any failure the dict instead carries an "error" key with the message.
    """
    metrics = {}
    try:
        q_vec = embeddings.embed_query(query)
        r_vec = embeddings.embed_query(response)
        metrics["query_response_similarity"] = cosine_similarity([q_vec], [r_vec])[0][0]
        # One similarity per retrieved chunk (truncated to keep embedding cheap).
        sims = [
            cosine_similarity([r_vec], [embeddings.embed_query(doc.page_content[:1000])])[0][0]
            for doc in source_docs
        ]
        metrics["avg_response_source_similarity"] = np.mean(sims) if sims else 0
        metrics["max_response_source_similarity"] = max(sims) if sims else 0
        metrics["num_source_documents"] = len(source_docs)
    except Exception as e:
        metrics["error"] = str(e)
    return metrics
def process_pdf(file):
    """Index an uploaded PDF (raw bytes) and rebuild the global QA chain.

    Args:
        file: the PDF content as bytes (Gradio File component, type="binary").

    Returns:
        A status string (success with the processed file name, or the error).

    Side effects: sets the module globals ``qa_chain`` and ``processed_file``.

    Bug fix: the temporary file created with delete=False was never removed,
    leaking one file per upload — it is now unlinked in a ``finally`` block.
    """
    global qa_chain, processed_file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(file)
        pdf_path = tmp.name
    try:
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        # Same chunking parameters as load_default_pdf for consistent retrieval.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        docs = text_splitter.split_documents(documents)
        vectorstore = FAISS.from_documents(docs, embeddings)
        llm = ChatOpenAI(
            openai_api_key=OPENROUTER_API_KEY,
            openai_api_base="https://openrouter.ai/api/v1",
            model="deepseek/deepseek-r1-0528-qwen3-8b:free",
            temperature=0.7
        )
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            return_source_documents=True
        )
        # NOTE: this is the temp file's basename (e.g. "tmpXXXX.pdf"), not the
        # user's original file name — the binary upload carries no name.
        processed_file = os.path.basename(pdf_path)
        return f"✅ PDF processado com sucesso: {processed_file}"
    except Exception as e:
        return f"❌ Erro ao processar PDF: {str(e)}"
    finally:
        # Clean up the delete=False temp file regardless of outcome.
        try:
            os.unlink(pdf_path)
        except OSError:
            pass
def ask_question(question):
    """Answer a question against the loaded PDF via the global RAG chain.

    Args:
        question: the user's free-text question.

    Returns:
        (answer, sources, metrics) — three strings bound to the three
        gr.Textbox outputs.

    Bug fix: the "no PDF loaded" and exception paths returned a dict ``{}``
    as the metrics value while the success path returns a string; since the
    output component is a Textbox, both paths now return ``""`` consistently.
    """
    global qa_chain
    if qa_chain is None:
        return "⚠️ Por favor, carregue um PDF primeiro", "", ""
    try:
        # ✅ Specialized persona: Mining Regulatory Standards Analyst.
        # Prepended to the query because RetrievalQA takes a single "query"
        # input rather than a separate system message.
        system_prompt = (
            "Você é um Analista Especialista em Normas Regulatórias de Mineração no Brasil. "
            "Baseie suas respostas exclusivamente no conteúdo técnico do documento LegisMiner.pdf. "
            "Forneça respostas claras, técnicas e fundamentadas. "
            "Se a informação não estiver presente no documento, informe isso ao usuário."
        )
        resposta = qa_chain.invoke({
            "query": f"{system_prompt}\n\nPergunta: {question}"
        })
        # Show the first 500 characters of each retrieved chunk as evidence.
        sources = "\n\n".join(
            [f"📄 Fonte {i+1}:\n{doc.page_content[:500]}..."
             for i, doc in enumerate(resposta['source_documents'])]
        )
        metrics = calculate_rag_metrics(
            question,
            resposta['result'],
            resposta['source_documents']
        )
        # Floats are shown with 2 decimals; ints (e.g. document count) as-is.
        metrics_text = "\n".join(
            [f"{k.replace('_', ' ').title()}: {v:.2f}" if isinstance(v, float) else f"{k.replace('_', ' ').title()}: {v}"
             for k, v in metrics.items() if k != "error"]
        )
        return resposta['result'], sources, metrics_text
    except Exception as e:
        return f"❌ Erro ao processar pergunta: {str(e)}", "", ""
# Gradio interface: left column uploads/processes a PDF, right column asks
# questions; accordions expose retrieved sources and RAG metrics.
with gr.Blocks(title="Chat com PDF usando OpenRouter", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Artificial Intelligence Applied to Regulatory Standard Processing in Mining\n### 💡 Development of a Decision Support Tool")
    with gr.Row():
        with gr.Column(scale=1):
            # type="binary" delivers the upload as raw bytes to process_pdf.
            file_input = gr.File(label="📤 Envie um PDF", type="binary")
            process_btn = gr.Button("Processar PDF", variant="primary")
            status_output = gr.Textbox(label="Status")
        with gr.Column(scale=2):
            question_input = gr.Textbox(label="Faça uma pergunta sobre Normas da Mineração", lines=3)
            ask_btn = gr.Button("Enviar Pergunta", variant="primary")
            answer_output = gr.Textbox(label="✅ Resposta", interactive=False)
            with gr.Accordion("📄 Fontes usadas", open=False):
                sources_output = gr.Textbox(label="Trechos relevantes", lines=10)
            with gr.Accordion("📊 Métricas RAG", open=False):
                metrics_output = gr.Textbox(label="Métricas", lines=4)
    process_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=status_output
    )
    # ask_question returns (answer, sources, metrics) matching these outputs.
    ask_btn.click(
        fn=ask_question,
        inputs=question_input,
        outputs=[answer_output, sources_output, metrics_output]
    )
# 🔁 Eagerly index the bundled PDF so the app is usable before any upload.
load_default_pdf()
# Enable a public share link only when running inside Colab.
share = True if 'COLAB_JUPYTER_TRANSPORT' in os.environ else False
demo.launch(share=share, debug=False)