# leitor_pdf / app.py
import os
from dotenv import load_dotenv
import gradio as gr
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

# Load environment variables (expects OPENROUTER_API_KEY in a local .env file)
load_dotenv()
api_key = os.getenv("OPENROUTER_API_KEY")
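
# Defensive check (an addition, not in the original commit): fail fast when the
# key is missing instead of surfacing an auth error from OpenRouter at query time.
if not api_key:
    raise RuntimeError("OPENROUTER_API_KEY is not set; add it to your .env file.")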

# Initialize the LLM (OpenRouter exposes an OpenAI-compatible endpoint)
llm = ChatOpenAI(
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=api_key,
    model="deepseek/deepseek-r1-zero:free",
)


def processar_pdf(pdf_file, pergunta):
    # Depending on the Gradio version, gr.File passes either a filepath string
    # or a tempfile-like object; handle both.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    # Load the PDF and split it into overlapping chunks
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.split_documents(documents)

    # Embed the chunks and build an in-memory FAISS index
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(docs, embeddings)

    # Retrieval-augmented QA chain over the index
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True,
    )

    # Answer the question and collect excerpts from the retrieved sources
    resposta = qa_chain.invoke({"query": pergunta})
    result = resposta["result"]
    fontes = "\n\n".join(
        f"Source {i + 1}: {doc.page_content[:300]}..."
        for i, doc in enumerate(resposta["source_documents"])
    )
    return result, fontes
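
# A possible refinement (a sketch, not part of the original app): cache the FAISS
# index per PDF path so repeated questions about the same file skip re-embedding.
# construir_indice is a hypothetical helper name.
#
#   from functools import lru_cache
#
#   @lru_cache(maxsize=8)
#   def construir_indice(pdf_path):
#       docs = RecursiveCharacterTextSplitter(
#           chunk_size=1000, chunk_overlap=200
#       ).split_documents(PyPDFLoader(pdf_path).load())
#       return FAISS.from_documents(
#           docs, HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#       )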

# Gradio interface
interface = gr.Interface(
    fn=processar_pdf,
    inputs=[
        gr.File(label="Upload a PDF"),
        gr.Textbox(label="Your question", placeholder="E.g., How long is the course?"),
    ],
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Sources used"),
    ],
    title="Chat with a PDF (LangChain)",
    description="Upload a PDF and ask questions about it. Powered by LangChain + Hugging Face Embeddings",
)

if __name__ == "__main__":
    interface.launch()
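
# A minimal usage sketch (assumptions: the app is running locally on Gradio's
# default port, and "exemplo.pdf" is a stand-in file) for querying the
# interface programmatically with gradio_client:
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   resposta, fontes = client.predict(
#       handle_file("exemplo.pdf"),
#       "How long is the course?",
#   )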