# QuestionPDF / app.py
# segoedu's picture
# Update app.py
# 0cc2f4b verified
import os
from datetime import datetime
from pathlib import Path
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
# -------------------------
# General configuration
# -------------------------
st.set_page_config(
    page_title="Lectorín",
    page_icon="📄",
    layout="wide",
)
st.title("📄 Lectorín 2026")
st.caption("Pregunta a tu PDF con RAG, FAISS y Groq")


def _read_setting(name: str, default: str = "") -> str:
    """Return a configuration value from Streamlit secrets or the environment.

    Lookup order:
      1. ``st.secrets[name]``
      2. the environment variable ``name``
      3. ``default``

    Reading ``st.secrets`` fails when no secrets file is available (the
    normal situation when the key is provided only through environment
    variables). That failure is treated as "not configured" so the
    environment fallback is still honoured instead of crashing the app.

    Args:
        name: Key to look up in both the secrets store and the environment.
        default: Value returned when neither source defines ``name``.

    Returns:
        The configured value, or ``default`` when it is not set anywhere.
    """
    fallback = os.getenv(name, default)
    try:
        return st.secrets.get(name, fallback)
    except FileNotFoundError:
        # NOTE(review): recent Streamlit releases raise
        # StreamlitSecretNotFoundError here, which is expected to derive from
        # FileNotFoundError — confirm against the pinned Streamlit version.
        return fallback


# Groq key: secrets first, then the environment.
GROQ_API_KEY = _read_setting("GROQ_API_KEY")

# Optional LangSmith tracing: only enabled when a key is configured.
LANGCHAIN_API_KEY = _read_setting("LANGCHAIN_API_KEY")
if LANGCHAIN_API_KEY:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
    os.environ["LANGCHAIN_PROJECT"] = "qpdf-2026"

# Local data folder holding the question/answer history file.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
HISTORIAL_PATH = DATA_DIR / "historial.txt"
# -------------------------
# Session state
# -------------------------
# Seed each per-session key only when it is missing, so values written by
# earlier script runs of the same session are preserved.
for _state_key, _initial_value in (
    ("logs", []),
    ("knowledge_base", None),
    ("current_pdf_name", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
# -------------------------
# Models
# -------------------------
# Display label -> (embedding model id, size figure used to derive the default
# chunk size). NOTE(review): the second item is presumably the model's token
# window — confirm.
modelos_embeddings = {
    "multilingual-e5-small (rápido)": ("intfloat/multilingual-e5-small", 512),
    "multi-qa-MiniLM-L6-cos-v1 (ligero)": ("multi-qa-MiniLM-L6-cos-v1", 256),
    "bge-m3 (mejor multilingüe, más pesado)": ("BAAI/bge-m3", 2048),
}
# Display label -> model id passed to the Groq chat client.
modelos_llm = {
    "Llama 3.3 70B Versatile": "llama-3.3-70b-versatile",
    "openai/gpt-oss-120b": "openai/gpt-oss-120b",
    "moonshotai/kimi-k2-instruct-0905": "moonshotai/kimi-k2-instruct-0905",
}

with st.sidebar:
    st.header("Configuración")

    # Embedding model selection.
    embedding_label = st.selectbox("Modelo de embeddings", list(modelos_embeddings))
    embedding_model_name, sequence = modelos_embeddings[embedding_label]

    # Chat model selection.
    llm_label = st.selectbox("Modelo LLM", list(modelos_llm))
    llm_model_name = modelos_llm[llm_label]

    # Retrieval and splitting parameters.
    k_docs = st.slider("Chunks recuperados", min_value=2, max_value=8, value=4)
    # Default chunk size is four times the model's size figure, capped at 2000.
    default_chunk_size = min(sequence * 4, 2000)
    chunk_size = st.slider(
        "Chunk size",
        min_value=500,
        max_value=3000,
        value=default_chunk_size,
        step=100,
    )
    chunk_overlap = st.slider(
        "Chunk overlap",
        min_value=50,
        max_value=400,
        value=150,
        step=25,
    )

    st.divider()
    st.write("Para producción, configura `GROQ_API_KEY` en secretos o variables de entorno.")
# -------------------------
# Utilidades
# -------------------------
def extract_text_from_pdf(uploaded_file) -> str:
    """Return the text of all non-blank pages of a PDF, separated by blank lines.

    Args:
        uploaded_file: Object handed to ``PdfReader`` (the caller in this file
            passes the value returned by the Streamlit file uploader).

    Returns:
        The page texts joined with ``"\\n\\n"``. Pages whose extracted text is
        missing or whitespace-only are skipped, so the result is an empty
        string when no page yields text.
    """
    # A page may yield no text at all; normalise that to an empty string.
    page_texts = (page.extract_text() or "" for page in PdfReader(uploaded_file).pages)
    return "\n\n".join(content for content in page_texts if content.strip())
@st.cache_resource(show_spinner=False)
def load_embeddings_model(model_name: str):
    """Create the embeddings object for ``model_name``.

    The function is wrapped in ``st.cache_resource`` so that Streamlit can
    reuse the instance for repeated calls with the same model name.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings
@st.cache_data(show_spinner=False)
def split_text_to_chunks(text: str, chunk_size: int, chunk_overlap: int):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: Full document text.
        chunk_size: Chunk size passed to the splitter; lengths are measured
            with ``len``, i.e. in characters.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        Whatever ``split_text`` returns for ``text`` (the chunk strings).
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Candidate separators, from paragraph breaks down to the empty string.
        "separators": ["\n\n", "\n", ". ", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
def build_knowledge_base(uploaded_file, embedding_model_name: str, chunk_size: int, chunk_overlap: int):
    """Build a FAISS vector store from the text of an uploaded PDF.

    Args:
        uploaded_file: PDF file object passed to ``extract_text_from_pdf``.
        embedding_model_name: Model name passed to ``load_embeddings_model``.
        chunk_size: Chunk size used when splitting the text.
        chunk_overlap: Overlap used when splitting the text.

    Returns:
        A ``(vectorstore, chunk_count)`` tuple.

    Raises:
        ValueError: If the PDF yields no text (empty or whitespace-only).
    """
    document_text = extract_text_from_pdf(uploaded_file)
    if document_text.strip() == "":
        raise ValueError("No se pudo extraer texto del PDF.")

    pieces = split_text_to_chunks(document_text, chunk_size, chunk_overlap)
    index = FAISS.from_texts(pieces, load_embeddings_model(embedding_model_name))
    return index, len(pieces)
def save_to_file(file_name: str, question: str, answer: str):
    """Append one question/answer record to the history file.

    The record is a dashed header line carrying the current local time
    (minute precision) and ``file_name``, followed by the question line, the
    answer line and a blank line. It is appended to ``HISTORIAL_PATH`` as UTF-8.

    Args:
        file_name: Name of the PDF the question refers to.
        question: Question text.
        answer: Answer text.
    """
    rule = "-" * 25
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    # Assemble the whole record first so it is written with a single call.
    record = "".join(
        (
            rule,
            f" {timestamp} ",
            f" ({file_name}) ",
            rule + "\n",
            f"Pregunta: {question}\n",
            f"Respuesta: {answer}\n\n",
        )
    )
    with open(HISTORIAL_PATH, "a", encoding="utf-8") as history_file:
        history_file.write(record)
def build_rag_chain(vectorstore, groq_api_key: str, model_name: str, k: int = 4):
    """Assemble the retrieval chain used to answer questions.

    Args:
        vectorstore: Vector store exposing ``as_retriever``.
        groq_api_key: API key passed to ``ChatGroq``.
        model_name: Chat model identifier passed to ``ChatGroq``.
        k: Value of the ``"k"`` search argument given to the retriever.

    Returns:
        The chain produced by ``create_retrieval_chain``. The caller in this
        file invokes it with ``{"input": question}`` and reads the
        ``"answer"`` and ``"context"`` keys of the result.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    chat_model = ChatGroq(groq_api_key=groq_api_key, model=model_name, temperature=0)

    # System instructions; {context} and {input} are the template placeholders.
    system_message = (
        "Responde usando solo el contexto recuperado. "
        "Si la respuesta no está en el documento, di claramente que no aparece en el PDF. "
        "Contesta en español y de forma precisa.\n\nContexto:\n{context}"
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("human", "{input}"),
        ]
    )

    answer_chain = create_stuff_documents_chain(chat_model, prompt)
    return create_retrieval_chain(retriever, answer_chain)
def render_logs():
    """Render the session's question history in the sidebar, newest first.

    Reads ``st.session_state.logs``. Each entry gets an expander titled with
    its position and the first 60 characters of the question; the expander
    body shows the answer. A placeholder caption is shown when the history
    is empty.
    """
    history = st.session_state.logs
    with st.sidebar:
        st.subheader("Historial de preguntas")
        if not history:
            st.caption("Todavía no hay preguntas.")
            return
        # Position 1 is the most recent question.
        for position, record in enumerate(reversed(history), start=1):
            title = f"{position}. {record['Pregunta'][:60]}"
            with st.expander(title):
                st.write(record["Respuesta"])
# -------------------------
# Main interface
# -------------------------
pdf_obj = st.file_uploader("Carga tu documento PDF", type="pdf")

# A different file name means a different document: drop everything that was
# derived from the previous one (history, index and the cached last answer).
if pdf_obj is not None and st.session_state.current_pdf_name != pdf_obj.name:
    st.session_state.current_pdf_name = pdf_obj.name
    st.session_state.logs = []
    st.session_state.knowledge_base = None
    st.session_state.last_result = None

col1, col2 = st.columns([1, 1])
with col1:
    # Nothing can be indexed without an uploaded file, so the button stays
    # disabled until one is present.
    process_clicked = st.button(
        "Procesar PDF",
        type="primary",
        use_container_width=True,
        disabled=pdf_obj is None,
    )
    if process_clicked and pdf_obj is not None:
        with st.spinner("Procesando PDF y creando índice vectorial..."):
            try:
                kb, n_chunks = build_knowledge_base(
                    pdf_obj,
                    embedding_model_name,
                    chunk_size,
                    chunk_overlap,
                )
            except Exception as e:  # UI boundary: report any failure to the user
                st.error(f"Error procesando el PDF: {e}")
            else:
                st.session_state.knowledge_base = kb
                # A rebuilt index may answer differently; forget the cached answer.
                st.session_state.last_result = None
                st.success(f"PDF procesado correctamente. Chunks generados: {n_chunks}")
with col2:
    if st.session_state.knowledge_base is not None:
        st.success("Base vectorial lista.")
    else:
        st.info("Sube un PDF y pulsa 'Procesar PDF'.")

if not GROQ_API_KEY:
    st.warning("Falta GROQ_API_KEY. Añádela en Streamlit secrets o en variables de entorno.")
elif st.session_state.knowledge_base is not None:
    user_question = st.text_input("Haz una pregunta sobre tu PDF")
    if user_question:
        # The script is re-executed on every widget interaction while the
        # question stays in the text box. The last result is remembered per
        # (document, model, k, question) so that such reruns neither call the
        # LLM again nor append duplicate entries to the history.
        query_key = (
            st.session_state.current_pdf_name,
            llm_model_name,
            k_docs,
            user_question,
        )
        last_result = st.session_state.get("last_result")
        is_new_query = last_result is None or last_result["key"] != query_key
        try:
            if is_new_query:
                with st.spinner("Consultando el documento..."):
                    rag_chain = build_rag_chain(
                        st.session_state.knowledge_base,
                        GROQ_API_KEY,
                        llm_model_name,
                        k=k_docs,
                    )
                    result = rag_chain.invoke({"input": user_question})
                last_result = {
                    "key": query_key,
                    "answer": result["answer"],
                    "chunks": [doc.page_content for doc in result.get("context", [])],
                }
                st.session_state.last_result = last_result

            st.subheader("Respuesta")
            st.write(last_result["answer"])
            with st.expander("Ver fragmentos recuperados"):
                if last_result["chunks"]:
                    for i, chunk_text in enumerate(last_result["chunks"], start=1):
                        st.markdown(f"**Chunk {i}**")
                        st.write(chunk_text)
                        st.markdown("---")
                else:
                    st.caption("No se devolvieron fragmentos.")

            if is_new_query:
                st.session_state.logs.append({
                    "Pregunta": user_question,
                    "Respuesta": last_result["answer"],
                })
                # Use the stored name: the uploader may have been cleared while
                # the index built from that document is still in session.
                save_to_file(
                    st.session_state.current_pdf_name,
                    user_question,
                    last_result["answer"],
                )
        except Exception as e:  # UI boundary: report any failure to the user
            st.error(f"Error al consultar el PDF: {e}")

render_logs()