Spaces:
Sleeping
Sleeping
| import os | |
| from datetime import datetime | |
| from pathlib import Path | |
| import streamlit as st | |
| from PyPDF2 import PdfReader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_groq import ChatGroq | |
| from langchain.chains import create_retrieval_chain | |
| from langchain.chains.combine_documents import create_stuff_documents_chain | |
# -------------------------
# General configuration
# -------------------------
st.set_page_config(
    page_title="Lectorín",
    page_icon="📄",
    layout="wide",
)
st.title("📄 Lectorín 2026")
st.caption("Pregunta a tu PDF con RAG, FAISS y Groq")


def _read_setting(name: str) -> str:
    """Return the setting *name* from Streamlit secrets or the environment.

    Lookup order:
      1) ``st.secrets[name]``
      2) the environment variable ``name``
      3) the empty string

    Accessing ``st.secrets`` raises (a ``FileNotFoundError`` subclass, per the
    Streamlit secrets documentation) when no ``secrets.toml`` file exists.
    Without the guard below the environment fallback could never be reached
    in deployments that only provide environment variables.
    """
    fallback = os.getenv(name, "")
    try:
        return st.secrets.get(name, fallback)
    except FileNotFoundError:
        return fallback


# Groq API key used by the chat model.
GROQ_API_KEY = _read_setting("GROQ_API_KEY")

# Optional LangSmith tracing: only enabled when an API key is configured.
LANGCHAIN_API_KEY = _read_setting("LANGCHAIN_API_KEY")
if LANGCHAIN_API_KEY:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
    os.environ["LANGCHAIN_PROJECT"] = "qpdf-2026"

# Local data folder (relative to the working directory) holding the Q&A history file.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
HISTORIAL_PATH = DATA_DIR / "historial.txt"
# -------------------------
# Session state
# -------------------------
# The script is re-executed from the top on every interaction, so the literal
# defaults below are fresh objects on each run and are only stored when the
# key is missing from the session.
for _state_key, _state_default in (
    ("logs", []),
    ("knowledge_base", None),
    ("current_pdf_name", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
# -------------------------
# Models
# -------------------------
# Display label -> (embedding model id, size hint).
# The size hint is only used to derive the default "Chunk size" slider value;
# presumably it is the model's maximum sequence length in tokens — confirm.
modelos_embeddings = {
    "multilingual-e5-small (rápido)": ("intfloat/multilingual-e5-small", 512),
    "multi-qa-MiniLM-L6-cos-v1 (ligero)": ("multi-qa-MiniLM-L6-cos-v1", 256),
    "bge-m3 (mejor multilingüe, más pesado)": ("BAAI/bge-m3", 2048),
}

# Display label -> model id passed to the Groq chat client.
modelos_llm = {
    "Llama 3.3 70B Versatile": "llama-3.3-70b-versatile",
    "openai/gpt-oss-120b": "openai/gpt-oss-120b",
    "moonshotai/kimi-k2-instruct-0905": "moonshotai/kimi-k2-instruct-0905",
}

with st.sidebar:
    st.header("Configuración")

    # Embedding model selection.
    embedding_label = st.selectbox("Modelo de embeddings", list(modelos_embeddings))
    embedding_model_name, sequence = modelos_embeddings[embedding_label]

    # Chat model selection.
    llm_label = st.selectbox("Modelo LLM", list(modelos_llm))
    llm_model_name = modelos_llm[llm_label]

    # Retrieval and chunking parameters.
    k_docs = st.slider("Chunks recuperados", min_value=2, max_value=8, value=4)
    chunk_size = st.slider(
        "Chunk size",
        min_value=500,
        max_value=3000,
        # Default: four times the size hint, capped at 2000.
        value=min(sequence * 4, 2000),
        step=100,
    )
    chunk_overlap = st.slider(
        "Chunk overlap",
        min_value=50,
        max_value=400,
        value=150,
        step=25,
    )

    st.divider()
    st.write("Para producción, configura `GROQ_API_KEY` en secretos o variables de entorno.")
| # ------------------------- | |
| # Utilidades | |
| # ------------------------- | |
def extract_text_from_pdf(uploaded_file) -> str:
    """Return the text of every non-blank page of a PDF, joined by blank lines.

    Args:
        uploaded_file: Anything ``PdfReader`` accepts (here, the object
            returned by the Streamlit file uploader).

    Returns:
        The page texts separated by ``"\\n\\n"``. Pages whose extracted text
        is missing or whitespace-only are skipped, so the result is an empty
        string when no page yields text.
    """
    page_texts = (
        page.extract_text() or "" for page in PdfReader(uploaded_file).pages
    )
    return "\n\n".join(content for content in page_texts if content.strip())
@st.cache_resource(show_spinner=False)
def load_embeddings_model(model_name: str):
    """Return a ``HuggingFaceEmbeddings`` instance for *model_name*.

    The instance is cached with ``st.cache_resource`` so the model is
    constructed once per model name instead of on every "process PDF" click.
    The caller already shows its own spinner, hence ``show_spinner=False``.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)
def split_text_to_chunks(text: str, chunk_size: int, chunk_overlap: int):
    """Split *text* into overlapping chunks with a recursive character splitter.

    Args:
        text: The full document text.
        chunk_size: Target chunk length, measured with ``len`` (characters).
        chunk_overlap: Overlap between chunks, passed straight to the splitter.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for *text*.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators are tried in this order: paragraph, line, sentence, word, character.
        "separators": ["\n\n", "\n", ". ", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
def build_knowledge_base(uploaded_file, embedding_model_name: str, chunk_size: int, chunk_overlap: int):
    """Build a FAISS vector store from the text of an uploaded PDF.

    Args:
        uploaded_file: The PDF to index, forwarded to ``extract_text_from_pdf``.
        embedding_model_name: Model identifier forwarded to ``load_embeddings_model``.
        chunk_size: Chunk length forwarded to ``split_text_to_chunks``.
        chunk_overlap: Chunk overlap forwarded to ``split_text_to_chunks``.

    Returns:
        A ``(vectorstore, number_of_chunks)`` tuple.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    document_text = extract_text_from_pdf(uploaded_file)
    if document_text.strip() == "":
        raise ValueError("No se pudo extraer texto del PDF.")

    pieces = split_text_to_chunks(document_text, chunk_size, chunk_overlap)
    index = FAISS.from_texts(pieces, load_embeddings_model(embedding_model_name))
    return index, len(pieces)
def save_to_file(file_name: str, question: str, answer: str):
    """Append one question/answer record to the history file.

    Each record is a header line (timestamp and file name between two dashed
    rules) followed by the question, the answer and a blank line.

    Args:
        file_name: Name of the PDF the question was asked about.
        question: The user's question.
        answer: The answer that was produced.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    rule = "-" * 25
    record = "".join(
        [
            rule,
            f" {timestamp} ",
            f" ({file_name}) ",
            rule + "\n",
            f"Pregunta: {question}\n",
            f"Respuesta: {answer}\n\n",
        ]
    )
    with open(HISTORIAL_PATH, "a", encoding="utf-8") as history:
        history.write(record)
def build_rag_chain(vectorstore, groq_api_key: str, model_name: str, k: int = 4):
    """Assemble a retrieval chain over *vectorstore* answered by a Groq chat model.

    Args:
        vectorstore: Vector store exposing ``as_retriever``.
        groq_api_key: API key handed to ``ChatGroq``.
        model_name: Groq model identifier.
        k: Number of chunks the retriever is asked to return per question.

    Returns:
        The chain built by ``create_retrieval_chain``; it is invoked with
        ``{"input": question}``.
    """
    # The system prompt receives the retrieved chunks through {context}.
    system_message = (
        "Responde usando solo el contexto recuperado. "
        "Si la respuesta no está en el documento, di claramente que no aparece en el PDF. "
        "Contesta en español y de forma precisa.\n\nContexto:\n{context}"
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("human", "{input}"),
        ]
    )
    llm = ChatGroq(groq_api_key=groq_api_key, model=model_name, temperature=0)

    answer_chain = create_stuff_documents_chain(llm, prompt)
    return create_retrieval_chain(
        vectorstore.as_retriever(search_kwargs={"k": k}),
        answer_chain,
    )
def render_logs():
    """Render the session's question history in the sidebar, newest first.

    Each entry is an expander titled with its position and the first 60
    characters of the question, containing the answer. A short caption is
    shown instead when there are no entries yet.
    """
    history = st.session_state.logs
    with st.sidebar:
        st.subheader("Historial de preguntas")
        if not history:
            st.caption("Todavía no hay preguntas.")
            return
        for position, entry in enumerate(history[::-1], start=1):
            title = f"{position}. {entry['Pregunta'][:60]}"
            with st.expander(title):
                st.write(entry["Respuesta"])
# -------------------------
# Main interface
# -------------------------
pdf_obj = st.file_uploader("Carga tu documento PDF", type="pdf")

if pdf_obj is not None:
    # A different file name means a new document: discard the index, the
    # history and the cached answer that belonged to the previous one.
    if st.session_state.current_pdf_name != pdf_obj.name:
        st.session_state.current_pdf_name = pdf_obj.name
        st.session_state.logs = []
        st.session_state.knowledge_base = None
        st.session_state.last_qa = None

    col1, col2 = st.columns([1, 1])

    with col1:
        if st.button("Procesar PDF", type="primary", use_container_width=True):
            with st.spinner("Procesando PDF y creando índice vectorial..."):
                try:
                    kb, n_chunks = build_knowledge_base(
                        pdf_obj,
                        embedding_model_name,
                        chunk_size,
                        chunk_overlap,
                    )
                    st.session_state.knowledge_base = kb
                    # An answer cached from the previous index is stale now.
                    st.session_state.last_qa = None
                    st.success(f"PDF procesado correctamente. Chunks generados: {n_chunks}")
                except Exception as e:
                    # UI boundary: report any failure instead of crashing the app.
                    st.error(f"Error procesando el PDF: {e}")

    with col2:
        if st.session_state.knowledge_base is not None:
            st.success("Base vectorial lista.")
        else:
            st.info("Sube un PDF y pulsa 'Procesar PDF'.")

if not GROQ_API_KEY:
    st.warning("Falta GROQ_API_KEY. Añádela en Streamlit secrets o en variables de entorno.")
elif st.session_state.knowledge_base is not None:
    user_question = st.text_input("Haz una pregunta sobre tu PDF")

    if user_question:
        # Streamlit re-runs the whole script on every interaction (slider
        # moves, button clicks, ...). Without this cache each rerun would call
        # the LLM again and append a duplicate entry to the history. The key
        # covers everything that can change the answer for the current index.
        qa_key = (user_question, llm_model_name, k_docs)
        last_qa = st.session_state.get("last_qa")

        if last_qa is None or last_qa["key"] != qa_key:
            last_qa = None
            with st.spinner("Consultando el documento..."):
                try:
                    rag_chain = build_rag_chain(
                        st.session_state.knowledge_base,
                        GROQ_API_KEY,
                        llm_model_name,
                        k=k_docs,
                    )
                    result = rag_chain.invoke({"input": user_question})
                    last_qa = {
                        "key": qa_key,
                        "answer": result["answer"],
                        "context": result.get("context", []),
                    }
                    st.session_state.last_qa = last_qa
                    st.session_state.logs.append({
                        "Pregunta": user_question,
                        "Respuesta": last_qa["answer"],
                    })
                    # Use the name stored in the session: the uploader may be
                    # empty (pdf_obj is None) while the index is still loaded.
                    save_to_file(
                        st.session_state.current_pdf_name,
                        user_question,
                        last_qa["answer"],
                    )
                except Exception as e:
                    # UI boundary: report any failure instead of crashing the app.
                    st.error(f"Error al consultar el PDF: {e}")

        # Shown for both fresh and cached answers so the result stays visible
        # across reruns.
        if last_qa is not None:
            st.subheader("Respuesta")
            st.write(last_qa["answer"])

            with st.expander("Ver fragmentos recuperados"):
                if last_qa["context"]:
                    for i, doc in enumerate(last_qa["context"], start=1):
                        st.markdown(f"**Chunk {i}**")
                        st.write(doc.page_content)
                        st.markdown("---")
                else:
                    st.caption("No se devolvieron fragmentos.")

render_logs()