Spaces: Build error
import os

import streamlit as st
from dotenv import load_dotenv

# PDF reader
from PyPDF2 import PdfReader

# Text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embeddings and vector store
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.vectorstores import FAISS

# LangChain components for RAG
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory

from mi_prompt import tu_prompt_personalizado

# Load .env if you need it
load_dotenv()

# One-off tweak (optional, for some Windows environments)
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# spaCy embeddings (you can swap in OpenAIEmbeddings, etc.)
embeddings = SpacyEmbeddings(model_name="en_core_web_sm")


# ---------------------------------------------
# Helper functions
# ---------------------------------------------
def read_pdfs(pdf_files):
    """Read each PDF and concatenate its text."""
    text = ""
    for pdf_file in pdf_files:
        pdf_reader = PdfReader(pdf_file)
        for page in pdf_reader.pages:
            text += page.extract_text() or ""
    return text


def chunk_text(text):
    """Split the text into chunks."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_text(text)
    return chunks


def create_vectorstore(chunks):
    """Build the FAISS vector store from the list of chunks."""
    vectorstore = FAISS.from_texts(chunks, embedding=embeddings)
    return vectorstore


# ---------------------------------------------
# Main application
# ---------------------------------------------
def main():
    st.set_page_config(page_title="Chat PDF (RAG)", layout="wide")
    st.header("RAG-based Chat with PDFs")

    # Initialize the conversation state for the app
    if "conversation_chain" not in st.session_state:
        st.session_state["conversation_chain"] = None

    # Keep the chat history in session_state (for the UI)
    if "messages" not in st.session_state:
        st.session_state["messages"] = []

    # Sidebar: upload PDFs and process them
    with st.sidebar:
        st.title("Menu:")
        uploaded_pdfs = st.file_uploader(
            "Upload your PDFs and click 'Process PDFs'.",
            accept_multiple_files=True
        )
        if st.button("Process PDFs"):
            if uploaded_pdfs:
                with st.spinner("Processing and indexing..."):
                    # 1) Read the PDFs
                    raw_text = read_pdfs(uploaded_pdfs)
                    # 2) Split the text into chunks
                    text_chunks = chunk_text(raw_text)
                    # 3) Build the FAISS vector store
                    vectorstore = create_vectorstore(text_chunks)
                    # 4) Build the conversational retrieval chain
                    #    - ConversationalRetrievalChain handles question + context
                    llm = ChatOpenAI(
                        model_name="gpt-4o-mini",  # or "gpt-4", depending on your access
                        temperature=0
                    )
                    # Conversation memory
                    memory = ConversationBufferMemory(
                        memory_key="chat_history",
                        return_messages=True
                    )
                    # Create the RAG chain:
                    conversation_chain = ConversationalRetrievalChain.from_llm(
                        llm=llm,
                        retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
                        memory=memory,
                        # Optionally, adjust how the question is combined with the documents:
                        combine_docs_chain_kwargs={"prompt": tu_prompt_personalizado},
                    )
                    # Store the chain in session_state
                    st.session_state["conversation_chain"] = conversation_chain
                    st.success("PDFs processed and vector store created!")
            else:
                st.warning("You did not upload any PDFs.")

        # Reset button
        if st.button("Reset VectorStore"):
            st.session_state["conversation_chain"] = None
            st.session_state["messages"] = []
            st.info("Vector store reset. Upload new PDFs if you wish.")

    st.subheader("Chat with your PDFs")

    # Show the previous history
    for msg in st.session_state["messages"]:
        st.write(f"**{msg['role'].capitalize()}:** {msg['content']}")

    # User input
    user_input = st.text_input("Ask a question about the PDF(s)...")
    if user_input:
        if st.session_state["conversation_chain"] is None:
            st.warning("No PDFs have been processed. Upload and process at least one PDF.")
        else:
            # Store the user message in the history
            st.session_state["messages"].append({"role": "user", "content": user_input})
            # Use the conversational chain to get an answer
            response = st.session_state["conversation_chain"]({
                "question": user_input
            })
            # ConversationalRetrievalChain returns its output under the "answer" key by default
            answer = response["answer"]
            # Append the answer to the history
            st.session_state["messages"].append({"role": "assistant", "content": answer})
            # Show the answer
            st.write(f"**Assistant:** {answer}")


if __name__ == "__main__":
    main()
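The app imports `tu_prompt_personalizado` from a local `mi_prompt` module that is not shown above, so the file below is only a minimal sketch: the template wording is an assumption. What is not an assumption is the set of input variables, because the default stuff-documents chain that `ConversationalRetrievalChain` builds fills `context` and `question`.

```python
# mi_prompt.py -- minimal sketch; the real prompt text is not shown in the Space,
# so this wording is an assumption. The variables "context" and "question" are
# required by the default combine-docs ("stuff") chain of ConversationalRetrievalChain.
from langchain.prompts import PromptTemplate

tu_prompt_personalizado = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "Answer the question using only the context below. "
        "If the answer is not in the context, say you don't know.\n\n"
        "Context:\n{context}\n\n"
        "Question: {question}\n"
        "Answer:"
    ),
)
```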
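The build log for the Space is not shown, but a common cause of a build error is an import with no matching entry in `requirements.txt`. The package names below follow directly from the imports in app.py; any version pins you add are your own choice. Note that `SpacyEmbeddings(model_name="en_core_web_sm")` also needs that spaCy model available at runtime (for example via `python -m spacy download en_core_web_sm` or an equivalent install step), which `pip install spacy` alone does not provide.

```text
# requirements.txt -- sketch derived from the imports in app.py; versions omitted on purpose
streamlit
python-dotenv
PyPDF2
langchain
langchain-community
openai
faiss-cpu
spacy
```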