Spaces:

Segizu
/

PDF_CHATBOT

Build error

App Files Files Community

Segizu committed on Feb 17, 2025

Commit

f585950

1 Parent(s): 23cc930

prompting

Browse files

Files changed (5) hide show

.env +1 -0
app.py +154 -67
faiss_db/index.faiss +0 -0
faiss_db/index.pkl +3 -0
requirements.txt +3 -1

.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ OPENAI_API_KEY=<REDACTED — a live API key was committed here; revoke it and supply the value through the Space's secret settings instead of a tracked .env file>

app.py CHANGED Viewed

@@ -1,99 +1,186 @@
 import streamlit as st
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.tools.retriever import create_retriever_tool
-from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain.agents import AgentExecutor, create_tool_calling_agent
-import os
-os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
 embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
-def pdf_read(pdf_doc):
     text = ""
-    for pdf in pdf_doc:
         pdf_reader = PdfReader(pdf)
         for page in pdf_reader.pages:
-            text += page.extract_text()
     return text
 def get_chunks(text):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    chunks = text_splitter.split_text(text)
-    return chunks
-def vector_store(text_chunks):
     vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
-    vector_store.save_local("faiss_db")
-def get_conversational_chain(tools,ques):
-    #os.environ["ANTHROPIC_API_KEY"]=os.getenv["ANTHROPIC_API_KEY"]
-    #llm = ChatAnthropic(model="claude-3-sonnet-20240229", temperature=0, api_key=os.getenv("ANTHROPIC_API_KEY"),verbose=True)
-    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, api_key="")
-    prompt = ChatPromptTemplate.from_messages(
-    [
         (
             "system",
-            """You are a helpful assistant. Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
-    provided context just say, "answer is not available in the context", don't provide the wrong answer""",
         ),
         ("placeholder", "{chat_history}"),
         ("human", "{input}"),
         ("placeholder", "{agent_scratchpad}"),
-    ]
-)
-    tool=[tools]
-    agent = create_tool_calling_agent(llm, tool, prompt)
-    agent_executor = AgentExecutor(agent=agent, tools=tool, verbose=True)
-    response=agent_executor.invoke({"input": ques})
-    print(response)
-    st.write("Reply: ", response['output'])
-def user_input(user_question):
-    new_db = FAISS.load_local("faiss_db", embeddings,allow_dangerous_deserialization=True)
-    retriever=new_db.as_retriever()
-    retrieval_chain= create_retriever_tool(retriever,"pdf_extractor","This tool is to give answer to queries from the pdf")
-    get_conversational_chain(retrieval_chain,user_question)
 def main():
-    st.set_page_config("Chat PDF")
-    st.header("RAG based Chat with PDF")
-    user_question = st.text_input("Ask a Question from the PDF Files")
-    if user_question:
-        user_input(user_question)
     with st.sidebar:
-        st.title("Menu:")
-        pdf_doc = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True)
-        if st.button("Submit & Process"):
-            with st.spinner("Processing..."):
-                raw_text = pdf_read(pdf_doc)
-                text_chunks = get_chunks(raw_text)
-                vector_store(text_chunks)
-                st.success("Done")
 if __name__ == "__main__":
-    main()

+import os
 import streamlit as st
+from dotenv import load_dotenv
+# Lectura y procesamiento de PDFs
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+# Embeddings y VectorStores
 from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
 from langchain_community.vectorstores import FAISS
+# LLM y Herramientas
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
 from langchain.tools.retriever import create_retriever_tool
 from langchain.agents import AgentExecutor, create_tool_calling_agent
+# Load environment variables
+load_dotenv()
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # Sometimes needed on Windows or in specific environments
+# Initialise the spaCy embedding model
 embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
+# -----------------------------------------------------------
+# Helper functions
+# -----------------------------------------------------------
+def pdf_read(pdf_docs):
+    """
+    Read every PDF in ``pdf_docs`` and return their text as one string.
+
+    Each item is handed to ``PdfReader``; the extracted text of all pages is
+    concatenated in order, with no separator between pages or documents.
+    """
     text = ""
+    for pdf in pdf_docs:
         pdf_reader = PdfReader(pdf)
         for page in pdf_reader.pages:
+            # ``or ""`` keeps the concatenation valid when extract_text() returns a falsy value
+            text += page.extract_text() or ""
     return text
 def get_chunks(text):
+    """
+    Split ``text`` into chunks so it can be indexed in FAISS.
+
+    Uses a ``RecursiveCharacterTextSplitter`` configured with
+    ``chunk_size=1000`` and ``chunk_overlap=200`` and returns whatever its
+    ``split_text`` call produces.
+    """
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    return text_splitter.split_text(text)
+def create_vector_store(text_chunks):
+    """
+    Build and return a FAISS vector store from ``text_chunks``.
+
+    The chunks are embedded with the module-level ``embeddings`` object.
+    NOTE(review): an empty ``text_chunks`` is passed straight through to
+    ``FAISS.from_texts``; presumably that call fails on empty input — confirm.
+    """
     vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
+    return vector_store
+def get_conversational_chain(tool, question):
+    """
+    Answer ``question`` with a tool-calling agent whose only tool is ``tool``.
+
+    Builds a ``ChatOpenAI`` model (API key read from the ``OPENAI_API_KEY``
+    environment variable), a chat prompt and an ``AgentExecutor``, invokes the
+    executor with ``{"input": question}`` and returns the ``"output"`` entry
+    of the response.
+    """
+    api_key = os.getenv("OPENAI_API_KEY")
+    # LLM model (adapt model_name to whatever you have available)
+    llm = ChatOpenAI(
+        model_name="gpt-4o-mini",  # Or "gpt-3.5-turbo", etc.
+        temperature=0.4,
+        api_key=api_key
+    )
+    # Prompt template
+    prompt = ChatPromptTemplate.from_messages([
         (
             "system",
+            """Eres un asistente útil. Responde la pregunta de la forma más completa posible
+            utilizando solo el contexto disponible. Si la respuesta no está en el contexto,
+            di: "answer is not available in the context"."""
         ),
         ("placeholder", "{chat_history}"),
         ("human", "{input}"),
         ("placeholder", "{agent_scratchpad}"),
+    ])
+    # Create the agent with the tool and run it.
+    # Only "input" is supplied to invoke(); no chat_history value is passed here.
+    agent = create_tool_calling_agent(llm, tools=[tool], prompt=prompt)
+    agent_executor = AgentExecutor(agent=agent, tools=[tool], verbose=False)
+    response = agent_executor.invoke({"input": question})
+    return response["output"]
+def generate_answer(user_question):
+    """
+    Answer ``user_question`` using the vector store kept in ``st.session_state``.
+
+    Returns a fixed notice string when no store is present under the
+    ``"faiss_db"`` key; otherwise wraps the store's retriever in a retriever
+    tool and returns the result of ``get_conversational_chain``.
+    """
+    # Check whether a FAISS store has been loaded
+    if "faiss_db" not in st.session_state or st.session_state["faiss_db"] is None:
+        return "No hay PDF(s) procesado(s). Por favor, carga y procesa algún PDF."
+    # Build the retrieval tool
+    db = st.session_state["faiss_db"]
+    retriever = db.as_retriever()
+    retrieval_tool = create_retriever_tool(
+        retriever,
+        name="pdf_extractor",
+        description="This tool gives answers to queries from the PDF(s)."
+    )
+    # Get the final answer through the conversational chain
+    answer = get_conversational_chain(retrieval_tool, user_question)
+    return answer
+# -----------------------------------------------------------
+# Aplicación principal
+# -----------------------------------------------------------
 def main():
+    """
+    Run the Streamlit "Chat PDF" page.
+
+    The sidebar uploads PDFs and builds (or clears) the FAISS store kept in
+    ``st.session_state["faiss_db"]``. The main area shows the chat history
+    from ``st.session_state["messages"]`` and a question form.
+
+    The question is collected through a form with ``clear_on_submit=True`` so
+    that it is handled exactly once: a plain ``st.text_input`` keeps its value
+    across reruns, which would re-send the same question (and append duplicate
+    history entries) every time any other widget triggers a rerun.
+    """
+    st.set_page_config(page_title="Chat PDF", layout="wide")
+    st.header("RAG-based Chat con PDF")
+    # Initialise the message history in session_state if it does not exist yet
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = []
+    # Initialise the vector store slot (None until PDFs have been processed)
+    if "faiss_db" not in st.session_state:
+        st.session_state["faiss_db"] = None
+    # ----------------------------------------------------------------
+    # SIDEBAR: upload and process PDFs
+    # ----------------------------------------------------------------
     with st.sidebar:
+        st.title("Menú:")
+        pdf_docs = st.file_uploader(
+            "Sube tus archivos PDF y haz clic en 'Procesar PDFs'.",
+            accept_multiple_files=True
+        )
+        if st.button("Procesar PDFs"):
+            if pdf_docs:
+                with st.spinner("Procesando..."):
+                    # Read the PDFs and split their text into chunks
+                    raw_text = pdf_read(pdf_docs)
+                    text_chunks = get_chunks(raw_text)
+                    # Only build the store when there is text to index
+                    # (e.g. scanned PDFs may yield no extractable text)
+                    if text_chunks:
+                        st.session_state["faiss_db"] = create_vector_store(text_chunks)
+                if text_chunks:
+                    st.success("¡Hecho! Se han indexado los PDF.")
+                else:
+                    st.error("No se pudo extraer texto de los PDF seleccionados.")
+            else:
+                st.warning("No has seleccionado ningún PDF.")
+        # Option to drop the vector store and upload other PDFs
+        if st.button("Borrar vector store"):
+            st.session_state["faiss_db"] = None
+            st.info("Vector store borrado. Ahora puedes subir nuevos PDFs.")
+    # ----------------------------------------------------------------
+    # MAIN CHAT
+    # ----------------------------------------------------------------
+    st.subheader("Chat")
+    # Reserve the history area above the form; it is filled in last so the
+    # exchange submitted in this run is visible immediately.
+    history_area = st.container()
+    with st.form("chat_form", clear_on_submit=True):
+        user_input = st.text_input("Escribe tu pregunta aquí...")
+        submitted = st.form_submit_button("Enviar")
+    if submitted and user_input.strip():
+        # Store the user's message
+        st.session_state["messages"].append({"role": "user", "content": user_input})
+        # Generate the answer and store it in the history
+        answer = generate_answer(user_input)
+        st.session_state["messages"].append({"role": "assistant", "content": answer})
+    with history_area:
+        for msg in st.session_state["messages"]:
+            st.write(f"**{msg['role'].capitalize()}:** {msg['content']}")
 if __name__ == "__main__":
+    main()

faiss_db/index.faiss ADDED Viewed

Binary file (53.4 kB). View file

faiss_db/index.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1940baa6a5f93292bd16ac70eed201d34e88a1d855e08238d3eacf194a343f73
+size 147745

requirements.txt CHANGED Viewed

@@ -8,4 +8,6 @@ langchain-anthropic
 langchain-openai
 faiss-cpu
 python-dotenv
-spacy

 langchain-openai
 faiss-cpu
 python-dotenv
+spacy
+# spaCy model packages are normally installed from the spacy-models release wheel
+# rather than by name from PyPI, so reference the wheel directly.
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
+altair==4.2.2