Spaces:

hail75
/

chatbot

Sleeping

App Files Files Community

hail75 committed on Jul 17, 2024

Commit

7f07a51

1 Parent(s): a007d8f

add rag

Browse files

Files changed (9) hide show

.vscode/launch.json +17 -0
app.py +128 -65
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/data_level0.bin +0 -3
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/header.bin +0 -3
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/length.bin +0 -3
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/link_lists.bin +0 -0
docs/chroma/chroma.sqlite3 +0 -0
docs/ttdn.pdf +0 -0
requirements.txt +1 -0

.vscode/launch.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python:Streamlit",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "streamlit",
+            "args": [
+                "run",
+                "${file}",
+                "--server.port",
+                "2000"
+            ]
+        }
+    ]
+}

app.py CHANGED Viewed

@@ -7,11 +7,18 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.document_loaders.generic import GenericLoader
 from langchain_community.document_loaders.parsers import OpenAIWhisperParser
-from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
 from langchain_community.vectorstores import Chroma
-from langchain_core.prompts import ChatPromptTemplate
-from langchain.memory import ConversationBufferMemory
-from langchain.chains import RetrievalQA
 st.set_page_config(page_title="Chat with your data", page_icon="🤖")
 st.title("Chat with your data")
@@ -19,79 +26,135 @@ st.header("Add your data for RAG")
 data_type = st.radio("Choose the type of data to add:", ("Text", "PDF", "YouTube URL"))
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
-pages = None
 if data_type == "Text":
     user_text = st.text_area("Enter text data")
     if st.button("Add"):
-        pages = user_text
 elif data_type == "PDF":
     uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
     if st.button("Add"):
-        loader = PyPDFLoader("docs/ttdn.pdf")
-        pages = loader.load()
-elif data_type == "YouTube URL":
     youtube_url = st.text_input("Enter YouTube URL")
     if st.button("Add"):
-        save_dir="docs/youtube"
-        loader = GenericLoader(
-            YoutubeAudioLoader([youtube_url], save_dir),
-            OpenAIWhisperParser()
-        )
-        pages = loader.load()
 llm = ChatOpenAI(
-    api_key=os.environ.get("OPENAI_API_KEY"),
-    temperature=0.2,
-    model='gpt-3.5-turbo')
-template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible.
-Context: {context}
-Question: {question}
-Helpful Answer:"""
-prompt = ChatPromptTemplate.from_template(template)
-if pages:
-    embedding = OpenAIEmbeddings()
-    if data_type == "Text":
-        texts = text_splitter.split_text(pages)
-        vectordb = Chroma.from_texts(
-            texts=texts,
-            embedding=embedding,
-            persist_directory='docs/chroma/'
-        )
-    else:
-        docs = text_splitter.split_documents(pages)
-        vectordb = Chroma.from_documents(
-            documents=docs,
-            embedding=embedding,
-            persist_directory='docs/chroma/'
-        )
-    qa_chain = RetrievalQA.from_chain_type(
-        llm,
-        retriever=vectordb.as_retriever(),
-        return_source_documents=True,
-        chain_type_kwargs={"prompt": prompt}
     )
-    result = qa_chain.invoke({"query": "What is BSM Labs"})
-    st.write(result["result"])
-#     st.session_state.retriever = vectordb.as_retriever()
-# if "retriever" in st.session_state:
-#     user_query = st.chat_input("Ask a question")
-#     if user_query:
-#         chain = prompt | llm | parser
-#         response = chain.invoke(input={
-#             "context": st.session_state.retriever,
-#             "question": user_query
-#         })
-#         st.write(response)

 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.document_loaders.generic import GenericLoader
 from langchain_community.document_loaders.parsers import OpenAIWhisperParser
+from langchain_community.document_loaders.blob_loaders.youtube_audio import (
+    YoutubeAudioLoader,
+)
 from langchain_community.vectorstores import Chroma
+from langchain_core.messages import HumanMessage, AIMessage
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+openai_api_key = os.getenv("OPENAI_API_KEY")
 st.set_page_config(page_title="Chat with your data", page_icon="🤖")
 st.title("Chat with your data")
 data_type = st.radio("Choose the type of data to add:", ("Text", "PDF", "YouTube URL"))
+if "vectordb" not in st.session_state:
+    st.session_state.vectordb = None
+def add_text_to_chroma(text):
+    """Chunk raw text and index the chunks in a new Chroma vector store.
+
+    Args:
+        text: The raw text to index.
+
+    Returns:
+        The Chroma vector store created from the embedded chunks.
+    """
+    embedder = OpenAIEmbeddings()
+    # chunk_size=1000 with chunk_overlap=150 matches the other add_* helpers.
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    chunks = splitter.split_text(text)
+    return Chroma.from_texts(texts=chunks, embedding=embedder)
+def add_pdf_to_chroma(uploaded_pdf):
+    """Load a PDF, chunk its pages and index them in a new Chroma vector store.
+
+    Args:
+        uploaded_pdf: Either a filesystem path to a PDF, or a file-like upload
+            object exposing ``getvalue()`` (what ``st.file_uploader`` is
+            passed in from the UI code).
+
+    Returns:
+        The Chroma vector store created from the embedded page chunks.
+
+    Raises:
+        ValueError: If no file was provided (``uploaded_pdf`` is ``None``).
+    """
+    import tempfile
+
+    if uploaded_pdf is None:
+        raise ValueError("No PDF file was uploaded")
+    if isinstance(uploaded_pdf, (str, os.PathLike)):
+        # Already a path on disk: load it directly, as the original code did.
+        pages = PyPDFLoader(str(uploaded_pdf)).load()
+    else:
+        # PyPDFLoader takes a file path, not an in-memory upload, so spool the
+        # bytes to a temporary file first. delete=False lets the loader reopen
+        # the file by name after this handle has been closed.
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+            tmp.write(uploaded_pdf.getvalue())
+            tmp_path = tmp.name
+        try:
+            pages = PyPDFLoader(tmp_path).load()
+        finally:
+            os.remove(tmp_path)
+    embeddings = OpenAIEmbeddings()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    docs = text_splitter.split_documents(pages)
+    return Chroma.from_documents(documents=docs, embedding=embeddings)
+def add_youtube_to_chroma(youtube_url):
+    """Load a YouTube URL via the audio loader and Whisper parser, then index it.
+
+    Args:
+        youtube_url: URL of the video to load.
+
+    Returns:
+        The Chroma vector store created from the embedded document chunks.
+    """
+    # Audio files fetched by the loader are saved under docs/youtube.
+    audio_loader = YoutubeAudioLoader([youtube_url], "docs/youtube")
+    loaded_docs = GenericLoader(audio_loader, OpenAIWhisperParser()).load()
+    embedder = OpenAIEmbeddings()
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    chunks = splitter.split_documents(loaded_docs)
+    # NOTE(review): this is the only add_* helper that passes a
+    # persist_directory; the text and PDF helpers do not. Confirm whether
+    # persisting to "chroma" is intended here.
+    return Chroma.from_documents(
+        documents=chunks,
+        embedding=embedder,
+        persist_directory="chroma",
+    )
 if data_type == "Text":
     user_text = st.text_area("Enter text data")
     if st.button("Add"):
+        st.session_state.vectordb = add_text_to_chroma(user_text)
 elif data_type == "PDF":
     uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
     if st.button("Add"):
+        st.session_state.vectordb = add_pdf_to_chroma(uploaded_pdf)
+else:
     youtube_url = st.text_input("Enter YouTube URL")
     if st.button("Add"):
+        st.session_state.vectordb = add_youtube_to_chroma(youtube_url)
 llm = ChatOpenAI(
+    api_key=openai_api_key, temperature=0.2, model="gpt-3.5-turbo"
+)
+def get_context_retreiver_chain(vectordb):
+    """Build a history-aware retriever on top of the given vector store.
+
+    Args:
+        vectordb: Vector store exposing ``as_retriever()``.
+
+    Returns:
+        The chain produced by ``create_history_aware_retriever`` using the
+        module-level ``llm``.
+    """
+    # The prompt replays the chat history and the latest input, then asks the
+    # model for a search query covering the whole conversation.
+    search_query_prompt = ChatPromptTemplate.from_messages(
+        [
+            MessagesPlaceholder(variable_name="chat_history"),
+            ("user", "{input}"),
+            (
+                "user",
+                "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation",
+            ),
+        ]
     )
+    return create_history_aware_retriever(
+        llm, vectordb.as_retriever(), search_query_prompt
+    )
+def get_conversational_rag_chain(retriever_chain):
+    """Combine a retriever chain with a document-stuffing answer chain.
+
+    Args:
+        retriever_chain: The retriever chain passed to ``create_retrieval_chain``.
+
+    Returns:
+        The chain produced by ``create_retrieval_chain``.
+    """
+    # The system message receives the retrieved documents through {context}.
+    answer_messages = [
+        ("system", "Answer the user's questions based on the below context:\n\n{context}"),
+        MessagesPlaceholder(variable_name="chat_history"),
+        ("user", "{input}"),
+    ]
+    answer_prompt = ChatPromptTemplate.from_messages(answer_messages)
+    document_chain = create_stuff_documents_chain(llm, answer_prompt)
+    return create_retrieval_chain(retriever_chain, document_chain)
+def get_response(user_input):
+    """Answer a user message with the conversational RAG chain.
+
+    Args:
+        user_input: The message the user just submitted.
+
+    Returns:
+        The answer text, or a fixed hint string when no data has been added
+        to the session's vector store yet.
+    """
+    vectordb = st.session_state.vectordb
+    if vectordb is None:
+        return "Please add data first"
+    retrieval_chain = get_context_retreiver_chain(vectordb)
+    conversational_rag_chain = get_conversational_rag_chain(retrieval_chain)
+    response = conversational_rag_chain.invoke(
+        {
+            "chat_history": st.session_state.chat_history,
+            "input": user_input,
+        }
+    )
+    # The retrieval chain returns a dict; the generated text is under
+    # "answer". Callers hand the result to st.markdown and AIMessage, which
+    # need the text itself, matching the string returned by the branch above.
+    return response["answer"]
+# Chat input widget, plus a per-session history list created on first run.
+user_query = st.chat_input("Your message")
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+# Replay the stored turns; anything that is not a HumanMessage is shown as AI.
+for message in st.session_state.chat_history:
+    speaker = "Human" if isinstance(message, HumanMessage) else "AI"
+    with st.chat_message(speaker):
+        st.markdown(message.content)
+# A truthiness test already excludes the empty string, so no separate
+# comparison against "" is needed.
+if user_query:
+    with st.chat_message("Human"):
+        st.markdown(user_query)
+    with st.chat_message("AI"):
+        ai_response = get_response(user_query)
+        st.markdown(ai_response)
+    # Record both sides of the turn so the next run can replay them.
+    st.session_state.chat_history.append(HumanMessage(user_query))
+    st.session_state.chat_history.append(AIMessage(ai_response))

docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/data_level0.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f18abd8c514282db82706e52b0a33ed659cd534e925a6f149deb7af9ce34bd8e
-size 6284000

docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/header.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:effaa959ce2b30070fdafc2fe82096fc46e4ee7561b75920dd3ce43d09679b21
-size 100

docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/length.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
-size 4000

docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/link_lists.bin DELETED Viewed

File without changes

docs/chroma/chroma.sqlite3 DELETED Viewed

Binary file (479 kB)

docs/ttdn.pdf DELETED Viewed

Binary file (147 kB)

requirements.txt CHANGED Viewed

@@ -1,6 +1,7 @@
 langchain
 langchain_community
 langchain_openai
 pypdf
 yt_dlp
 pydub

 langchain
 langchain_community
 langchain_openai
+langchain_pinecone
 pypdf
 yt_dlp
 pydub