add website option
Browse files- app.py +47 -23
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -5,7 +5,7 @@ import tempfile
|
|
| 5 |
from langchain_openai import OpenAIEmbeddings
|
| 6 |
from langchain_openai.chat_models import ChatOpenAI
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
-
from langchain_community.document_loaders import PyPDFLoader
|
| 9 |
from langchain_community.document_loaders.generic import GenericLoader
|
| 10 |
from langchain_community.document_loaders.parsers import OpenAIWhisperParser
|
| 11 |
from langchain_community.document_loaders.blob_loaders.youtube_audio import (
|
|
@@ -13,7 +13,6 @@ from langchain_community.document_loaders.blob_loaders.youtube_audio import (
|
|
| 13 |
)
|
| 14 |
from langchain_community.vectorstores import Chroma
|
| 15 |
from langchain_core.messages import HumanMessage, AIMessage
|
| 16 |
-
from langchain_core.output_parsers import StrOutputParser
|
| 17 |
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
| 18 |
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
|
| 19 |
from langchain.chains.combine_documents import create_stuff_documents_chain
|
|
@@ -25,13 +24,20 @@ st.set_page_config(page_title="Chat with your data", page_icon="🤖")
|
|
| 25 |
st.title("Chat with your data")
|
| 26 |
st.header("Add your data for RAG")
|
| 27 |
|
| 28 |
-
data_type = st.radio(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
if "vectordb" not in st.session_state:
|
| 31 |
st.session_state.vectordb = None
|
| 32 |
|
| 33 |
|
| 34 |
-
def
|
| 35 |
embeddings = OpenAIEmbeddings()
|
| 36 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
|
| 37 |
texts = text_splitter.split_text(text)
|
|
@@ -42,7 +48,7 @@ def add_text_to_chroma(text):
|
|
| 42 |
return vectordb
|
| 43 |
|
| 44 |
|
| 45 |
-
def
|
| 46 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
|
| 47 |
tmp_file.write(uploaded_pdf.read())
|
| 48 |
tmp_file_path = tmp_file.name
|
|
@@ -58,7 +64,19 @@ def add_pdf_to_chroma(uploaded_pdf):
|
|
| 58 |
return vectordb
|
| 59 |
|
| 60 |
|
| 61 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
save_dir = "docs/youtube"
|
| 63 |
loader = GenericLoader(
|
| 64 |
YoutubeAudioLoader([youtube_url], save_dir), OpenAIWhisperParser()
|
|
@@ -76,21 +94,23 @@ def add_youtube_to_chroma(youtube_url):
|
|
| 76 |
if data_type == "Text":
|
| 77 |
user_text = st.text_area("Enter text data")
|
| 78 |
if st.button("Add"):
|
| 79 |
-
st.session_state.vectordb =
|
| 80 |
|
| 81 |
elif data_type == "PDF":
|
| 82 |
uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
|
| 83 |
if st.button("Add"):
|
| 84 |
-
st.session_state.vectordb =
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
else:
|
| 87 |
youtube_url = st.text_input("Enter YouTube URL")
|
| 88 |
if st.button("Add"):
|
| 89 |
-
st.session_state.vectordb =
|
| 90 |
|
| 91 |
-
llm = ChatOpenAI(
|
| 92 |
-
api_key=openai_api_key, temperature=0.2, model="gpt-3.5-turbo"
|
| 93 |
-
)
|
| 94 |
|
| 95 |
|
| 96 |
def get_context_retreiver_chain(vectordb):
|
|
@@ -113,11 +133,16 @@ def get_context_retreiver_chain(vectordb):
|
|
| 113 |
|
| 114 |
|
| 115 |
def get_conversational_rag_chain(retriever_chain):
|
| 116 |
-
prompt = ChatPromptTemplate.from_messages(
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
stuff_domain_chain = create_stuff_documents_chain(llm, prompt)
|
| 123 |
|
|
@@ -127,16 +152,15 @@ def get_conversational_rag_chain(retriever_chain):
|
|
| 127 |
def get_response(user_input):
|
| 128 |
if st.session_state.vectordb is None:
|
| 129 |
return "Please add data first"
|
| 130 |
-
|
| 131 |
retrieveal_chain = get_context_retreiver_chain(st.session_state.vectordb)
|
| 132 |
converasational_rag_chain = get_conversational_rag_chain(retrieveal_chain)
|
| 133 |
|
| 134 |
-
response = converasational_rag_chain.invoke(
|
| 135 |
-
"chat_history": st.session_state.chat_history,
|
| 136 |
-
|
| 137 |
-
})
|
| 138 |
|
| 139 |
-
return response[
|
| 140 |
|
| 141 |
|
| 142 |
user_query = st.chat_input("Your message")
|
|
|
|
| 5 |
from langchain_openai import OpenAIEmbeddings
|
| 6 |
from langchain_openai.chat_models import ChatOpenAI
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
+
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
|
| 9 |
from langchain_community.document_loaders.generic import GenericLoader
|
| 10 |
from langchain_community.document_loaders.parsers import OpenAIWhisperParser
|
| 11 |
from langchain_community.document_loaders.blob_loaders.youtube_audio import (
|
|
|
|
| 13 |
)
|
| 14 |
from langchain_community.vectorstores import Chroma
|
| 15 |
from langchain_core.messages import HumanMessage, AIMessage
|
|
|
|
| 16 |
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
| 17 |
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
|
| 18 |
from langchain.chains.combine_documents import create_stuff_documents_chain
|
|
|
|
| 24 |
st.title("Chat with your data")
|
| 25 |
st.header("Add your data for RAG")
|
| 26 |
|
| 27 |
+
# Source types offered in the selector, in display order.
data_type_options = ("Text", "PDF", "Website", "YouTube")
data_type = st.radio("Choose the type of data to add:", data_type_options)

# Show the cost notice only while the YouTube option is selected.
if data_type == "YouTube":
    youtube_cost_note = (
        "Note: Processing YouTube videos can be quite costly for me in terms of money. "
        "Please use this option sparingly. Thank you for your understanding!"
    )
    st.warning(youtube_cost_note)

# Make sure the vector store slot exists in the session before anything reads it.
if "vectordb" not in st.session_state:
    st.session_state.vectordb = None
|
| 38 |
|
| 39 |
|
| 40 |
+
def get_vectordb_from_text(text):
|
| 41 |
embeddings = OpenAIEmbeddings()
|
| 42 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
|
| 43 |
texts = text_splitter.split_text(text)
|
|
|
|
| 48 |
return vectordb
|
| 49 |
|
| 50 |
|
| 51 |
+
def get_vectordb_from_pdf(uploaded_pdf):
|
| 52 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
|
| 53 |
tmp_file.write(uploaded_pdf.read())
|
| 54 |
tmp_file_path = tmp_file.name
|
|
|
|
| 64 |
return vectordb
|
| 65 |
|
| 66 |
|
| 67 |
+
def get_vectordb_from_website(website_url):
    """Build a Chroma vector store from the content of a web page.

    Args:
        website_url: URL handed to ``WebBaseLoader``.

    Returns:
        The ``Chroma`` store created from the split page documents.
    """
    loader = WebBaseLoader(website_url)
    pages = loader.load()
    embeddings = OpenAIEmbeddings()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(pages)
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
    )
    # The caller assigns this result to st.session_state.vectordb; without the
    # return the session would hold None and chat would keep asking for data.
    return vectordb
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def get_vectordb_from_youtube(youtube_url):
|
| 80 |
save_dir = "docs/youtube"
|
| 81 |
loader = GenericLoader(
|
| 82 |
YoutubeAudioLoader([youtube_url], save_dir), OpenAIWhisperParser()
|
|
|
|
| 94 |
# Render the input widget for the selected source and, when "Add" is clicked,
# build the vector store and keep it in the session. Each branch checks that
# the user actually supplied input before calling the loader.
if data_type == "Text":
    user_text = st.text_area("Enter text data")
    if st.button("Add"):
        if user_text and user_text.strip():
            st.session_state.vectordb = get_vectordb_from_text(user_text)
        else:
            st.warning("Please enter some text before adding.")

elif data_type == "PDF":
    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if st.button("Add"):
        # The PDF helper calls uploaded_pdf.read(), so a missing upload must
        # be caught here rather than passed through.
        if uploaded_pdf is not None:
            st.session_state.vectordb = get_vectordb_from_pdf(uploaded_pdf)
        else:
            st.warning("Please upload a PDF before adding.")

elif data_type == "Website":
    website_url = st.text_input("Enter website URL")
    if st.button("Add"):
        if website_url and website_url.strip():
            st.session_state.vectordb = get_vectordb_from_website(website_url)
        else:
            st.warning("Please enter a website URL before adding.")

else:
    youtube_url = st.text_input("Enter YouTube URL")
    if st.button("Add"):
        if youtube_url and youtube_url.strip():
            st.session_state.vectordb = get_vectordb_from_youtube(youtube_url)
        else:
            st.warning("Please enter a YouTube URL before adding.")

# Chat model used by the chain-building functions further down the file.
llm = ChatOpenAI(api_key=openai_api_key, temperature=0.2, model="gpt-3.5-turbo")
|
|
|
|
|
|
|
| 114 |
|
| 115 |
|
| 116 |
def get_context_retreiver_chain(vectordb):
|
|
|
|
| 133 |
|
| 134 |
|
| 135 |
def get_conversational_rag_chain(retriever_chain):
|
| 136 |
+
prompt = ChatPromptTemplate.from_messages(
|
| 137 |
+
[
|
| 138 |
+
(
|
| 139 |
+
"system",
|
| 140 |
+
"Answer the user's questions based on the below context:\n\n{context}",
|
| 141 |
+
),
|
| 142 |
+
MessagesPlaceholder(variable_name="chat_history"),
|
| 143 |
+
("user", "{input}"),
|
| 144 |
+
]
|
| 145 |
+
)
|
| 146 |
|
| 147 |
stuff_domain_chain = create_stuff_documents_chain(llm, prompt)
|
| 148 |
|
|
|
|
| 152 |
def get_response(user_input):
    """Answer ``user_input`` using the vector store held in the session.

    Returns the literal prompt "Please add data first" when no vector store
    has been loaded yet; otherwise returns the "answer" entry of the
    conversational RAG chain's result.
    """
    vectordb = st.session_state.vectordb
    if vectordb is None:
        return "Please add data first"

    # Build the retriever chain first, then wrap it in the conversational chain.
    retriever_chain = get_context_retreiver_chain(vectordb)
    rag_chain = get_conversational_rag_chain(retriever_chain)

    payload = {"chat_history": st.session_state.chat_history, "input": user_input}
    result = rag_chain.invoke(payload)
    return result["answer"]
|
| 164 |
|
| 165 |
|
| 166 |
user_query = st.chat_input("Your message")
|
requirements.txt
CHANGED
|
@@ -3,6 +3,7 @@ langchain_community
|
|
| 3 |
langchain_openai
|
| 4 |
langchain_pinecone
|
| 5 |
pypdf
|
|
|
|
| 6 |
yt_dlp
|
| 7 |
pydub
|
| 8 |
chromadb
|
|
|
|
| 3 |
langchain_openai
|
| 4 |
langchain_pinecone
|
| 5 |
pypdf
|
| 6 |
+
beautifulsoup4
|
| 7 |
yt_dlp
|
| 8 |
pydub
|
| 9 |
chromadb
|