emgoggles committed on
Commit
72c6641
·
verified ·
1 Parent(s): a936558

Upload 3 files

Browse files
Files changed (3) hide show
  1. assets/logo_harve.png +0 -0
  2. harve_app.py +221 -0
  3. requirements.txt +12 -0
assets/logo_harve.png ADDED
harve_app.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # STREAMLIT VERSION 2.1 - PDF WORKING
2
+
3
+ import streamlit as st
4
+ from langchain_core.messages import AIMessage, HumanMessage
5
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
6
+ from langchain_community.document_loaders import WebBaseLoader, YoutubeLoader
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain_community.vectorstores import Qdrant
9
+ from langchain_openai import OpenAIEmbeddings
10
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
11
+ from langchain.chains.combine_documents import create_stuff_documents_chain
12
+ from langchain_openai import ChatOpenAI
13
+ from PIL import Image
14
+ from PyPDF2 import PdfReader
15
+ # from dotenv import load_dotenv
16
+
17
+ # Load secrets from .env file
18
+ # load_dotenv()
19
+
20
+
21
def extract_data_from_url(url):
    '''
    Load a web page and hand back whatever the loader produces.

    args: url (str): The url of the web page to extract content from

    returns: the result of `WebBaseLoader(url).load()`
    '''
    return WebBaseLoader(url).load()
31
+
32
+
33
def extract_transcript_from_youtube_url(youtube_url):
    '''
    Load the transcript of a YouTube video.

    args: youtube_url (str): The url of the YouTube video

    returns: the result of the YouTube loader's `load()` call
    '''
    # Video metadata is explicitly not requested (add_video_info=False).
    loader = YoutubeLoader.from_youtube_url(youtube_url, add_video_info=False)
    return loader.load()
44
+
45
+
46
def create_vectorstore_from_pdf(uploaded_pdf):
    '''
    Extract the text content of a PDF file, embed it and store in a vector db.

    args: uploaded_pdf (file): PDF file object that `PdfReader` can read

    returns: the Qdrant vector store built from the PDF text chunks
             (in-memory, collection "HarveDocs")

    raises: ValueError if no text could be extracted from the PDF
    '''
    pdf_reader = PdfReader(uploaded_pdf)

    # Build the text in one join instead of repeated `+=`; `or ""` keeps a
    # page that yields no text from breaking the concatenation.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Without any text there are no chunks to embed, and building the vector
    # store would fail with an obscure error. Fail early with a clear message.
    if not text.strip():
        raise ValueError("No extractable text found in the uploaded PDF.")

    # NOTE(review): "\n" is listed before "\n\n", so the paragraph separator
    # presumably never gets a chance to apply - confirm the intended order.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n", "\n\n", "\r", "\t", " "],
        chunk_size=1000,
        chunk_overlap=0,
    )
    text_chunks = text_splitter.split_text(text)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vector_db = Qdrant.from_texts(
        text_chunks,
        embeddings,
        location=":memory:",  # Using in-memory storage
        collection_name="HarveDocs")

    return vector_db
73
+
74
+
75
def create_vectorstore_from_data(data):
    '''
    Chunk the loaded documents, embed the chunks and return the vector db.

    args: data: The documents to be vectorized, as returned by a loader's
                `load()` call (they are passed to `split_documents`).

    returns: the Qdrant vector store (in-memory, collection "HarveDocs")
    '''
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=0,
        chunk_size=1000,
        separators=["\n", "\n\n", "\r", "\t", " "],
    )
    chunks = splitter.split_documents(data)

    return Qdrant.from_documents(
        chunks,
        OpenAIEmbeddings(model="text-embedding-3-small"),
        collection_name="HarveDocs",
        location=":memory:",  # nothing is persisted to disk
    )
97
+
98
+
99
def create_context_retriever_chain(vec_store):
    '''
    Build the history-aware retriever chain used by the dialog chain.

    args: vec_store: vector store exposing `as_retriever()`

    returns: the chain created by `create_history_aware_retriever`
    '''
    search_llm = ChatOpenAI(temperature=0.1, max_tokens=500)
    doc_retriever = vec_store.as_retriever()

    # The prompt replays the conversation, then asks the model for a search
    # query that the retriever will run.
    query_prompt = ChatPromptTemplate.from_messages(
        [
            MessagesPlaceholder(variable_name="chat_history"),
            ("user", "{input}"),
            ("user", "Based on the conversation above, create a search query that you will refer to, to get information that is relevant to the conversation."),
        ]
    )

    return create_history_aware_retriever(
        search_llm, doc_retriever, query_prompt)
113
+
114
+
115
def create_dialog_rag_chain(retriever_chain):
    '''
    Get the conversation chain

    args: retriever_chain: chain that retrieves the context documents

    returns: the retrieval chain combining `retriever_chain` with a
             stuff-documents chain that answers from the retrieved context
    '''
    llm = ChatOpenAI(temperature=0.1, max_tokens=500)
    # The chat history placeholder appears exactly once, after the system
    # instruction. It was previously listed twice, which sent the whole
    # conversation to the model two times on every turn.
    prompt = ChatPromptTemplate.from_messages([
        ("system",
         "Answer the user's questions based on the context below:\n{context}"),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
    ])
    stuff_documents_chain = create_stuff_documents_chain(llm, prompt)

    return create_retrieval_chain(retriever_chain, stuff_documents_chain)
130
+
131
+
132
def get_response(query):
    '''
    Get response from the AI model

    args: query (str): The user's message to answer

    returns: the "answer" entry of the dialog chain's response

    Reads `vec_store` and `chat_history` from `st.session_state`.
    '''
    # Dialog chain
    retrieval_chain = create_context_retriever_chain(
        st.session_state.vec_store)

    dialog_rag_chain = create_dialog_rag_chain(retrieval_chain)
    response = dialog_rag_chain.invoke({
        "chat_history": st.session_state.chat_history,
        # Use the argument; this previously read the module-level
        # `user_input` and silently ignored `query`.
        "input": query
    })
    return response["answer"]
146
+
147
+
148
def chat(user_input):
    '''
    Answer the user's message (if any) and render the whole conversation.

    args: user_input (str | None): The message typed by the user; empty,
          blank or missing input only re-renders the existing history.
    '''
    if user_input and user_input.strip():
        answer = get_response(user_input)
        st.session_state.chat_history.extend([
            HumanMessage(content=user_input),
            AIMessage(content=answer),
        ])

    # Dialog flow
    for entry in st.session_state.chat_history:
        if isinstance(entry, AIMessage):
            speaker = "AI"
        elif isinstance(entry, HumanMessage):
            speaker = "Human"
        else:
            # Any other message type is not rendered.
            continue
        with st.chat_message(speaker):
            st.write(entry.content)
163
+
164
+
165
def get_chat_history():
    '''
    Return the chat history kept in the session state.

    On first use the history is created with a single AI greeting message.
    '''
    state = st.session_state
    if "chat_history" in state:
        return state.chat_history

    state.chat_history = [AIMessage(content="Hello! How can I help you?")]
    return state.chat_history
171
+
172
+
173
# Page setup: the logo image doubles as the page icon
logo = Image.open("assets/logo_harve.png")
st.set_page_config(layout="wide", page_icon=logo, page_title="HarveGPT")
st.title("HarveGPT")


# Sidebar widgets: source selection (URL or PDF) and the start button
st.sidebar.header("Options")
url = st.sidebar.text_input("Enter Website or YouTube URL")
uploaded_pdf = st.sidebar.file_uploader("Upload a PDF", type=["pdf"])
start_button = st.sidebar.button("Start Chat")
185
+
186
# Options to start chat
# A URL that is empty or only whitespace counts as "no URL".
has_url = bool(url and url.strip())

if not has_url and uploaded_pdf is None:
    st.success("👈 Please provide Harve with a source to start the chat.")

else:
    # Fingerprint of the active source. A URL takes precedence over an
    # uploaded PDF, as before.
    if has_url:
        source_key = ("url", url.strip())
    else:
        source_key = ("pdf", uploaded_pdf.name, uploaded_pdf.size)

    # This block used to fetch the URL on every run even when a vector store
    # already existed, and never rebuilt the store when the source changed.
    # Build the store only when there is none yet or the source is different.
    if ("vec_store" not in st.session_state
            or st.session_state.get("vec_store_source") != source_key):
        if has_url:
            try:
                if "youtube.com" in url or "youtu.be" in url:
                    data = extract_transcript_from_youtube_url(url)
                else:
                    data = extract_data_from_url(url)

            except Exception as e:
                st.warning(
                    f"An error occurred: {e} Enter a valid link to continue.")
                st.stop()

            st.session_state.vec_store = create_vectorstore_from_data(data)
        else:
            st.session_state.vec_store = create_vectorstore_from_pdf(
                uploaded_pdf)

        # Remember which source the stored vectors belong to.
        st.session_state.vec_store_source = source_key

    # Use `st.session_state` to store chat history and avoid reinitializing the entire session
    chat_history = get_chat_history()

    # Chat input
    user_input = st.chat_input("Type a message...")
    chat(user_input)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ langchain
3
+ langchain_community
4
+ langchain_core
5
+ langchain_openai
6
+ python-dotenv
7
+ streamlit
8
+ beautifulsoup4
9
+ huggingface_hub
10
+ qdrant-client
11
+ youtube-transcript-api
12
+ PyPDF2