Spaces:

emgoggles
/

HarveGPT

Sleeping

App Files Files Community

emgoggles committed on Feb 11, 2024

Commit

72c6641

verified ·

1 Parent(s): a936558

Upload 3 files

Browse files

Files changed (3) hide show

assets/logo_harve.png +0 -0
harve_app.py +221 -0
requirements.txt +12 -0

assets/logo_harve.png ADDED Viewed

harve_app.py ADDED Viewed

	@@ -0,0 +1,221 @@

+# STREAMLIT VERSION 2.1 - PDF WORKING
+import streamlit as st
+from langchain_core.messages import AIMessage, HumanMessage
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_community.document_loaders import WebBaseLoader, YoutubeLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Qdrant
+from langchain_openai import OpenAIEmbeddings
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_openai import ChatOpenAI
+from PIL import Image
+from PyPDF2 import PdfReader
+# from dotenv import load_dotenv
+# Load secrets from .env file
+# load_dotenv()
+def extract_data_from_url(url):
+    '''
+    Extract the url content and return as a document.
+    args: url (str): The url of the web page to extract content from
+    '''
+    loader = WebBaseLoader(url)
+    doc = loader.load()
+    return doc
+def extract_transcript_from_youtube_url(youtube_url):
+    '''
+    Extract the transcript of a YouTube video.
+    args: url (str): The url of the YouTube video
+    '''
+    youtube_loader = YoutubeLoader.from_youtube_url(
+        youtube_url, add_video_info=False)
+    transcript = youtube_loader.load()
+    return transcript
+def create_vectorstore_from_pdf(uploaded_pdf):
+    '''
+    Extract the text content of a PDF file, embed it and store in a vector db.
+    args: uploaded pdf (file)
+    '''
+    pdf_reader = PdfReader(uploaded_pdf)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators=["\n", "\n\n", "\r", "\t", " "],
+        chunk_size=1000,
+        chunk_overlap=0,
+    )
+    text_chunks = text_splitter.split_text(text)
+    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+    vector_db = Qdrant.from_texts(
+        text_chunks,
+        embeddings,
+        location=":memory:",  # Using in-memory storage
+        collection_name="HarveDocs")
+    return vector_db
+def create_vectorstore_from_data(data):
+    '''
+    1. Split the text data into text chunks.
+    2. Vectorize text chunks and store in a vector db.
+    3. Return the vector db.
+    args: data (str): The text data to be vectorized and stored in vector store.
+    '''
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators=["\n", "\n\n", "\r", "\t", " "],
+        chunk_size=1000,
+        chunk_overlap=0,
+    )
+    text_chunks = text_splitter.split_documents(data)
+    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+    vector_db = Qdrant.from_documents(
+        text_chunks,
+        embeddings,
+        location=":memory:",  # Using in-memory storage
+        collection_name="HarveDocs")
+    return vector_db
+def create_context_retriever_chain(vec_store):
+    '''
+    Get the context retriever chain to be used in the dialog chain.
+    '''
+    llm = ChatOpenAI(temperature=0.1, max_tokens=500)
+    retriever = vec_store.as_retriever()
+    prompt = ChatPromptTemplate.from_messages([
+        MessagesPlaceholder(variable_name="chat_history"),
+        ("user", "{input}"),
+        ("user", "Based on the conversation above, create a search query that you will refer to, to get information that is relevant to the conversation.")
+    ])
+    retriever_chain = create_history_aware_retriever(llm, retriever, prompt)
+    return retriever_chain
+def create_dialog_rag_chain(retriever_chain):
+    '''
+    Get the conversation chain
+    '''
+    llm = ChatOpenAI(temperature=0.1, max_tokens=500)
+    prompt = ChatPromptTemplate.from_messages([
+        MessagesPlaceholder(variable_name="chat_history"),
+        ("system",
+         "Answer the user's questions based on the context below:\n{context}"),
+        MessagesPlaceholder(variable_name="chat_history"),
+        ("user", "{input}"),
+    ])
+    stuff_documents_chain = create_stuff_documents_chain(llm, prompt)
+    return create_retrieval_chain(retriever_chain, stuff_documents_chain)
+def get_response(query):
+    '''
+    Get response from the AI model
+    '''
+    # Dialog chain
+    retrieval_chain = create_context_retriever_chain(
+        st.session_state.vec_store)
+    dialog_rag_chain = create_dialog_rag_chain(retrieval_chain)
+    response = dialog_rag_chain.invoke({
+        "chat_history": st.session_state.chat_history,
+        "input": user_input
+    })
+    return response["answer"]
+def chat(user_input):
+    if user_input and user_input.strip() != "":
+        response = get_response(user_input)
+        st.session_state.chat_history.append(
+            HumanMessage(content=user_input))
+        st.session_state.chat_history.append(AIMessage(content=response))
+    # Dialog flow
+    for message in st.session_state.chat_history:
+        if isinstance(message, AIMessage):
+            with st.chat_message("AI"):
+                st.write(message.content)
+        elif isinstance(message, HumanMessage):
+            with st.chat_message("Human"):
+                st.write(message.content)
+def get_chat_history():
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = [
+            AIMessage(content="Hello! How can I help you?")
+        ]
+        return st.session_state.chat_history
+# UI Config
+logo = Image.open("assets/logo_harve.png")
+st.set_page_config(page_title="HarveGPT", page_icon=logo, layout="wide")
+st.title("HarveGPT")
+# Sidebar
+with st.sidebar:
+    st.header("Options")
+    url = st.text_input("Enter Website or YouTube URL")
+    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
+    start_button = st.button("Start Chat")
+# Options to start chat
+if not url or url.strip() == "" or url is None:
+    if uploaded_pdf is not None:
+        chat_history = get_chat_history()
+        if "vec_store" not in st.session_state:
+            st.session_state.vec_store = create_vectorstore_from_pdf(
+                uploaded_pdf)
+        user_input = st.chat_input("Type a message...")
+        chat(user_input)
+    else:
+        st.success("👈  Please provide Harve with a source to start the chat.")
+else:
+    try:
+        if "youtube.com" in url or "youtu.be" in url:
+            data = extract_transcript_from_youtube_url(url)
+        else:
+            data = extract_data_from_url(url)
+    except Exception as e:
+        st.warning(
+            f"An error occurred: {e} Enter a valid link to continue.")
+        st.stop()
+    # Use `st.session_state`` to store chat history and avoid reinitializing the entire session
+    chat_history = get_chat_history()
+    if "vec_store" not in st.session_state:
+        st.session_state.vec_store = create_vectorstore_from_data(data)
+    # Chat input
+    user_input = st.chat_input("Type a message...")
+    chat(user_input)

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+streamlit
+langchain
+langchain_community
+langchain_core
+langchain_openai
+python-dotenv
+streamlit
+beautifulsoup4
+huggingface_hub
+qdrant-client
+youtube-transcript-api
+PyPDF2