Spaces:

Darpan07
/

Personalized-ChatBot

Sleeping

App Files Files Community

Darpan07 commited on Feb 22, 2024

Commit

4cf62a0

verified ·

1 Parent(s): e556297

Update app.py

Browse files

Files changed (1) hide show

app.py +140 -58

app.py CHANGED Viewed

@@ -4,62 +4,125 @@ import time
 import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
-from langchain_openai import OpenAI, OpenAIEmbeddings
 from langchain.prompts import PromptTemplate
 from langchain.chains import LLMChain
 from langchain.memory import ConversationBufferWindowMemory
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 # load the environment variables into the python script
-load_dotenv()
 # fetching the openai_api_key environment variable
-openai_api_key = os.getenv('OPENAI_API_KEY')
 # Initialize session states
-if 'vectorDB' not in st.session_state:
     st.session_state.vectorDB = None
 if "messages" not in st.session_state:
     st.session_state.messages = []
-if 'bot_name' not in st.session_state:
-    st.session_state.bot_name = ''
-if 'chain' not in st.session_state:
     st.session_state.chain = None
-def get_pdf_text(pdf) -> str:
-    """ This function extracts the text from the PDF file """
     text = ""
-    pdf_reader = PdfReader(pdf)
-    for page in pdf_reader.pages:
-        text += page.extract_text()
     return text
 def get_vectorstore(text_chunks):
-    """ This function will create a vector database as well as create and store the embedding of the text chunks into the VectorDB """
     embeddings = OpenAIEmbeddings()
     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
     return vectorstore
 def get_text_chunks(text: str):
-    """ This function will split the text into the smaller chunks"""
     text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size=1000,
-    chunk_overlap=100,
-    length_function=len,
-    is_separator_regex=False,
     )
     chunks = text_splitter.split_text(text)
     return chunks
-def processing(pdf):
-    """This function divides the PDF into smaller chunks and saves these segmented chunks in a vector database. And return the Vector Database"""
-    # getting all the raw text from the PDF
-    raw_text = get_pdf_text(pdf)
     # divinding the raw text into smaller chunks
     text_chunks = get_text_chunks(raw_text)
@@ -69,81 +132,100 @@ def processing(pdf):
     return vectorDB
 def get_response(query: str):
-    """This function will return the output of the user query! """
     # getting the context from the database that is similar to the user query
-    query_context = st.session_state.vectorDB.similarity_search(query=query,k=4)
     # calling the chain to get the output from the LLM
-    response = st.session_state.chain.invoke({'human_input':query,'context':query_context,'name':st.session_state.bot_name})['text']
     # Iterate through each word in the 'response' string after splitting it based on whitespace
     for word in response.split():
         # Yield the current word followed by a space, effectively creating a generator
         yield word + " "
         # Pause execution for 0.05 seconds (50 milliseconds) to introduce a delay
         time.sleep(0.05)
 def get_conversation_chain(vectorDB):
-    """ This function will create and return a LLM-Chain"""
-    # using OPENAI LLM
-    llm = OpenAI(temperature=0.4)
     # creating a template to pass into LLM
     template = """You are a friendly customer support ChatBot with a name: {name} for the company, aiming to enhance the customer experience by providing tailored assistance and information.
-    Answer the question as detailed as possible and to the point from the context: {context}\n , if the answer is not in the provided context just say, "answer is not available in the context", do not provide the wrong answer\n\n
     {chat_history}
     Human: {human_input}
     AI: """
     # creating a prompt that is used to format the input of the user
-    prompt = PromptTemplate(template = template,input_variables=['chat_history','human_input','name','context'])
     # creating a memory that will store the chat history between chatbot and user
-    memory = ConversationBufferWindowMemory(memory_key='chat_history',input_key="human_input",k=5)
-    chain = LLMChain(llm=llm,prompt=prompt,memory=memory,verbose=True)
     return chain
-if __name__ =='__main__':
-    #setting the config of WebPage
-    st.set_page_config(page_title="Personalized ChatBot",page_icon="🤖")
-    st.header('Personalized Customer Support Chatbot 🤖',divider='rainbow')
     # taking input( bot name and pdf file) from the user
     with st.sidebar:
-        st.caption('Please enter the **Bot Name** and Upload **PDF** File!')
-        bot_name = st.text_input(label='Bot Name',placeholder='Enter the bot name here....',key="bot_name")
-        file = st.file_uploader("Upload a PDF file!",type='pdf')
         # moving forward only when both the inputs are given by the user
-        if file and bot_name:
             # the Process File button will process the pdf file and save the chunks into the vector database
-            if st.button('Process File'):
-                # if there is existing chat history we will delete it
                 if st.session_state.messages != []:
                     st.session_state.messages = []
-                with st.spinner('Processing.....'):
-                    st.session_state['vectorDB'] = processing(file)
-                    st.session_state['chain'] = get_conversation_chain(st.session_state['vectorDB'])
-                st.success('File Processed',icon="✅")
     # if the vector database is ready to use then only show the chatbot interface
-    if st.session_state.vectorDB:
         # Display chat messages from history on app rerun
         for message in st.session_state.messages:
             with st.chat_message(message["role"]):
                 st.write(message["content"])
         # taking the input i.e. query from the user (walrus operator)
         if prompt := st.chat_input(f"Message {st.session_state.bot_name}"):
             # Add user message to chat history

 import streamlit as st
 from dotenv import load_dotenv
 from PyPDF2 import PdfReader
+from docx import Document
+from docx.text.paragraph import Paragraph
+from docx.table import Table
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain.prompts import PromptTemplate
 from langchain.chains import LLMChain
 from langchain.memory import ConversationBufferWindowMemory
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 # load the environment variables into the python script
+load_dotenv()
 # fetching the openai_api_key environment variable
+openai_api_key = os.getenv("OPENAI_API_KEY")
 # Initialize session states
+if "vectorDB" not in st.session_state:
     st.session_state.vectorDB = None
 if "messages" not in st.session_state:
     st.session_state.messages = []
+if "bot_name" not in st.session_state:
+    st.session_state.bot_name = ""
+if "chain" not in st.session_state:
     st.session_state.chain = None
+def process_paragraph(paragraph):
+    """This Function returns the content of the paragraph present inside the DOC file"""
+    return paragraph.text
+def process_table(table):
+    """This function extracts the content from the table present inside the DOC file"""
     text = ""
+    for row in table.rows:
+        for cell in row.cells:
+            text += cell.text
     return text
+def read_docx(file_path):
+    """This function extracts the text from the DOC file"""
+    doc = Document(file_path)
+    text = []
+    for element in doc.iter_inner_content():
+        if isinstance(element, Paragraph):
+            text.append(process_paragraph(element))
+        elif isinstance(element, Table):
+            text.append(process_table(element))
+    return " ".join(text)
+def read_text_file(text_file):
+    """This function extracts the text from the TEXT file"""
+    try:
+        text = text_file.read().decode("utf-8")
+        return text
+    except Exception as e:
+        st.error(f"Error while reading {text_file.name} file : **{e}**")
+        return None
+def get_pdf_text(pdf):
+    """This function extracts the text from the PDF file"""
+    try:
+        text = []
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
+            text.append(page.extract_text())
+        return " ".join(text)
+    except Exception as e:
+        st.error(f"Error while reading {pdf.name} file : **{e}**")
+        return None
 def get_vectorstore(text_chunks):
+    """This function will create a vector database as well as create & store the embedding of the text chunks into the VectorDB"""
     embeddings = OpenAIEmbeddings()
     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
     return vectorstore
 def get_text_chunks(text: str):
+    """This function will split the text into the smaller chunks"""
     text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=50,
+        length_function=len,
+        is_separator_regex=False,
     )
     chunks = text_splitter.split_text(text)
     return chunks
+def processing(files):
+    """This function"""
+    data = []
+    for file in files:
+        if file.name.endswith(".docx"):
+            text = read_docx(file)
+        elif file.name.endswith(".pdf"):
+            text = get_pdf_text(file)
+        else:
+            text = read_text_file(file)
+        data.append(text)
+    raw_text = " ".join(data)
     # divinding the raw text into smaller chunks
     text_chunks = get_text_chunks(raw_text)
     return vectorDB
 def get_response(query: str):
+    """This function will return the output of the user query!"""
     # getting the context from the database that is similar to the user query
+    query_context = st.session_state.vectorDB.similarity_search(query=query)
     # calling the chain to get the output from the LLM
+    response = st.session_state.chain.invoke(
+        {
+            "human_input": query,
+            "context": query_context[0].page_content,
+            "name": st.session_state.bot_name,
+        }
+    )["text"]
     # Iterate through each word in the 'response' string after splitting it based on whitespace
     for word in response.split():
         # Yield the current word followed by a space, effectively creating a generator
         yield word + " "
         # Pause execution for 0.05 seconds (50 milliseconds) to introduce a delay
         time.sleep(0.05)
 def get_conversation_chain(vectorDB):
+    """This function will create and return a LLM-Chain"""
+    # using OPENAI ChatModel
+    llm = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo-16k")
     # creating a template to pass into LLM
     template = """You are a friendly customer support ChatBot with a name: {name} for the company, aiming to enhance the customer experience by providing tailored assistance and information.
+    Answer the question as detailed as possible and to the point from the context: {context}\n\n.
+    If the answer is not in the provided context then only just say, "answer is not available in the context", do not provide the wrong answer\n\n
     {chat_history}
     Human: {human_input}
     AI: """
     # creating a prompt that is used to format the input of the user
+    prompt = PromptTemplate(
+        template=template,
+        input_variables=["chat_history", "human_input", "name", "context"],
+    )
     # creating a memory that will store the chat history between chatbot and user
+    memory = ConversationBufferWindowMemory(
+        memory_key="chat_history", input_key="human_input", k=5
+    )
+    chain = LLMChain(llm=llm, prompt=prompt, memory=memory, verbose=True)
     return chain
+if __name__ == "__main__":
+    # setting the config of WebPage
+    st.set_page_config(page_title="Personalized ChatBot", page_icon="🤖")
+    st.header("Personalized Customer Support Chatbot 🤖", divider="rainbow")
     # taking input( bot name and pdf file) from the user
     with st.sidebar:
+        st.caption("Please enter the **Bot Name** and Upload **PDF** File!")
+        bot_name = st.text_input(
+            label="Bot Name", placeholder="Enter the bot name here....", key="bot_name"
+        )
+        files = st.file_uploader(
+            label="Upload Files!",
+            type=["pdf", "txt", "docx"],
+            accept_multiple_files=True,
+        )
         # moving forward only when both the inputs are given by the user
+        if files and bot_name:
             # the Process File button will process the pdf file and save the chunks into the vector database
+            if st.button("Process File"):
+                # if there is existing chat history we will delete it
                 if st.session_state.messages != []:
                     st.session_state.messages = []
+                with st.spinner("Processing....."):
+                    st.session_state["vectorDB"] = processing(files)
+                    st.session_state["chain"] = get_conversation_chain(
+                        st.session_state["vectorDB"]
+                    )
+                st.success("File Processed", icon="✅")
     # if the vector database is ready to use then only show the chatbot interface
+    if st.session_state.vectorDB:
         # Display chat messages from history on app rerun
         for message in st.session_state.messages:
             with st.chat_message(message["role"]):
                 st.write(message["content"])
         # taking the input i.e. query from the user (walrus operator)
         if prompt := st.chat_input(f"Message {st.session_state.bot_name}"):
             # Add user message to chat history