Cleanup and removal of unused code
Browse files
app.py
CHANGED
|
@@ -9,46 +9,34 @@ from langchain.document_loaders import DataFrameLoader
|
|
| 9 |
from langchain.embeddings import OpenAIEmbeddings
|
| 10 |
from langchain.vectorstores import Chroma
|
| 11 |
|
| 12 |
-
import kagglehub
|
| 13 |
-
from kagglehub import KaggleDatasetAdapter
|
| 14 |
import pandas as pd
|
| 15 |
|
| 16 |
from sklearn.model_selection import train_test_split
|
| 17 |
|
| 18 |
-
#
|
| 19 |
file_path = "dataset-tickets-multi-lang-4-20k.csv"
|
| 20 |
-
|
| 21 |
df = pd.read_csv(file_path)
|
| 22 |
|
|
|
|
| 23 |
df = df[df['language'] == 'en']
|
| 24 |
-
# Check for non-string items in body
|
| 25 |
non_string_body = df[~df['body'].apply(lambda x: isinstance(x, str))].index
|
| 26 |
non_string_answers = df[~df['answer'].apply(lambda x: isinstance(x, str))].index
|
| 27 |
non_string_ids = non_string_body.union(non_string_answers)
|
| 28 |
-
# Drop those rows
|
| 29 |
df = df.drop(index=non_string_ids)
|
| 30 |
df['q_and_a'] = 'Question: ' + df['body'] + ' Answer: ' + df['answer']
|
| 31 |
df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=42)
|
| 32 |
-
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=42)
|
| 33 |
|
|
|
|
| 34 |
persist_directory = './chroma_db'
|
| 35 |
loader = DataFrameLoader(
|
| 36 |
df_train,
|
| 37 |
page_content_column="q_and_a")
|
| 38 |
documents = loader.load()
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
# Get OpenAI setup
|
| 43 |
openai_api_key = os.getenv("openai_token")
|
| 44 |
-
# embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
|
| 45 |
-
|
| 46 |
-
# vectordb = Chroma.from_documents(
|
| 47 |
-
# documents=documents,
|
| 48 |
-
# embedding=embedding,
|
| 49 |
-
# persist_directory=persist_directory
|
| 50 |
-
# )
|
| 51 |
|
|
|
|
| 52 |
@st.cache_resource
|
| 53 |
def get_vectordb():
|
| 54 |
embedding = OpenAIEmbeddings(openai_api_key=os.getenv("openai_token"))
|
|
@@ -59,16 +47,6 @@ def get_vectordb():
|
|
| 59 |
|
| 60 |
vectordb = get_vectordb()
|
| 61 |
|
| 62 |
-
# @st.cache_resource
|
| 63 |
-
# def get_vectordb():
|
| 64 |
-
# embedding = OpenAIEmbeddings(openai_api_key=os.getenv("openai_token"))
|
| 65 |
-
# return Chroma(persist_directory="./chroma_db", embedding_function=embedding)
|
| 66 |
-
|
| 67 |
-
# vectordb = get_vectordb()
|
| 68 |
-
|
| 69 |
-
# # Setup vector database
|
| 70 |
-
# persist_directory = './chroma_db'
|
| 71 |
-
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
|
| 72 |
|
| 73 |
llm_name = "gpt-3.5-turbo"
|
| 74 |
|
|
@@ -80,8 +58,7 @@ qa_chain = RetrievalQA.from_chain_type(
|
|
| 80 |
retriever=vectordb.as_retriever(search_kwargs={"k": 5})
|
| 81 |
)
|
| 82 |
|
| 83 |
-
|
| 84 |
-
# Streamed response emulator
|
| 85 |
def response_generator(prompt):
|
| 86 |
response = qa_chain({"query": prompt})['result']
|
| 87 |
|
|
@@ -89,7 +66,6 @@ def response_generator(prompt):
|
|
| 89 |
yield word + " "
|
| 90 |
time.sleep(0.05)
|
| 91 |
|
| 92 |
-
|
| 93 |
st.title("Technical Support Chatbot")
|
| 94 |
|
| 95 |
# Initialize chat history
|
|
@@ -109,8 +85,6 @@ if prompt := st.chat_input("Enter your question here"):
|
|
| 109 |
with st.chat_message("user"):
|
| 110 |
st.markdown(prompt)
|
| 111 |
|
| 112 |
-
# Display assistant response in chat message container
|
| 113 |
with st.chat_message("assistant"):
|
| 114 |
response = st.write_stream(response_generator(prompt))
|
| 115 |
-
# Add assistant response to chat history
|
| 116 |
st.session_state.messages.append({"role": "assistant", "content": response})
|
|
|
|
| 9 |
from langchain.embeddings import OpenAIEmbeddings
|
| 10 |
from langchain.vectorstores import Chroma
|
| 11 |
|
|
|
|
|
|
|
| 12 |
import pandas as pd
|
| 13 |
|
| 14 |
from sklearn.model_selection import train_test_split
|
| 15 |
|
| 16 |
+
# Load the dataset from a local CSV file
|
| 17 |
file_path = "dataset-tickets-multi-lang-4-20k.csv"
|
|
|
|
| 18 |
df = pd.read_csv(file_path)
|
| 19 |
|
| 20 |
+
# Pre-processing of the dataset to prepare for VectorDB creation
|
| 21 |
df = df[df['language'] == 'en']
|
|
|
|
| 22 |
non_string_body = df[~df['body'].apply(lambda x: isinstance(x, str))].index
|
| 23 |
non_string_answers = df[~df['answer'].apply(lambda x: isinstance(x, str))].index
|
| 24 |
non_string_ids = non_string_body.union(non_string_answers)
|
|
|
|
| 25 |
df = df.drop(index=non_string_ids)
|
| 26 |
df['q_and_a'] = 'Question: ' + df['body'] + ' Answer: ' + df['answer']
|
| 27 |
df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=42)
|
|
|
|
| 28 |
|
| 29 |
+
# Setup of chromadb database
|
| 30 |
persist_directory = './chroma_db'
|
| 31 |
loader = DataFrameLoader(
|
| 32 |
df_train,
|
| 33 |
page_content_column="q_and_a")
|
| 34 |
documents = loader.load()
|
| 35 |
|
|
|
|
|
|
|
| 36 |
# Read the OpenAI API key from the environment
|
| 37 |
openai_api_key = os.getenv("openai_token")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
# Cache the creation of chroma_db so it only runs at app startup
|
| 40 |
@st.cache_resource
|
| 41 |
def get_vectordb():
|
| 42 |
embedding = OpenAIEmbeddings(openai_api_key=os.getenv("openai_token"))
|
|
|
|
| 47 |
|
| 48 |
vectordb = get_vectordb()
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
llm_name = "gpt-3.5-turbo"
|
| 52 |
|
|
|
|
| 58 |
retriever=vectordb.as_retriever(search_kwargs={"k": 5})
|
| 59 |
)
|
| 60 |
|
| 61 |
+
# Emulate a streamed response
|
|
|
|
| 62 |
def response_generator(prompt):
|
| 63 |
response = qa_chain({"query": prompt})['result']
|
| 64 |
|
|
|
|
| 66 |
yield word + " "
|
| 67 |
time.sleep(0.05)
|
| 68 |
|
|
|
|
| 69 |
st.title("Technical Support Chatbot")
|
| 70 |
|
| 71 |
# Initialize chat history
|
|
|
|
| 85 |
with st.chat_message("user"):
|
| 86 |
st.markdown(prompt)
|
| 87 |
|
|
|
|
| 88 |
with st.chat_message("assistant"):
|
| 89 |
response = st.write_stream(response_generator(prompt))
|
|
|
|
| 90 |
st.session_state.messages.append({"role": "assistant", "content": response})
|