kdevoe committed on
Commit
810c787
·
verified ·
1 Parent(s): e13dd34

Cleanup and removal of unused code

Browse files
Files changed (1) hide show
  1. app.py +5 -31
app.py CHANGED
@@ -9,46 +9,34 @@ from langchain.document_loaders import DataFrameLoader
9
  from langchain.embeddings import OpenAIEmbeddings
10
  from langchain.vectorstores import Chroma
11
 
12
- import kagglehub
13
- from kagglehub import KaggleDatasetAdapter
14
  import pandas as pd
15
 
16
  from sklearn.model_selection import train_test_split
17
 
18
- # # Download dataset
19
  file_path = "dataset-tickets-multi-lang-4-20k.csv"
20
-
21
  df = pd.read_csv(file_path)
22
 
 
23
  df = df[df['language'] == 'en']
24
- # Check for non-string items in body
25
  non_string_body = df[~df['body'].apply(lambda x: isinstance(x, str))].index
26
  non_string_answers = df[~df['answer'].apply(lambda x: isinstance(x, str))].index
27
  non_string_ids = non_string_body.union(non_string_answers)
28
- # Drop those rows
29
  df = df.drop(index=non_string_ids)
30
  df['q_and_a'] = 'Question: ' + df['body'] + ' Answer: ' + df['answer']
31
  df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=42)
32
- df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=42)
33
 
 
34
  persist_directory = './chroma_db'
35
  loader = DataFrameLoader(
36
  df_train,
37
  page_content_column="q_and_a")
38
  documents = loader.load()
39
 
40
-
41
-
42
  # Get OpenAI setup
43
  openai_api_key = os.getenv("openai_token")
44
- # embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
45
-
46
- # vectordb = Chroma.from_documents(
47
- # documents=documents,
48
- # embedding=embedding,
49
- # persist_directory=persist_directory
50
- # )
51
 
 
52
  @st.cache_resource
53
  def get_vectordb():
54
  embedding = OpenAIEmbeddings(openai_api_key=os.getenv("openai_token"))
@@ -59,16 +47,6 @@ def get_vectordb():
59
 
60
  vectordb = get_vectordb()
61
 
62
- # @st.cache_resource
63
- # def get_vectordb():
64
- # embedding = OpenAIEmbeddings(openai_api_key=os.getenv("openai_token"))
65
- # return Chroma(persist_directory="./chroma_db", embedding_function=embedding)
66
-
67
- # vectordb = get_vectordb()
68
-
69
- # # Setup vector database
70
- # persist_directory = './chroma_db'
71
- # vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
72
 
73
  llm_name = "gpt-3.5-turbo"
74
 
@@ -80,8 +58,7 @@ qa_chain = RetrievalQA.from_chain_type(
80
  retriever=vectordb.as_retriever(search_kwargs={"k": 5})
81
  )
82
 
83
-
84
- # Streamed response emulator
85
  def response_generator(prompt):
86
  response = qa_chain({"query": prompt})['result']
87
 
@@ -89,7 +66,6 @@ def response_generator(prompt):
89
  yield word + " "
90
  time.sleep(0.05)
91
 
92
-
93
  st.title("Technical Support Chatbot")
94
 
95
  # Initialize chat history
@@ -109,8 +85,6 @@ if prompt := st.chat_input("Enter your question here"):
109
  with st.chat_message("user"):
110
  st.markdown(prompt)
111
 
112
- # Display assistant response in chat message container
113
  with st.chat_message("assistant"):
114
  response = st.write_stream(response_generator(prompt))
115
- # Add assistant response to chat history
116
  st.session_state.messages.append({"role": "assistant", "content": response})
 
9
  from langchain.embeddings import OpenAIEmbeddings
10
  from langchain.vectorstores import Chroma
11
 
 
 
12
  import pandas as pd
13
 
14
  from sklearn.model_selection import train_test_split
15
 
16
+ # Load the dataset from the local CSV file
17
  file_path = "dataset-tickets-multi-lang-4-20k.csv"
 
18
  df = pd.read_csv(file_path)
19
 
20
+ # Pre-processing of the dataset to prepare for VectorDB creation
21
  df = df[df['language'] == 'en']
 
22
  non_string_body = df[~df['body'].apply(lambda x: isinstance(x, str))].index
23
  non_string_answers = df[~df['answer'].apply(lambda x: isinstance(x, str))].index
24
  non_string_ids = non_string_body.union(non_string_answers)
 
25
  df = df.drop(index=non_string_ids)
26
  df['q_and_a'] = 'Question: ' + df['body'] + ' Answer: ' + df['answer']
27
  df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=42)
 
28
 
29
+ # Setup of chromadb database
30
  persist_directory = './chroma_db'
31
  loader = DataFrameLoader(
32
  df_train,
33
  page_content_column="q_and_a")
34
  documents = loader.load()
35
 
 
 
36
  # Get OpenAI setup
37
  openai_api_key = os.getenv("openai_token")
 
 
 
 
 
 
 
38
 
39
+ # Cache the creation of chroma_db so it only runs at app startup
40
  @st.cache_resource
41
  def get_vectordb():
42
  embedding = OpenAIEmbeddings(openai_api_key=os.getenv("openai_token"))
 
47
 
48
  vectordb = get_vectordb()
49
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  llm_name = "gpt-3.5-turbo"
52
 
 
58
  retriever=vectordb.as_retriever(search_kwargs={"k": 5})
59
  )
60
 
61
+ # Emulate a streamed response
 
62
  def response_generator(prompt):
63
  response = qa_chain({"query": prompt})['result']
64
 
 
66
  yield word + " "
67
  time.sleep(0.05)
68
 
 
69
  st.title("Technical Support Chatbot")
70
 
71
  # Initialize chat history
 
85
  with st.chat_message("user"):
86
  st.markdown(prompt)
87
 
 
88
  with st.chat_message("assistant"):
89
  response = st.write_stream(response_generator(prompt))
 
90
  st.session_state.messages.append({"role": "assistant", "content": response})