File size: 3,301 Bytes
f603258
 
 
 
 
 
 
 
 
 
 
6d53aca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655120d
6d53aca
 
 
 
 
 
 
 
 
 
f603258
 
 
 
 
6d53aca
 
 
 
f603258
6d53aca
f603258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import random
import shutil
import time

import kagglehub
import pandas as pd
import streamlit as st
from kagglehub import KaggleDatasetAdapter
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Download dataset
# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "tobiasbueck/multilingual-customer-support-tickets",
  file_path,
)

df = df[df['language'] == 'en']
# Check for non-string items in body
non_string_body = df[~df['body'].apply(lambda x: isinstance(x, str))].index
non_string_answers = df[~df['answer'].apply(lambda x: isinstance(x, str))].index
non_string_ids = non_string_body.union(non_string_answers)
# Drop those rows
df = df.drop(index=non_string_ids)
df['q_and_a'] = 'Question: ' + df['body'] + ' Answer: ' + df['answer']
df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=42)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=42)

persist_directory = './chroma_db'
rm -rf ./chroma_db  # remove old database files if any
loader = DataFrameLoader(
    df_train,
    page_content_column="q_and_a")
documents = loader.load()

vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    persist_directory=persist_directory
)

# Get OpenAI setup
openai_api_key = os.getenv("openai_token")
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)

# @st.cache_resource
# def get_vectordb():
#     embedding = OpenAIEmbeddings(openai_api_key=os.getenv("openai_token"))
#     return Chroma(persist_directory="./chroma_db", embedding_function=embedding)

# vectordb = get_vectordb()

# # Setup vector database
# persist_directory = './chroma_db'
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

llm_name = "gpt-3.5-turbo"

llm = ChatOpenAI(model_name=llm_name, temperature=0.7,
                 openai_api_key=openai_api_key)

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(search_kwargs={"k": 5})
)


# Streamed response emulator
def response_generator(prompt):
    """Yield the QA chain's answer to *prompt* one word at a time.

    The chain is queried once up front; the resulting text is then split on
    whitespace and each word is yielded with a trailing space, pausing
    briefly after each one to emulate a streamed response.
    """
    chain_output = qa_chain({"query": prompt})
    answer_text = chain_output['result']

    words = answer_text.split()
    for token in words:
        yield f"{token} "
        # Short delay between words to create the streaming effect.
        time.sleep(0.05)


st.title("Technical Support Chatbot")

# Create the chat history the first time this session runs the script.
if "messages" not in st.session_state:
    st.session_state["messages"] = []

chat_history = st.session_state.messages

# Replay the stored conversation so it is shown again on app rerun.
for past_message in chat_history:
    speaker = past_message["role"]
    text = past_message["content"]
    with st.chat_message(speaker):
        st.markdown(text)

# Accept user input; nothing below runs until a question is submitted.
prompt = st.chat_input("Enter your question here")
if prompt:
    # Record the user's message in the history and display it.
    chat_history.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Stream the assistant's answer into its own chat message container.
    with st.chat_message("assistant"):
        word_stream = response_generator(prompt)
        reply = st.write_stream(word_stream)

    # Record the assistant's full reply in the history.
    chat_history.append({"role": "assistant", "content": reply})