# File size: 3,301 Bytes (metadata line from the original file listing)
import os
import random
import re
import shutil
import time

import kagglehub
import pandas as pd
import streamlit as st
from kagglehub import KaggleDatasetAdapter
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# Download dataset
# Name of the file inside the Kaggle dataset to load. The script previously
# passed `file_path` without ever defining it (NameError), so it is now read
# from the KAGGLE_DATASET_FILE environment variable.
# NOTE(review): the right file name depends on the dataset version -- confirm
# it against the dataset's file listing on Kaggle.
file_path = os.getenv("KAGGLE_DATASET_FILE")
if not file_path:
    raise RuntimeError(
        "Set the KAGGLE_DATASET_FILE environment variable to the name of the "
        "file to load from tobiasbueck/multilingual-customer-support-tickets."
    )

# Load the latest version
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "tobiasbueck/multilingual-customer-support-tickets",
    file_path,
)

# Keep English tickets only.
df = df[df['language'] == 'en']

# Keep only rows whose body AND answer are real strings; anything else
# (e.g. missing values) would break the string concatenation below.
body_is_text = df['body'].map(lambda value: isinstance(value, str))
answer_is_text = df['answer'].map(lambda value: isinstance(value, str))
# reset_index guarantees unique labels, which the index-based split relies on;
# copy() avoids writing the new column into a view of the filtered frame.
df = df[body_is_text & answer_is_text].reset_index(drop=True).copy()

df['q_and_a'] = 'Question: ' + df['body'] + ' Answer: ' + df['answer']

# 80 / 10 / 10 train / validation / test split, seeded for reproducibility.
# `train_test_split` was called here without being imported anywhere in the
# file; the same proportions are produced with pandas, which is already a
# dependency. The exact row assignment differs from scikit-learn's.
df_holdout = df.sample(frac=0.2, random_state=42)
df_train = df.drop(index=df_holdout.index)
df_test = df_holdout.sample(frac=0.5, random_state=42)
df_val = df_holdout.drop(index=df_test.index)
# Get OpenAI setup
# The embedding model must exist before the vector store is built, because
# Chroma.from_documents receives it as an argument.
openai_api_key = os.getenv("openai_token")
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Setup vector database
persist_directory = './chroma_db'


@st.cache_resource
def get_vectordb():
    """Build the Chroma vector store from the training split.

    Decorated with ``st.cache_resource`` so the store is built once and then
    reused, instead of being deleted, re-embedded and rebuilt each time
    Streamlit reruns the script.

    Reads the module-level ``df_train``, ``embedding`` and
    ``persist_directory``.

    Returns:
        The Chroma store created by ``Chroma.from_documents``.
    """
    # Remove old database files if any. This replaces a literal shell
    # `rm -rf ./chroma_db` line, which is not valid Python.
    try:
        shutil.rmtree(persist_directory)
    except FileNotFoundError:
        pass  # nothing to clean up on a first run

    loader = DataFrameLoader(df_train, page_content_column="q_and_a")
    documents = loader.load()
    return Chroma.from_documents(
        documents=documents,
        embedding=embedding,
        persist_directory=persist_directory,
    )


vectordb = get_vectordb()

llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(
    model_name=llm_name,
    temperature=0.7,
    openai_api_key=openai_api_key,
)

# Retrieval-augmented QA chain: the retriever is configured with k=5.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(search_kwargs={"k": 5}),
)
# Streamed response emulator
def response_generator(prompt):
    """Yield the QA chain's answer piece by piece to emulate streaming.

    The answer is fetched from ``qa_chain`` in one call and then emitted one
    word at a time with a short pause between pieces.

    Args:
        prompt: The user's question, passed to the chain as ``"query"``.

    Yields:
        Consecutive pieces of the answer text. Joining all pieces reproduces
        the answer exactly, including its newlines and spacing.
    """
    response = qa_chain({"query": prompt})['result']
    # Each piece is a word plus the whitespace that follows it (or a leading
    # whitespace run). A plain str.split() would collapse newlines and
    # indentation, flattening multi-line or markdown answers.
    for piece in re.findall(r"\S+\s*|\s+", response):
        yield piece
        time.sleep(0.05)
# Chat UI: title, replay of the stored conversation, then handling of a new
# user message (if one was submitted).
st.title("Technical Support Chatbot")

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept user input
if prompt := st.chat_input("Enter your question here"):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Display user message in chat message container
    with st.chat_message("user"):
        st.markdown(prompt)

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        response = st.write_stream(response_generator(prompt))

    # Add assistant response to chat history. This stays inside the `if`
    # because `response` only exists when a prompt was submitted.
    st.session_state.messages.append({"role": "assistant", "content": response})