kdevoe committed on
Commit
6d53aca
·
verified ·
1 Parent(s): 7674147

Adding chromadb build

Browse files
Files changed (1) hide show
  1. app.py +40 -5
app.py CHANGED
@@ -9,17 +9,52 @@ from langchain.document_loaders import DataFrameLoader
9
  from langchain.embeddings import OpenAIEmbeddings
10
  from langchain.vectorstores import Chroma
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Get OpenAI setup
14
  openai_api_key = os.getenv("openai_token")
15
  embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
16
 
17
- @st.cache_resource
18
- def get_vectordb():
19
- embedding = OpenAIEmbeddings(openai_api_key=os.getenv("openai_token"))
20
- return Chroma(persist_directory="./chroma_db", embedding_function=embedding)
21
 
22
- vectordb = get_vectordb()
23
 
24
  # # Setup vector database
25
  # persist_directory = './chroma_db'
 
9
  from langchain.embeddings import OpenAIEmbeddings
10
  from langchain.vectorstores import Chroma
11
 
12
+ import kagglehub
13
+ from kagglehub import KaggleDatasetAdapter
14
+ import pandas as pd
15
+
16
+ # Download dataset
17
+ # Load the latest version
18
+ df = kagglehub.load_dataset(
19
+ KaggleDatasetAdapter.PANDAS,
20
+ "tobiasbueck/multilingual-customer-support-tickets",
21
+ file_path,
22
+ )
23
+
24
+ df = df[df['language'] == 'en']
25
+ # Check for non-string items in body and answer
26
+ non_string_body = df[~df['body'].apply(lambda x: isinstance(x, str))].index
27
+ non_string_answers = df[~df['answer'].apply(lambda x: isinstance(x, str))].index
28
+ non_string_ids = non_string_body.union(non_string_answers)
29
+ # Drop those rows
30
+ df = df.drop(index=non_string_ids)
31
+ df['q_and_a'] = 'Question: ' + df['body'] + ' Answer: ' + df['answer']
32
+ df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=42)
33
+ df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=42)
34
+
35
+ persist_directory = './chroma_db'
36
+ !rm -rf ./chroma_db # remove old database files if any
37
+ loader = DataFrameLoader(
38
+ df_train,
39
+ page_content_column="q_and_a")
40
+ documents = loader.load()
41
+
42
+ vectordb = Chroma.from_documents(
43
+ documents=documents,
44
+ embedding=embedding,
45
+ persist_directory=persist_directory
46
+ )
47
 
48
  # Get OpenAI setup
49
  openai_api_key = os.getenv("openai_token")
50
  embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
51
 
52
+ # @st.cache_resource
53
+ # def get_vectordb():
54
+ # embedding = OpenAIEmbeddings(openai_api_key=os.getenv("openai_token"))
55
+ # return Chroma(persist_directory="./chroma_db", embedding_function=embedding)
56
 
57
+ # vectordb = get_vectordb()
58
 
59
  # # Setup vector database
60
  # persist_directory = './chroma_db'