KanvaBhatia committed on
Commit
e9a6181
·
1 Parent(s): 88a24e9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import pandas as pd
3
+ import numpy as np
4
+ from openai.embeddings_utils import cosine_similarity
5
+ from transformers import GPT2TokenizerFast
6
+ import os
7
+ import gradio as gr
8
+
9
# The OpenAI key comes from the environment so it never appears in the source.
openai.api_key = os.getenv("OPENAI_API_KEY")

# GPT-2 tokenizer; create_and_upsert uses it to count tokens per combined row.
# NOTE(review): the embedding model presumably tokenizes differently, so these
# counts are likely approximate - confirm before relying on the exact limit.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
11
+
12
# Keyword arguments unpacked into ``openai.Completion.create`` by
# semantic_search.
COMPLETIONS_API_PARAMS = dict(
    # A temperature of 0.0 gives the most predictable, factual answer.
    temperature=0.0,
    max_tokens=300,
    model="text-davinci-003",
)
18
+
19
def get_embedding_custom(text, model="text-embedding-ada-002"):
    """Request embeddings for ``text`` from the OpenAI Embedding endpoint.

    Args:
        text: The input forwarded as ``input``; callers in this file pass
            either a single query string or a list of strings.
        model: Name of the embedding model to use.

    Returns:
        The response object from ``openai.Embedding.create``, returned
        unmodified. Callers read vectors from ``.data[i]['embedding']``.
    """
    request = {"input": text, "model": model}
    return openai.Embedding.create(**request)
24
+
25
def construct_prompt(query, xc, max_contexts=3):
    """Build the completion prompt from a question and retrieved context.

    Args:
        query: The user's question, placed after ``Q:``.
        xc: Sequence of context strings, in the order they should appear.
        max_contexts: Maximum number of leading entries of ``xc`` to include.

    Returns:
        A single string: the instruction header, each selected context entry
        on its own line, then the question and an open ``A:`` marker.
    """
    # Slice instead of indexing 0..2 so that fewer than ``max_contexts``
    # matches (e.g. a CSV with one or two rows) no longer raises IndexError.
    context = "".join(f"{passage}\n" for passage in xc[:max_contexts])
    header = (
        "Answer the question as truthfully as possible using the provided "
        "context, and if the answer is not contained within the text below, "
        'say "I don\'t know."\n\nContext:\n'
    )
    return f"{header}{context}\n\n Q: {query}\n A:"
32
+
33
def create_and_upsert(file):
    """Embed every usable row of a CSV and return the enriched DataFrame.

    Rows containing missing values are dropped and at most the first 50000
    remaining rows are kept. All columns of a row are joined into one
    ``combined`` text, rows whose ``n_tokens`` count is 8000 or more are
    discarded, and the remaining texts are embedded in batches. The result is
    also written to ``embeddings_ada.csv`` in the current working directory.

    Args:
        file: Either an object exposing the CSV path as ``.name`` (such as an
            uploaded-file wrapper) or anything ``pandas.read_csv`` accepts.

    Returns:
        A DataFrame with the original columns plus ``combined``, ``n_tokens``
        and ``ada_vector`` (one ``numpy`` array per row).

    Raises:
        ValueError: If no rows remain after cleaning or token filtering, or
            if the number of returned embeddings does not match the rows.
    """
    csv_path = getattr(file, "name", file)
    df = pd.read_csv(csv_path).dropna().head(50000)
    if df.empty:
        raise ValueError("The CSV has no rows without missing values.")

    # Join the values of all columns into a single text per row.
    column_names = list(df.columns)
    joined = df[column_names].astype(str).apply("; ".join, axis=1)
    df["combined"] = ("Title: " + joined).str.strip()

    df["n_tokens"] = df["combined"].apply(lambda text: len(tokenizer.encode(text)))
    # Renumber rows 0..n-1 after filtering so that vectors can be attached by
    # position; the previous label-based ``df.loc[j+1]`` write shifted every
    # embedding onto the wrong row and dropped rows via the later dropna().
    df = df[df["n_tokens"] < 8000].reset_index(drop=True)
    if df.empty:
        raise ValueError("No rows are below the 8000-token limit.")

    batch_size = 2000
    texts = df["combined"].tolist()
    vectors = []
    for start in range(0, len(texts), batch_size):
        response = get_embedding_custom(
            texts[start:start + batch_size], model="text-embedding-ada-002"
        )
        # Each returned item carries the index of its input; sort on it so
        # vectors line up with ``texts`` regardless of response order.
        ordered = sorted(response.data, key=lambda item: item["index"])
        vectors.extend(item["embedding"] for item in ordered)

    if len(vectors) != len(df):
        raise ValueError(
            f"Expected {len(df)} embeddings but received {len(vectors)}."
        )
    df["ada_vector"] = pd.Series(vectors, index=df.index, dtype=object)

    print("Saving to CSV")
    df.to_csv("embeddings_ada.csv", index=False)
    print("CSV saved as embeddings_ada.csv ")

    # Convert after saving so the CSV holds plain lists; no str/eval round
    # trip is needed to get arrays for the similarity computation.
    df["ada_vector"] = df["ada_vector"].apply(np.array)

    return df
67
+
68
+
69
def semantic_search(df, query):
    """Answer ``query`` using the rows of ``df`` most similar to it.

    The query is embedded, compared by cosine similarity against every
    vector in ``df.ada_vector``, and the ``combined`` text of the three
    best-scoring rows is placed into the completion prompt.

    Args:
        df: DataFrame with ``ada_vector`` and ``combined`` columns, as
            produced by ``create_and_upsert``. It is not modified.
        query: The user's question.

    Returns:
        The ``text`` of the first completion choice.
    """
    query_vector = get_embedding_custom(query).data[0]["embedding"]

    # Keep the scores in a local Series instead of writing a column onto
    # ``df``: the frame is the shared module-level cache, and mutating it on
    # every request lets overlapping queries overwrite each other's scores.
    scores = df["ada_vector"].apply(
        lambda vector: cosine_similarity(vector, query_vector)
    )
    best = scores.sort_values(ascending=False).head(3)
    contexts = df.loc[best.index, "combined"].tolist()

    response = openai.Completion.create(
        prompt=construct_prompt(query, contexts),
        **COMPLETIONS_API_PARAMS
    )
    return response.choices[0]["text"]
81
+
82
+
83
# UI components. The top-level ``gr.File`` / ``gr.Textbox`` classes replace the
# deprecated ``gr.inputs`` / ``gr.outputs`` namespaces used previously.
csv_input = gr.File(label="CSV File")
query_input = gr.Textbox(label="Search Query")
answer_output = gr.Textbox(label="Answer")
86
+
87
df = None  # Embedded DataFrame built from the most recently processed CSV.
_df_source = None  # Path of the upload that ``df`` was built from.


def process_csv_and_search(file, query):
    """Embed the uploaded CSV when needed, then answer ``query`` from it.

    The embedded DataFrame is cached in the module-level ``df`` so repeated
    questions about the same upload do not re-embed it. The cache is rebuilt
    when a different file is supplied; previously the first upload was kept
    forever and later uploads were silently ignored.

    Args:
        file: The uploaded CSV (an object exposing its path as ``.name``, or
            a path), or ``None`` when nothing was uploaded.
        query: The user's question.

    Returns:
        The generated answer, or a short prompt to upload a file when there
        is neither an upload nor a cached DataFrame to search.
    """
    global df, _df_source

    if file is None:
        if df is None:
            return "Please upload a CSV file."
        # No new upload: keep answering from the cached data.
        return semantic_search(df, query)

    source = getattr(file, "name", file)
    if df is None or source != _df_source:
        df = create_and_upsert(file)
        _df_source = source

    return semantic_search(df, query)
95
+
96
# Wire the handler to its input/output components and start the app.
demo = gr.Interface(
    process_csv_and_search,
    [csv_input, query_input],
    answer_output,
    title="CSV Search App",
)
demo.launch()