# app.py — CSV semantic-search Gradio app (author: KanvaBhatia, commit e9a6181)
import ast
import os

import gradio as gr
import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import cosine_similarity
from transformers import GPT2TokenizerFast
# The API key is read from the environment (standard for HF Spaces secrets).
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Tokenizer used only for counting tokens per row below.
# NOTE(review): GPT-2 BPE is an approximation — ada-002 uses a different
# vocabulary, so counts may differ slightly; confirm the margin is acceptable.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Keyword arguments splatted into openai.Completion.create in semantic_search.
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": "text-davinci-003",
}
def get_embedding_custom(text, model="text-embedding-ada-002"):
    """Request embeddings for *text* from the OpenAI API.

    Parameters
    ----------
    text : str or list of str
        One string or a batch of strings to embed.
    model : str
        Embedding model name (defaults to ada-002).

    Returns
    -------
    The raw API response object; callers read ``.data[i]['embedding']``.
    """
    return openai.Embedding.create(model=model, input=text)
def construct_prompt(query, xc):
    """Build the completion prompt from up to three context passages.

    Parameters
    ----------
    query : str
        The user's question.
    xc : sequence of str
        Context passages, most relevant first; only the first three are used.

    Returns
    -------
    str
        Instruction header + context passages + the question, ready for the
        completions API.
    """
    # Slice instead of indexing a hard-coded range(3): the original raised
    # IndexError whenever fewer than three passages were available.
    context = ''.join(passage + '\n' for passage in xc[:3])
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    return header + context + "\n\n Q: " + query + "\n A:"
def create_and_upsert(file):
    """Read a CSV, embed every row with ada-002, and cache the result on disk.

    Parameters
    ----------
    file : uploaded-file object exposing a ``.name`` path (as gradio supplies).

    Returns
    -------
    pandas.DataFrame
        One row per usable CSV row, with a ``combined`` text column and an
        ``ada_vector`` column holding one ``np.ndarray`` embedding per row.

    Side effects: writes ``embeddings_ada.csv`` to the working directory and
    calls the OpenAI embeddings API (network, billable).
    """
    df = pd.read_csv(file.name)
    df = df.dropna()
    df = df.head(50000)  # hard cap on rows to bound API cost
    # Reset the index so label-based and positional access agree below;
    # after dropna() the original labels are no longer contiguous.
    df = df.reset_index(drop=True)
    # Combine the values from all columns into one text field per row.
    column_names = list(df.columns)
    df['combined'] = "Title: " + df[column_names].astype(str).apply(lambda x: '; '.join(x), axis=1)
    # Remove leading/trailing whitespace from the 'combined' column.
    df['combined'] = df['combined'].str.strip()
    # Token counts via the GPT-2 tokenizer; drop rows too long to embed.
    df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))
    df = df[df.n_tokens < 8000].reset_index(drop=True)
    batch_size = 2000  # rows per embeddings API request
    combined = list(df.combined)
    for i in range(0, len(combined), batch_size):
        # Find end of batch.
        i_end = min(i + batch_size, len(combined))
        responses = get_embedding_custom(combined[i:i_end], model='text-embedding-ada-002')
        # BUG FIX: the original wrote to df.loc[j + 1, ...] — label-based and
        # off by one — so row 0 never received a vector and a phantom row
        # could be created past each batch. With the reset index above,
        # df.at[j, ...] writes the k-th embedding to the j-th row exactly.
        for k, j in enumerate(range(i, i_end)):
            df.at[j, 'ada_vector'] = str(responses.data[k]['embedding'])
    print("Saving to CSV")
    df = df.dropna()
    df.to_csv('embeddings_ada.csv', index = False)
    print("CSV saved as embeddings_ada.csv ")
    # literal_eval only parses Python literals — safer than eval for the
    # stringified vectors written above.
    df['ada_vector'] = df.ada_vector.apply(ast.literal_eval).apply(np.array)
    return df
def semantic_search(df, query):
    """Answer *query* from the rows of *df* via embedding similarity + GPT.

    Parameters
    ----------
    df : pandas.DataFrame
        Must carry ``ada_vector`` (np.ndarray per row) and ``combined`` columns.
    query : str
        The user's question.

    Returns
    -------
    str
        The completion model's answer text.

    Note: mutates *df* in place by (re)writing a ``similarities`` column.
    """
    query_vec = get_embedding_custom(query).data[0]['embedding']
    df['similarities'] = df.ada_vector.apply(lambda vec: cosine_similarity(vec, query_vec))
    # Top three most similar rows become the prompt context.
    top_rows = df.sort_values('similarities', ascending=False).head(3)
    passages = list(top_rows.combined)
    completion = openai.Completion.create(
        prompt=construct_prompt(query, passages),
        **COMPLETIONS_API_PARAMS,
    )
    return completion.choices[0]['text']
# Gradio UI components.
# NOTE(review): `gr.inputs` / `gr.outputs` is the legacy (pre-3.x) namespace,
# removed in newer gradio releases — confirm the Space pins a compatible version.
csv_input = gr.inputs.File(label="CSV File")
query_input = gr.inputs.Textbox(label="Search Query")
answer_output = gr.outputs.Textbox(label="Answer")
df = None  # cached embeddings DataFrame for the last processed CSV
_cached_path = None  # path of the CSV the cache above was built from

def process_csv_and_search(file, query):
    """Gradio handler: embed the uploaded CSV (once per file) and answer *query*.

    BUG FIX: the original cached the first DataFrame forever, so uploading a
    different CSV silently kept answering from the old one. The cache is now
    invalidated whenever a file with a different path is supplied.
    """
    global df, _cached_path
    if df is None or file.name != _cached_path:
        df = create_and_upsert(file)
        _cached_path = file.name
    answer = semantic_search(df, query)
    return answer
# Wire the UI: CSV upload + query box in, the model's answer text out.
# launch() starts the gradio server and blocks; this runs at import time.
gr.Interface(
    fn=process_csv_and_search,
    inputs=[csv_input, query_input],
    outputs=answer_output,
    title="CSV Search App"
).launch()