Spaces:
Sleeping
Sleeping
import openai
import pandas as pd
import numpy as np
# NOTE(review): openai.embeddings_utils was removed in openai>=1.0; this file
# targets the legacy 0.x SDK (Embedding.create / Completion.create) — confirm
# the pinned openai version before upgrading.
from openai.embeddings_utils import cosine_similarity
from transformers import GPT2TokenizerFast
import os
import gradio as gr

# API key comes from the environment; None if unset (API calls will then fail).
openai.api_key = os.environ.get("OPENAI_API_KEY")

# GPT-2 BPE tokenizer — used only to count tokens per row before embedding.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Shared keyword arguments for openai.Completion.create (see semantic_search).
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    # Legacy completions model; only available via the pre-1.0 openai SDK.
    "model": "text-davinci-003",
}
def get_embedding_custom(text, model="text-embedding-ada-002"):
    """Request embeddings for *text* (a single string or a list of strings).

    Returns the raw OpenAI response object; callers extract the vectors
    themselves via ``response.data[i]['embedding']``.
    """
    response = openai.Embedding.create(input=text, model=model)
    return response
def construct_prompt(query, xc, max_sections=3):
    """Build a completion prompt from a query and retrieved context passages.

    Parameters
    ----------
    query : str
        The user's question.
    xc : sequence of str
        Context passages, best match first.
    max_sections : int, optional
        Maximum number of passages to include (default 3, matching the
        original hard-coded behavior).

    Returns
    -------
    str
        Instruction header, the context passages, then "Q: ... A:".
    """
    # BUGFIX: the original indexed xc[0], xc[1], xc[2] unconditionally and
    # raised IndexError when fewer than 3 passages were retrieved; slicing
    # is safe for any length.
    context = ''.join(passage + '\n' for passage in xc[:max_sections])
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    header += context + "\n\n Q: " + query + "\n A:"
    return header
def create_and_upsert(file):
    """Load a CSV, embed every row with ada-002, and save embeddings_ada.csv.

    Parameters
    ----------
    file : object with a ``.name`` path attribute (Gradio File upload).

    Returns
    -------
    pandas.DataFrame
        Input rows augmented with 'combined' (text to embed), 'n_tokens'
        and 'ada_vector' (numpy array) columns.
    """
    df = pd.read_csv(file.name)
    df = df.dropna()
    df = df.head(50000)

    # Combine the values from all columns into one text field to embed.
    column_names = list(df.columns)
    df['combined'] = "Title: " + df[column_names].astype(str).apply(lambda x: '; '.join(x), axis=1)
    # Remove leading/trailing whitespace from the combined text.
    df['combined'] = df['combined'].str.strip()

    # Drop rows too long for the embedding model's context window.
    df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))
    df = df[df.n_tokens < 8000]

    # BUGFIX: after dropna()/filtering the index is non-contiguous, and the
    # old loop wrote embeddings with df.loc[j+1, ...] — label-based AND
    # shifted by one — so every embedding landed on the wrong row, row 0
    # never got one, and the final dropna() silently discarded it. Reset
    # the index so batch offsets line up with row labels exactly.
    df = df.reset_index(drop=True)

    batch_size = 2000
    texts = list(df.combined)
    for start in range(0, len(texts), batch_size):
        end = min(start + batch_size, len(texts))
        responses = get_embedding_custom(texts[start:end], model='text-embedding-ada-002')
        for offset, item in enumerate(responses.data):
            # Store as a string so the frame round-trips through CSV.
            df.loc[start + offset, 'ada_vector'] = str(item['embedding'])

    print("Saving to CSV")
    df = df.dropna()
    df.to_csv('embeddings_ada.csv', index=False)
    print("CSV saved as embeddings_ada.csv ")

    # NOTE(review): eval() on CSV-sourced text executes arbitrary code for an
    # untrusted file; json.loads would be a safer parser for these lists.
    df['ada_vector'] = df.ada_vector.apply(eval).apply(np.array)
    return df
def semantic_search(df, query):
    """Answer *query* using the most similar rows of *df*.

    Embeds the query, ranks rows by cosine similarity against the
    precomputed 'ada_vector' column (adding a 'similarities' column as a
    side effect), and feeds the top 3 passages to the completions model.
    Returns the model's answer text.
    """
    query_vec = get_embedding_custom(query).data[0]['embedding']
    df['similarities'] = df.ada_vector.apply(lambda vec: cosine_similarity(vec, query_vec))
    top_rows = df.sort_values('similarities', ascending=False).head(3)
    passages = list(top_rows.combined)
    completion = openai.Completion.create(
        prompt=construct_prompt(query, passages),
        **COMPLETIONS_API_PARAMS
    )
    return completion.choices[0]['text']
# Gradio 2.x-style component declarations.
# NOTE(review): gr.inputs / gr.outputs were removed in Gradio 3 — this file
# targets the legacy API; confirm the pinned gradio version.
csv_input = gr.inputs.File(label="CSV File")
query_input = gr.inputs.Textbox(label="Search Query")
answer_output = gr.outputs.Textbox(label="Answer")

df = None  # Placeholder for the processed DataFrame (cached across queries)
def process_csv_and_search(file, query):
    """Gradio handler: embed the CSV on first call, then answer the query.

    The embedded DataFrame is cached in the module-level ``df`` so the
    expensive embedding pass runs only once per process; subsequent
    queries reuse it (later file uploads are ignored).
    """
    global df
    if df is None:
        df = create_and_upsert(file)
    return semantic_search(df, query)
# Wire the components to the handler and start the web UI (blocks until exit).
gr.Interface(
    fn=process_csv_and_search,
    inputs=[csv_input, query_input],
    outputs=answer_output,
    title="CSV Search App"
).launch()