Spaces:

KanvaBhatia
/

csv_semantic_search

Sleeping

App Files Files Community

KanvaBhatia committed on May 25, 2023

Commit

e9a6181

1 Parent(s): 88a24e9

Create app.py

Browse files

Files changed (1) hide show

app.py +101 -0

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import openai
+import pandas as pd
+import numpy as np
+from openai.embeddings_utils import cosine_similarity
+from transformers import GPT2TokenizerFast
+import os
+import gradio as gr
+# The OpenAI key comes from the environment; it is None when the variable is unset.
+openai.api_key = os.getenv("OPENAI_API_KEY")
+# GPT-2 tokenizer, used further down to count the tokens of each combined row.
+tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+# Keyword arguments forwarded to every openai.Completion.create call.
+COMPLETIONS_API_PARAMS = dict(
+  # A temperature of 0.0 gives the most predictable, factual answer.
+  temperature=0.0,
+  max_tokens=300,
+  model="text-davinci-003",
+)
+def get_embedding_custom(text, model = "text-embedding-ada-002"):
+  """Request embeddings for `text` from the OpenAI Embedding endpoint.
+
+  `text` is forwarded unchanged as the `input` argument (callers in this
+  file pass either one string or a list of strings) and `model` names the
+  embedding model. The raw response of `openai.Embedding.create` is returned.
+  """
+  request = {"input": text, "model": model}
+  return openai.Embedding.create(**request)
+def construct_prompt(query, xc):
+  """Build the completion prompt from a question and its context passages.
+
+  Args:
+    query: The user's question, appended after the context as "Q: ...".
+    xc: Sequence of context strings, best match first. At most the first
+      three are used; fewer (or none) are accepted.
+
+  Returns:
+    The full prompt string: instructions, one context passage per line,
+    then the question and an open "A:" for the model to complete.
+  """
+  # Slice rather than index 0..2 so that fewer than three passages no longer
+  # raises IndexError; with three or more the output is unchanged.
+  context = ''.join(passage + '\n' for passage in xc[:3])
+  header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
+  return header + context + "\n\n Q: " + query + "\n A:"
+def create_and_upsert(file):
+  """Load a CSV, embed every row with OpenAI, save and return the result.
+
+  Args:
+    file: Object whose `.name` attribute is the path of the CSV to read.
+
+  Returns:
+    A DataFrame with the original columns plus:
+      combined   - "Title: " followed by all column values joined with "; ".
+      n_tokens   - GPT-2 token count of `combined`.
+      ada_vector - the row's embedding as a numpy array.
+
+  Side effects:
+    Writes `embeddings_ada.csv` to the current working directory (vectors are
+    stored as their list string) and prints progress messages.
+  """
+  df = pd.read_csv(file.name)
+  df = df.dropna()
+  df = df.head(50000)
+  # Every column of the CSV contributes to the text that gets embedded.
+  column_names = list(df.columns)
+  df['combined'] = "Title: " + df[column_names].astype(str).apply(lambda x: '; '.join(x), axis=1)
+  # Remove leading/trailing whitespace from the 'combined' column
+  df['combined'] = df['combined'].str.strip()
+  df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))
+  # Rows of 8000 tokens or more are dropped (presumably to stay within the
+  # embedding model's input limit - TODO confirm). The index is rebuilt so it
+  # is contiguous after the dropna/filter steps above.
+  df = df[df.n_tokens < 8000].reset_index(drop=True)
+  # Materialise the texts once instead of rebuilding the list for every batch.
+  texts = list(df.combined)
+  batch_size = 2000
+  embeddings = []
+  for start in range(0, len(texts), batch_size):
+    responses = get_embedding_custom(texts[start:start + batch_size], model = 'text-embedding-ada-002')
+    embeddings.extend(item['embedding'] for item in responses.data)
+  # Vectors are attached by position. The previous label-based write to
+  # df.loc[j+1] shifted every embedding onto the following row, lost the first
+  # row, and created phantom rows whenever the index had gaps.
+  print("Saving to CSV")
+  df['ada_vector'] = [str(vector) for vector in embeddings]
+  df.to_csv('embeddings_ada.csv', index = False)
+  print("CSV saved as embeddings_ada.csv ")
+  # Build the arrays straight from the API lists; no str -> eval round trip.
+  df['ada_vector'] = pd.Series(embeddings, index=df.index, dtype=object).apply(np.array)
+  return df
+def semantic_search(df, query):
+  """Answer `query` from the three rows of `df` most similar to it.
+
+  The query is embedded, compared by cosine similarity with every
+  `ada_vector` in `df` (the scores are written to a `similarities` column on
+  `df` itself), and the `combined` text of the three best rows is used as the
+  context of a completion request. Returns the text of the first completion
+  choice.
+  """
+  query_response = get_embedding_custom(query)
+  searchvector = query_response.data[0]['embedding']
+  df['similarities'] = df.ada_vector.apply(lambda vector: cosine_similarity(vector, searchvector))
+  ranked = df.sort_values('similarities', ascending=False)
+  top_passages = list(ranked.head(3).combined)
+  prompt = construct_prompt(query, top_passages)
+  completion = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
+  return completion.choices[0]['text']
+# Processed DataFrame; stays None until the first CSV has been embedded.
+df = None
+# Gradio widgets: the CSV upload and the question go in, the answer comes out.
+csv_input = gr.inputs.File(label="CSV File")
+query_input = gr.inputs.Textbox(label="Search Query")
+answer_output = gr.outputs.Textbox(label="Answer")
+def process_csv_and_search(file, query):
+    """Gradio callback: embed the uploaded CSV if needed, then answer `query`.
+
+    Args:
+        file: Uploaded file object whose `.name` is the CSV path, or None when
+            nothing was uploaded.
+        query: The user's search question.
+
+    Returns:
+        The answer text from `semantic_search`, or a short message asking for
+        a CSV when no file was given and nothing has been processed yet.
+
+    The processed DataFrame is cached in the module-level `df`. The cache is
+    keyed on a hash of the file contents, so uploading a different CSV
+    triggers re-embedding instead of silently searching the first upload,
+    while repeated questions about the same file reuse the stored embeddings.
+    """
+    import hashlib  # local import so this block does not depend on the file header
+    global df
+    if file is None:
+        if df is None:
+            return "Please upload a CSV file."
+        # No new upload: keep answering from the previously processed CSV.
+        return semantic_search(df, query)
+    with open(file.name, "rb") as handle:
+        digest = hashlib.sha256(handle.read()).hexdigest()
+    # The digest of the last processed CSV lives on the function object.
+    if df is None or getattr(process_csv_and_search, "_csv_digest", None) != digest:
+        df = create_and_upsert(file)
+        process_csv_and_search._csv_digest = digest
+    return semantic_search(df, query)
+# Assemble the interface around the callback and start serving it.
+demo = gr.Interface(
+  fn=process_csv_and_search,
+  inputs=[csv_input, query_input],
+  outputs=answer_output,
+  title="CSV Search App",
+)
+demo.launch()