Spaces:
Sleeping
Sleeping
import openai
import pandas as pd
import numpy as np
# NOTE(review): openai.embeddings_utils was removed in openai>=1.0; this file
# targets the legacy 0.x SDK (Embedding.create / Completion.create) — confirm
# the pinned openai version before upgrading.
from openai.embeddings_utils import cosine_similarity
from transformers import GPT2TokenizerFast
import os
import gradio as gr

# API key comes from the environment; None if unset (API calls will then fail).
openai.api_key = os.environ.get("OPENAI_API_KEY")

# GPT-2 BPE tokenizer — used only to count tokens per row before embedding.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Shared keyword arguments for openai.Completion.create (see semantic_search).
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    # Legacy completions model; only available via the pre-1.0 openai SDK.
    "model": "text-davinci-003",
}
def get_embedding_custom(text, model="text-embedding-ada-002"):
    """Request embeddings for *text* (a single string or a list of strings).

    Returns the raw OpenAI response object; callers extract the vectors
    themselves via ``response.data[i]['embedding']``.
    """
    response = openai.Embedding.create(input=text, model=model)
    return response
def construct_prompt(query, xc, max_sections=3):
    """Build a completion prompt from a query and retrieved context passages.

    Parameters
    ----------
    query : str
        The user's question.
    xc : sequence of str
        Context passages, best match first.
    max_sections : int, optional
        Maximum number of passages to include (default 3, matching the
        original hard-coded behavior).

    Returns
    -------
    str
        Instruction header, the context passages, then "Q: ... A:".
    """
    # BUGFIX: the original indexed xc[0], xc[1], xc[2] unconditionally and
    # raised IndexError when fewer than 3 passages were retrieved; slicing
    # is safe for any length.
    context = ''.join(passage + '\n' for passage in xc[:max_sections])
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    header += context + "\n\n Q: " + query + "\n A:"
    return header
def create_and_upsert(file):
    """Load a CSV, embed every row with ada-002, and save embeddings_ada.csv.

    Parameters
    ----------
    file : object with a ``.name`` path attribute (Gradio File upload).

    Returns
    -------
    pandas.DataFrame
        Input rows augmented with 'combined' (text to embed), 'n_tokens'
        and 'ada_vector' (numpy array) columns.
    """
    df = pd.read_csv(file.name)
    df = df.dropna()
    df = df.head(50000)

    # Combine the values from all columns into one text field to embed.
    column_names = list(df.columns)
    df['combined'] = "Title: " + df[column_names].astype(str).apply(lambda x: '; '.join(x), axis=1)
    # Remove leading/trailing whitespace from the combined text.
    df['combined'] = df['combined'].str.strip()

    # Drop rows too long for the embedding model's context window.
    df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))
    df = df[df.n_tokens < 8000]

    # BUGFIX: after dropna()/filtering the index is non-contiguous, and the
    # old loop wrote embeddings with df.loc[j+1, ...] — label-based AND
    # shifted by one — so every embedding landed on the wrong row, row 0
    # never got one, and the final dropna() silently discarded it. Reset
    # the index so batch offsets line up with row labels exactly.
    df = df.reset_index(drop=True)

    batch_size = 2000
    texts = list(df.combined)
    for start in range(0, len(texts), batch_size):
        end = min(start + batch_size, len(texts))
        responses = get_embedding_custom(texts[start:end], model='text-embedding-ada-002')
        for offset, item in enumerate(responses.data):
            # Store as a string so the frame round-trips through CSV.
            df.loc[start + offset, 'ada_vector'] = str(item['embedding'])

    print("Saving to CSV")
    df = df.dropna()
    df.to_csv('embeddings_ada.csv', index=False)
    print("CSV saved as embeddings_ada.csv ")

    # NOTE(review): eval() on CSV-sourced text executes arbitrary code for an
    # untrusted file; json.loads would be a safer parser for these lists.
    df['ada_vector'] = df.ada_vector.apply(eval).apply(np.array)
    return df
def semantic_search(df, query):
    """Answer *query* using the most similar rows of *df*.

    Embeds the query, ranks rows by cosine similarity against the
    precomputed 'ada_vector' column (adding a 'similarities' column as a
    side effect), and feeds the top 3 passages to the completions model.
    Returns the model's answer text.
    """
    query_vec = get_embedding_custom(query).data[0]['embedding']
    df['similarities'] = df.ada_vector.apply(lambda vec: cosine_similarity(vec, query_vec))
    top_rows = df.sort_values('similarities', ascending=False).head(3)
    passages = list(top_rows.combined)
    completion = openai.Completion.create(
        prompt=construct_prompt(query, passages),
        **COMPLETIONS_API_PARAMS
    )
    return completion.choices[0]['text']
# Gradio 2.x-style component declarations.
# NOTE(review): gr.inputs / gr.outputs were removed in Gradio 3 — this file
# targets the legacy API; confirm the pinned gradio version.
csv_input = gr.inputs.File(label="CSV File")
query_input = gr.inputs.Textbox(label="Search Query")
answer_output = gr.outputs.Textbox(label="Answer")

df = None  # Placeholder for the processed DataFrame (cached across queries)
def process_csv_and_search(file, query):
    """Gradio handler: embed the CSV on first call, then answer the query.

    The embedded DataFrame is cached in the module-level ``df`` so the
    expensive embedding pass runs only once per process; subsequent
    queries reuse it (later file uploads are ignored).
    """
    global df
    if df is None:
        df = create_and_upsert(file)
    return semantic_search(df, query)
# Wire the components to the handler and start the web UI (blocks until exit).
gr.Interface(
    fn=process_csv_and_search,
    inputs=[csv_input, query_input],
    outputs=answer_output,
    title="CSV Search App"
).launch()