import os
import json
import time

import numpy as np
import openai
import pandas as pd

GPT_MODEL_AUGMENT = "gpt-3.5-turbo-16k"
GPT_MODEL_ANSWER = "gpt-3.5-turbo-16k"
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
def get_embedding(text, model="text-embedding-ada-002"):
    # Embeddings work better without raw newlines in the input text.
    if isinstance(text, str):
        text = text.replace("\n", " ")
    try:
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except openai.OpenAIError:
        # Back off briefly and retry once instead of silently returning None.
        time.sleep(2)
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
def augment_query(query):
    SYS_PROMPT = """
    On [current date: 19 July] Generate a JSON response with the following structure:
    {
        "timestamps": # Relevant timestamps in which to get data to answer the query,
        "query": # Repeat the user's query,
    }
    Allowed timestamps:
    ['2018-07-18', '2018-07-19', '2018-07-08', '2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12', '2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16', '2018-07-17']
    Ensure the output is always in JSON format and never provide any other response.
    """
    response = openai.chat.completions.create(
        model=GPT_MODEL_AUGMENT,
        messages=[
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": query},
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response
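# Illustrative (hypothetical) output of augment_query for a question about 15 July:
# '{"timestamps": ["2018-07-15"], "query": "What were people tweeting about on 15 July?"}'
# get_relevant_documents() parses this JSON string to decide which days to search.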
def semantic_search(df_loc, query, timestamp, nb_programs_to_display=15):
    timestamp = str(timestamp).strip()
    embedding = get_embedding(query, model='text-embedding-ada-002')
    # Only keep documents from the requested day.
    filtered_df = df_loc[df_loc["timestamp"] == timestamp].drop(columns=["url"])

    def wrap_cos(x, y):
        # Guard against missing or malformed embeddings.
        try:
            res = cosine_similarity(x, y)
        except Exception:
            res = 0
        return res

    filtered_df['similarity'] = filtered_df.embedding.apply(lambda x: wrap_cos(x, embedding))
    results = filtered_df.sort_values('similarity', ascending=False).head(nb_programs_to_display)
    return results
def get_relevant_documents(df, query, nb_programs_to_display=15):
    # The augmented query is a JSON string; parse it instead of using eval().
    query = json.loads(query)
    all_retrieved = []
    for timestamp in query["timestamps"]:
        all_retrieved.append({
            "timestamp": timestamp,
            "tweets": semantic_search(df, query["query"], timestamp, nb_programs_to_display=nb_programs_to_display)
        })
    return all_retrieved
def get_final_answer(relevant_documents, query):
    # Flatten the retrieved tweets into a plain-text context block, one day at a time.
    context = ""
    for relevant_timestamp in relevant_documents:
        list_tweets = relevant_timestamp["tweets"]
        context += (
            "\nTimestamp: " + relevant_timestamp["timestamp"]
            + "\nList of tweets:\n"
            + str((list_tweets["text"] + " --- Tweeted by: @" + list_tweets["source"] + " \n").to_list())
            + "\n---"
        )
    USER_PROMPT = f"""
    We have provided context information below.
    ---------------------
    {context}
    ---------------------
    Given this information, please answer the question: {query}
    """
    response = openai.chat.completions.create(
        model=GPT_MODEL_ANSWER,
        messages=[
            {"role": "user", "content": USER_PROMPT}
        ],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response
def get_answer(query, df, api_key):
    """This traditional RAG approach is implemented without dedicated libraries and consists of three steps.
    It starts by augmenting the query, then performs a semantic search using the augmented query,
    and finally combines the augmented query and the retrieved documents into an answer.
    Args:
        query (String): Query given by the user
        df (pd.DataFrame): Corpus with precomputed embeddings
        api_key (String): OpenAI API key
    Returns:
        String: Answer to the original query
    """
    openai.api_key = api_key
    augmented_query = augment_query(query)
    relevant_documents = get_relevant_documents(df, augmented_query, nb_programs_to_display=10)
    response = get_final_answer(relevant_documents, augmented_query)
    return response
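# Minimal usage sketch (hypothetical, not part of the original app): the corpus DataFrame
# is assumed to have "timestamp", "text", "source", "url", and "embedding" columns, with
# "embedding" holding precomputed text-embedding-ada-002 vectors. The file name, query,
# and API key below are illustrative placeholders.
if __name__ == "__main__":
    df = pd.read_pickle("tweets_with_embeddings.pkl")  # hypothetical corpus file
    answer = get_answer(
        "What were people tweeting about on 15 July?",  # example user query
        df,
        api_key="sk-...",  # your OpenAI API key
    )
    print(answer)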