File size: 5,538 Bytes
844cee8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import openai
import time
import numpy as np
import time
import pandas as pd

# Chat models used for the two LLM calls: query augmentation and final answer.
GPT_MODEL_AUGMENT = "gpt-3.5-turbo-16k"
GPT_MODEL_ANSWER = "gpt-3.5-turbo-16k"


def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a (array-like): First vector.
        b (array-like): Second vector.

    Returns:
        float: Cosine similarity in [-1, 1]; 0.0 when either vector has
        zero norm (the original formula would divide by zero and yield NaN).
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # Zero-length vector: similarity is undefined; treat as no match.
        return 0.0
    return float(np.dot(a, b) / denom)

def get_embedding(text, model="text-embedding-ada-002", max_retries=3):
    """Embed *text* with the OpenAI embeddings API.

    Args:
        text (str): Text to embed; newlines are collapsed to spaces
            (non-string inputs are passed through unchanged, as before).
        model (str): Embedding model name.
        max_retries (int): API attempts before giving up (original code
            tried once, slept 2s, then implicitly returned None).

    Returns:
        list[float] | None: The embedding vector, or None when every
        attempt failed; callers treat a missing embedding as similarity 0.
    """
    if isinstance(text, str):
        # Newlines can degrade embedding quality; replace them with spaces.
        text = text.replace("\n", " ")
    for _ in range(max_retries):
        try:
            return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']
        except Exception:
            # Transient API error (rate limit, network): back off and retry.
            time.sleep(2)
    return None

def augment_query(query):
    """Ask the LLM to expand *query* into a JSON payload of timestamps + query.

    Args:
        query (str): Raw user question.

    Returns:
        str: The model's raw response, expected to be a JSON string with
        'timestamps' and 'query' keys.
    """

    SYS_PROMPT = """
        On [current date: 19 July] Generate a JSON response with the following structure:

        {
        "timestamps": # Relevant timestamps in which to get data to answer the query,
        "query": # Repeat the user's query,
        }
        Allowed timestamps:
        ['2018-07-18', '2018-07-19', '2018-07-08', '2018-07-09', '2018-07-10', '2018-07-11', '2018-07-12', '2018-07-13', '2018-07-14', '2018-07-15', '2018-07-16', '2018-07-17']

        Ensure the output is always in JSON format and never provide any other response.
        """
    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": query},
    ]
    completion = openai.chat.completions.create(
        model=GPT_MODEL_AUGMENT,
        messages=messages,
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return completion.choices[0].message.content

def semantic_search(df_loc, query, timestamp, nb_programs_to_display=15):
    """Rank rows of *df_loc* for one timestamp by embedding similarity to *query*.

    Args:
        df_loc (pd.DataFrame): Corpus with 'timestamp', 'url' and 'embedding'
            columns (and, per get_final_answer, 'text' and 'source').
        query (str): Free-text query to embed and compare against.
        timestamp (str): Timestamp bucket to restrict the search to.
        nb_programs_to_display (int): Number of top matches to return.

    Returns:
        pd.DataFrame: Top matches sorted by descending 'similarity'.
    """
    timestamp = str(timestamp).strip()
    embedding = get_embedding(query, model='text-embedding-ada-002')
    # .copy() makes the slice an independent frame so the 'similarity'
    # column assignment below cannot trigger SettingWithCopyWarning or
    # silently fail on a view.
    filtered_df = df_loc[df_loc["timestamp"] == timestamp].drop(columns=["url"]).copy()

    def wrap_cos(x, y):
        # Fall back to 0 when either side is missing or malformed (e.g. the
        # query embedding is None after a failed API call).
        try:
            return cosine_similarity(x, y)
        except (TypeError, ValueError):
            return 0

    filtered_df['similarity'] = filtered_df.embedding.apply(lambda x: wrap_cos(x, embedding))
    return filtered_df.sort_values('similarity', ascending=False).head(nb_programs_to_display)

def get_relevant_documents(df, query, nb_programs_to_display=15):
    """Retrieve the top tweets for every timestamp named in the augmented query.

    Args:
        df (pd.DataFrame): Corpus with embeddings.
        query (str): JSON string produced by augment_query, with
            'timestamps' (list[str]) and 'query' (str) keys.
        nb_programs_to_display (int): Matches to keep per timestamp.

    Returns:
        list[dict]: One {'timestamp': str, 'tweets': pd.DataFrame} entry
        per requested timestamp.

    Raises:
        json.JSONDecodeError: If the LLM response is not valid JSON.
    """
    import json

    # SECURITY: the string comes from an LLM — parse it as data with
    # json.loads instead of executing it with eval().
    query = json.loads(query)
    return [
        {
            "timestamp": timestamp,
            "tweets": semantic_search(df, query["query"], timestamp,
                                      nb_programs_to_display=nb_programs_to_display),
        }
        for timestamp in query["timestamps"]
    ]

def get_final_answer(relevant_documents, query):
    """Compose the retrieved tweets into a context block and ask the LLM to answer.

    Args:
        relevant_documents (list[dict]): Output of get_relevant_documents —
            dicts with 'timestamp' and 'tweets' (a DataFrame with 'text'
            and 'source' columns).
        query (str): The (augmented) query to answer.

    Returns:
        str: The model's answer.
    """
    sections = []
    for doc in relevant_documents:
        tweets = doc["tweets"]
        formatted = (tweets["text"] + "   --- Tweeted by: @" + tweets["source"] + " \n").to_list()
        sections.append(
            "\nTimestamp: " + doc["timestamp"]
            + "\nList of tweets:\n" + str(formatted) + "\n---"
        )
    context = "".join(sections)

    USER_PROMPT = f"""
    "We have provided context information below. 
    ---------------------
    {context}
    "\n---------------------\n"
    Given this information, please answer the question: {query}
    """
    completion = openai.chat.completions.create(
        model=GPT_MODEL_ANSWER,
        messages=[{"role": "user", "content": USER_PROMPT}],
        temperature=1,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return completion.choices[0].message.content

def get_answer(query, df, api_key):
    """Traditional RAG pipeline implemented without dedicated libraries.

    It first augments the query (mapping it to the relevant timestamps),
    then performs a semantic search using the augmented query, and finally
    combines the augmented query with the retrieved documents into an answer.

    Args:
        query (String): Query given by the user
        df (pd.DataFrame()): corpus with embeddings
        api_key (String): OpenAI API key

    Returns:
        String: Answer to the original query
    """
    openai.api_key = api_key
    augmented_query = augment_query(query)
    relevant_documents = get_relevant_documents(df, augmented_query,nb_programs_to_display=10)
    response = get_final_answer(relevant_documents, augmented_query,)
    return response