"""
This program demonstrates how OpenAI's ChatGPT language model can be used to answer questions in specific domain areas. 
The program asks a user for a question in a prescribed domain area.  The program then compares the user's query against 
pre-loaded domain content to identify the most useful sections of content. The program answers the question by leveraging 
ChatGPT's powerful general capabilities with the newly incorporated domain knowledge.  Such an approach might be used, 
for example, to provide a customized chatbot for an insurance company's customers, where the company's policy materials 
are brought in as domain content.  For this example, I compiled the 2023 investment outlook summaries posted on the websites of 
Morgan Stanley (https://www.morganstanley.com/ideas/global-investment-strategy-outlook-2023), 
JPMorgan (https://www.jpmorgan.com/insights/research/market-outlook) and 
Goldman Sachs (https://www.goldmansachs.com/insights/pages/gs-research/macro-outlook-2023-this-cycle-is-different/report.pdf).  
Far more robust domain-specific responses are possible with further customization/retraining of ChatGPT.
"""

################################# LOAD LIBRARIES/IMPORTS #########################################

# !pip install openai
# !pip install transformers
# !pip install gradio
# !pip install PyPDF2
# !pip install python-docx
# !pip install pandas


import docx
import pandas as pd
import numpy as np
import openai
import gradio as gr
import pickle
import os
from transformers import GPT2TokenizerFast
# import openai_secret_manager

################################# VARIABLES #########################################

USE_INTERFACE = True  # Change to False if you want to run the code without the Gradio interface, and instead see a single pre-supplied question 
filepath = '2023_investment_outlook.docx' 
                            # Path to document containing domain content.  Initial cleaning of domain content 
                            # can be done inside (eg, using Python) or outside (eg, using Word) this program,
                            # depending on needs and circumstances. 
# emb_filepath = 'PATH HERE'  # Path to document containing saved content embeddings, if applicable 
COMPLETIONS_MODEL = "text-davinci-003" 
# Get the value of confidential OpenAI API key; register at OpenAI for keys
openai.api_key = os.environ["API-KEY"]
MODEL_NAME = "curie"
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"
MAX_SECTION_LEN = 1100  # The API limits total tokens -- for the prompt containing the question and domain-specific content and the answer -- to 2048 tokens, or about 1500 words.  
SEPARATOR = "\n* "  # Separator inserted between the selected context sections in the prompt: a newline, an asterisk, and a space.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}
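
# Rough token budget implied by the settings above (illustrative arithmetic, not an exact API rule):
#   ~1100 tokens of context (MAX_SECTION_LEN) + ~300 tokens for the answer (max_tokens)
#   + roughly 100 tokens for the header, separators and question
#   comes to about 1500 tokens, under the 2048-token limit noted for MAX_SECTION_LEN above.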

################################# FUNCTIONS #########################################

def load_text(filepath):
  """
  Loads a Microsoft Word document and returns a DataFrame containing the text of each paragraph in the document.

  Input:
    filepath (str): the filepath to the Microsoft Word document.
    
  Returns:
    df (pandas.DataFrame): a DataFrame containing the 'content' column with the text of each paragraph in the document.
  """
  # Open the Word document
  doc = docx.Document(filepath)

  # Create an empty pandas DataFrame
  df = pd.DataFrame()

  # Iterate through the paragraphs in the document and add each to the df
  for i, p in enumerate(doc.paragraphs):

      # Add the paragraph text [and index to the DataFrame]    
      df.loc[i, 'content'] = p.text
      # df.loc[i, 'paragraph_index'] = i

  # Delete empty paragraphs
  df['content'] = df['content'].replace('', np.nan)
  df = df.dropna(axis=0, subset=['content']).reset_index(drop=True)

  return df
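
# Example usage (hypothetical filename; uncomment to try with your own document):
# sample_df = load_text('sample_outlook.docx')
# print(sample_df.shape)           # (number of non-empty paragraphs, 1)
# print(sample_df['content'][0])   # text of the first paragraph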
    
def count_tokens(text):
    """Count the number of tokens in a string."""
    return len(tokenizer.encode(text))

def truncate_text(df):
    """
    Truncates the text in the 'content' column of the input DataFrame if the number of tokens 
    in the text exceeds a specified maximum number. It will set the truncated text and the 
    number of tokens in the 'content' and 'tokens' columns, respectively.

    Input:
    df (pandas.DataFrame): a DataFrame containing the 'content' column

    Returns:
    df (pandas.DataFrame): the input DataFrame with modified 'content' and 'tokens' columns.

    """
    for i in range(len(df)):
        if df['tokens'][i] > 590:
            text = df['content'][i]
            tokens = tokenizer.encode(text)
            truncated_tokens = tokens[:590]
            truncated_text = tokenizer.decode(truncated_tokens)
            df.at[i, 'content'] = truncated_text
            df.at[i, 'tokens'] = len(truncated_tokens)
    return df
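
# Quick check (a sketch, assuming the DataFrame already has 'content' and 'tokens' columns):
# df = truncate_text(df)
# assert df['tokens'].max() <= 590   # no paragraph should exceed the 590-token cap used above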

 
def get_embedding(text, model): 
    """
    Generates an embedding for the given text using the specified OpenAI model.
    
    Args:
        text (str): The text for which to generate an embedding.
        model (str): The name of the OpenAI model to use for generating the embedding.
    
    Returns:
        numpy.ndarray: The embedding for the given text.
    """
    result = openai.Embedding.create(
      model=model,
      input=[text]
    )
    return result["data"][0]["embedding"]

def get_doc_embedding(text):
    """
    Generates an embedding for the given text using the OpenAI document embeddings model.
    
    Args:
        text (str): The text for which to generate an embedding.
    
    Returns:
        numpy.ndarray: The embedding for the given text.
    """
    return get_embedding(text, DOC_EMBEDDINGS_MODEL)

def get_query_embedding(text):
    """
    Generates an embedding for the given text using the OpenAI query embeddings model.
    
    Args:
        text (str): The text for which to generate an embedding.
    
    Returns:
        numpy.ndarray: The embedding for the given text.
    """
    return get_embedding(text, QUERY_EMBEDDINGS_MODEL)

def compute_doc_embeddings(df): 
    """
    Generate embeddings for each row in a Pandas DataFrame using the OpenAI document embeddings model.
    
    Args:
        df (pandas.DataFrame): The DataFrame for which to generate embeddings.
    
    Returns:
        dict: A dictionary mapping each row index to the embedding vector for that row's content.
    """
    return {
        idx: get_doc_embedding(r.content.replace("\n", " ")) for idx, r in df.iterrows()  # r refers to each row
    }
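
# Example (one API call per row, so try it on a small slice first; uncomment to run):
# embeddings_preview = compute_doc_embeddings(df[:3])
# print(list(embeddings_preview.keys()))   # -> [0, 1, 2]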

def load_embeddings(fname): 
    """
    Load document embeddings and their keys from a CSV file.  Only if embeddings are pre-loaded.
    
    Args:
        fname (str): The path to the CSV file. The file must have exactly these named columns: 
            "title", "heading", "0", "1", ... up to the length of the embedding vectors.
    
    Returns:
        dict: A dictionary mapping (title, heading) tuples to their embedding vectors.
    """
    
    df = pd.read_csv(fname, header=0)
    max_dim = max([int(c) for c in df.columns if c != "title" and c != "heading"])
    return {
           (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)] for _, r in df.iterrows()
    }
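
# Expected CSV layout for load_embeddings (illustrative; one numbered column per embedding dimension):
#   title,heading,0,1,2,...,4095
#   Outlook2023,Inflation,0.0123,-0.0456,...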

def vector_similarity(x, y):
    """
    Calculate the similarity between two vectors using dot product.
    
    Args:
        x (iterable): The first vector.
        y (iterable): The second vector.
    
    Returns:
        float: The dot product of the two vectors.
    """
    return np.dot(np.array(x), np.array(y))
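
# Worked example: the dot product is used as the similarity score; for embeddings normalized to
# unit length it coincides with cosine similarity.
# vector_similarity([1.0, 0.0], [0.6, 0.8])   # -> 0.6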

def order_document_sections_by_query_similarity(query, contexts):
  """
  Find the query embedding for the given query, and compare it against all of the pre-calculated document embeddings
  to find the most relevant sections. 
   
  Args:
      query (str): The query for which to find relevant document sections.
      contexts (dict): A dictionary mapping document section indices to their embedding vectors.
    
  Returns:
      list: A list of tuples, each containing the similarity score and index of a document section, sorted in descending
      order of relevance.
  """
  query_embedding = get_query_embedding(query)
  document_similarities = sorted([(vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
  ], reverse=True)
    
  return document_similarities
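
# Example return shape (illustrative scores and indices; the best-matching section comes first):
# order_document_sections_by_query_similarity("What about inflation?", document_embeddings)
# -> [(0.42, 17), (0.39, 3), (0.31, 25), ...]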
    
def construct_prompt(question, context_embeddings, df):
    """
    Construct a prompt for answering a question using the most relevant document sections.
    
    Args:
      question (str): The question to answer.
      context_embeddings (dict): A dictionary mapping document section indices to their embedding vectors.
      df (pandas.DataFrame): A DataFrame containing the document sections.
    
    Returns:
      str: The prompt, including the question and the relevant context.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
     
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = df.loc[section_index]
        
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
            
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
            
    # # Useful diagnostic information  -- FOR TESTING PURPOSES
    # print(f"Selected {len(chosen_sections)} document sections:")
    # print("\n".join(chosen_sections_indexes))
    
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "Sorry, I don't know."\n\nContext:\n"""

    full_prompt = header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

    # print(full_prompt) # FOR TESTING PURPOSES
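
    # The assembled prompt has roughly this shape (illustrative):
    #   Answer the question as truthfully as possible using the provided context, and if the
    #   answer is not contained within the text below, say "Sorry, I don't know."
    #
    #   Context:
    #   * <most relevant section>
    #   * <next most relevant section>
    #
    #    Q: <question>
    #    A: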

    return full_prompt
    

def answer_query_with_context(
    query,
    df,
    document_embeddings,
    show_prompt: bool = False):
    """
    Answer a query using relevant context from a DataFrame.
    
    Args:
        query (str): The query to answer.
        df (pandas.DataFrame): A DataFrame containing the document sections.
        document_embeddings (dict): A dictionary mapping document section indices to their embedding vectors.
        show_prompt (bool, optional): If `True`, print the prompt before generating a response.
    
    Returns:
        str: The generated response to the query.
    """   
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )
    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
                prompt=prompt,
                **COMPLETIONS_API_PARAMS
            )

    return response["choices"][0]["text"].strip(" \n")

######################### MAIN PROGRAM #########################################

# Load the text into dataframe 
df = load_text(filepath)
# print(df.head()) # FOR TESTING PURPOSES

# Count the tokens 
df = df.copy()    
df['tokens'] = df['content'].apply(count_tokens)

# print(df.head(10))   # FOR TESTING PURPOSES 
# print(df['content'][3])   # FOR TESTING PURPOSES

# Call the truncate_text function on the dataframe  
df = df.copy()    
df = truncate_text(df)

# print(df.head(10))  # FOR TESTING PURPOSES
# print(df['content'][3])  # FOR TESTING PURPOSES

# Use the code below only if importing embeddings from a file, rather than creating them in real time through the OpenAI API
# document_embeddings = load_embeddings(emb_filepath)

# Use code below if calculating the embeddings in real time via OpenAI API
document_embeddings = compute_doc_embeddings(df[:33])  # Can limit the size (eg, df[:10]) if you run into the limit on free-of-charge usage

# Inspect an example embedding; these embeddings have 4096 dimensions -- FOR TESTING ONLY
# example_entry = list(document_embeddings.items())[4]
# print(example_entry)
# print ("Length of example embedding =  ", len(example_entry[1]))

if USE_INTERFACE:
    demo = gr.Interface(
    fn=lambda query: answer_query_with_context(query, df, document_embeddings),
    inputs=gr.Textbox(lines=2,  label="Query", placeholder="Type Question Here..."),
    outputs=gr.Textbox(lines=2, label="Answer"),
    description="Example of a domain-specific chatbot, using ChatGPT with supplemental content added.<br>\
                  Here, the content relates to the investment outlook for 2023, according to Morgan Stanley, JPMorgan and Goldman Sachs.<br>\
                  Sample queries: What is Goldman's outlook for inflation? What about the bond market? What does JPMorgan think about 2023?<br>\
                  NOTE: High-level demo only. Supplemental content used here is limited to about 30 paragraphs, due to limits on free-of-charge usage of ChatGPT.<br>\
                  More robust domain-specific responses are possible.",
    title="Domain-Specific Chatbot",)
    # Launch the interface   
    demo.launch()
else:
    prompt = construct_prompt(
        'What is the outlook for inflation?',
        document_embeddings,
        df
    )

    # print("===\n", prompt) # FOR TESTING ONLY

    print(answer_query_with_context("What is Goldman's outlook for inflation?", df, document_embeddings))