"""
This program demonstrates how OpenAI's ChatGPT language model can be used to answer questions in specific domain areas. 
The program asks a user for a question in a prescribed domain area.  The program then compares the user's query against 
pre-loaded domain content to identify the most useful sections of content. The program answers the question by leveraging 
ChatGPT's powerful general capabilities with the newly incorporated domain knowledge.  Such an approach might be used, 
for example, to provide a customized chatbot for an insurance company's customers, where the company's policy materials 
are brought in as domain content.  For this example, I compiled the 2023 investment outlook summaries posted on the websites of 
Morgan Stanley (https://www.morganstanley.com/ideas/global-investment-strategy-outlook-2023), 
JPMorgan (https://www.jpmorgan.com/insights/research/market-outlook) and 
Goldman Sachs (https://www.goldmansachs.com/insights/pages/gs-research/macro-outlook-2023-this-cycle-is-different/report.pdf).  
Far more robust domain-specific responses are possible with further customization/retraining of ChatGPT.
"""

################################# LOAD LIBRARIES/IMPORTS #########################################

# !pip install openai
# !pip install transformers
# !pip install gradio
# !pip install PyPDF2
# !pip install python-docx
# !pip install pandas


import docx
import pandas as pd
import numpy as np
import openai
import gradio as gr
import pickle
import os
from transformers import GPT2TokenizerFast
# import openai_secret_manager

################################# VARIABLES #########################################

USE_INTERFACE = True  # Change to False if you want to run the code without the Gradio interface, and instead see a single pre-supplied question 
filepath = '2023_investment_outlook.docx' 
                            # Path to document containing domain content.  Initial cleaning of domain content 
                            # can be done inside (eg, using Python) or outside (eg, using Word) this program,
                            # depending on needs and circumstances. 
# emb_filepath = 'PATH HERE'  # Path to document containing saved content embeddings, if applicable 
COMPLETIONS_MODEL = "text-davinci-003" 
# Get the value of confidential OpenAI API key; register at OpenAI for keys
openai.api_key = os.environ["API-KEY"]
MODEL_NAME = "curie"
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"
MAX_SECTION_LEN = 1100  # The API limits total tokens -- for the prompt containing the question and domain-specific content and the answer -- to 2048 tokens, or about 1500 words.  
SEPARATOR = "\n* "  # Separator inserted between the selected context sections in the prompt: a newline, an asterisk, and a space.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}
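
# Rough token budget implied by the settings above (illustrative arithmetic, not an exact API rule):
#   ~1100 tokens of context (MAX_SECTION_LEN) + ~300 tokens for the answer (max_tokens)
#   + roughly 100 tokens for the header, separators and question
#   comes to about 1500 tokens, under the 2048-token limit noted for MAX_SECTION_LEN above.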

################################# FUNCTIONS #########################################

def load_text(filepath):
  """
  Loads a Microsoft Word document and returns a DataFrame containing the text of each paragraph in the document.

  Input:
    filepath (str): the filepath to the Microsoft Word document.
    
  Returns:
    df (pandas.DataFrame): a DataFrame containing the 'content' column with the text of each paragraph in the document.
  """
  # Open the Word document
  doc = docx.Document(filepath)

  # Create an empty pandas DataFrame
  df = pd.DataFrame()

  # Iterate through the paragraphs in the document and add each to the df
  for i, p in enumerate(doc.paragraphs):

      # Add the paragraph text [and index to the DataFrame]    
      df.loc[i, 'content'] = p.text
      # df.loc[i, 'paragraph_index'] = i

  # Delete empty paragraphs
  df['content'] = df['content'].replace('', np.nan)
  df = df.dropna(axis=0, subset=['content']).reset_index(drop=True)

  return df
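
# Example usage (hypothetical filename; uncomment to try with your own document):
# sample_df = load_text('sample_outlook.docx')
# print(sample_df.shape)           # (number of non-empty paragraphs, 1)
# print(sample_df['content'][0])   # text of the first paragraph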
    
def count_tokens(text):
    """Count the number of tokens in a string."""
    return len(tokenizer.encode(text))

def truncate_text(df):
    """
    Truncates the text in the 'content' column of the input DataFrame if the number of tokens 
    in the text exceeds a specified maximum number. It will set the truncated text and the 
    number of tokens in the 'content' and 'tokens' columns, respectively.

    Input:
    df (pandas.DataFrame): a DataFrame containing the 'content' column

    Returns:
    df (pandas.DataFrame): the input DataFrame with modified 'content' and 'tokens' columns.

    """
    for i in range(len(df)):
        if df['tokens'][i] > 590:
            text = df['content'][i]
            tokens = tokenizer.encode(text)
            truncated_tokens = tokens[:590]
            truncated_text = tokenizer.decode(truncated_tokens)
            df.at[i, 'content'] = truncated_text
            df.at[i, 'tokens'] = len(truncated_tokens)
    return df
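
# Quick check (a sketch, assuming the DataFrame already has 'content' and 'tokens' columns):
# df = truncate_text(df)
# assert df['tokens'].max() <= 590   # no paragraph should exceed the 590-token cap used above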

 
def get_embedding(text, model): 
    """
    Generates an embedding for the given text using the specified OpenAI model.
    
    Args:
        text (str): The text for which to generate an embedding.
        model (str): The name of the OpenAI model to use for generating the embedding.
    
    Returns:
        numpy.ndarray: The embedding for the given text.
    """
    result = openai.Embedding.create(
      model=model,
      input=[text]
    )
    return result["data"][0]["embedding"]

def get_doc_embedding(text):
    """
    Generates an embedding for the given text using the OpenAI document embeddings model.
    
    Args:
        text (str): The text for which to generate an embedding.
    
    Returns:
        numpy.ndarray: The embedding for the given text.
    """
    return get_embedding(text, DOC_EMBEDDINGS_MODEL)

def get_query_embedding(text):
    """
    Generates an embedding for the given text using the OpenAI query embeddings model.
    
    Args:
        text (str): The text for which to generate an embedding.
    
    Returns:
        numpy.ndarray: The embedding for the given text.
    """
    return get_embedding(text, QUERY_EMBEDDINGS_MODEL)

def compute_doc_embeddings(df): 
    """
    Generate embeddings for each row in a Pandas DataFrame using the OpenAI document embeddings model.
    
    Args:
        df (pandas.DataFrame): The DataFrame for which to generate embeddings.
    
    Returns:
        dict: A dictionary mapping each row index to the embedding vector for that row's content.
    """
    return {
        idx: get_doc_embedding(r.content.replace("\n", " ")) for idx, r in df.iterrows()  # r refers to each row
    }
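
# Example (one API call per row, so try it on a small slice first; uncomment to run):
# embeddings_preview = compute_doc_embeddings(df[:3])
# print(list(embeddings_preview.keys()))   # -> [0, 1, 2]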

def load_embeddings(fname): 
    """
    Load document embeddings and their keys from a CSV file.  Only if embeddings are pre-loaded.
    
    Args:
        fname (str): The path to the CSV file. The file must have exactly these named columns: 
            "title", "heading", "0", "1", ... up to the length of the embedding vectors.
    
    Returns:
        dict: A dictionary mapping (title, heading) tuples to their embedding vectors.
    """
    
    df = pd.read_csv(fname, header=0)
    max_dim = max([int(c) for c in df.columns if c != "title" and c != "heading"])
    return {
           (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)] for _, r in df.iterrows()
    }
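
# Expected CSV layout for load_embeddings (illustrative; one numbered column per embedding dimension):
#   title,heading,0,1,2,...,4095
#   Outlook2023,Inflation,0.0123,-0.0456,...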

def vector_similarity(x, y):
    """
    Calculate the similarity between two vectors using dot product.
    
    Args:
        x (iterable): The first vector.
        y (iterable): The second vector.
    
    Returns:
        float: The dot product of the two vectors.
    """
    return np.dot(np.array(x), np.array(y))
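
# Worked example: the dot product is used as the similarity score; for embeddings normalized to
# unit length it coincides with cosine similarity.
# vector_similarity([1.0, 0.0], [0.6, 0.8])   # -> 0.6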

def order_document_sections_by_query_similarity(query, contexts):
  """
  Find the query embedding for the given query, and compare it against all of the pre-calculated document embeddings
  to find the most relevant sections. 
   
  Args:
      query (str): The query for which to find relevant document sections.
      contexts (dict): A dictionary mapping document section indices to their embedding vectors.
    
  Returns:
      list: A list of tuples, each containing the similarity score and index of a document section, sorted in descending
      order of relevance.
  """
  query_embedding = get_query_embedding(query)
  document_similarities = sorted([(vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
  ], reverse=True)
    
  return document_similarities
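
# Example return shape (illustrative scores and indices; the best-matching section comes first):
# order_document_sections_by_query_similarity("What about inflation?", document_embeddings)
# -> [(0.42, 17), (0.39, 3), (0.31, 25), ...]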
    
def construct_prompt(question, context_embeddings, df):
    """
    Construct a prompt for answering a question using the most relevant document sections.
    
    Args:
      question (str): The question to answer.
      context_embeddings (dict): A dictionary mapping document section indices to their embedding vectors.
      df (pandas.DataFrame): A DataFrame containing the document sections.
    
    Returns:
      str: The prompt, including the question and the relevant context.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
     
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = df.loc[section_index]
        
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
            
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
            
    # # Useful diagnostic information  -- FOR TESTING PURPOSES
    # print(f"Selected {len(chosen_sections)} document sections:")
    # print("\n".join(chosen_sections_indexes))
    
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "Sorry, I don't know."\n\nContext:\n"""

    full_prompt = header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

    # print(full_prompt) # FOR TESTING PURPOSES
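
    # The assembled prompt has roughly this shape (illustrative):
    #   Answer the question as truthfully as possible using the provided context, and if the
    #   answer is not contained within the text below, say "Sorry, I don't know."
    #
    #   Context:
    #   * <most relevant section>
    #   * <next most relevant section>
    #
    #    Q: <question>
    #    A: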

    return full_prompt
    

def answer_query_with_context(
    query,
    df,
    document_embeddings,
    show_prompt: bool = False):
    """
    Answer a query using relevant context from a DataFrame.
    
    Args:
        query (str): The query to answer.
        df (pandas.DataFrame): A DataFrame containing the document sections.
        document_embeddings (dict): A dictionary mapping document section indices to their embedding vectors.
        show_prompt (bool, optional): If `True`, print the prompt before generating a response.
    
    Returns:
        str: The generated response to the query.
    """   
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )
    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
                prompt=prompt,
                **COMPLETIONS_API_PARAMS
            )

    return response["choices"][0]["text"].strip(" \n")

######################### MAIN PROGRAM #########################################

# Load the text into dataframe 
df = load_text(filepath)
# print(df.head()) # FOR TESTING PURPOSES

# Count the tokens 
df = df.copy()    
df['tokens'] = df['content'].apply(count_tokens)

# print(df.head(10))   # FOR TESTING PURPOSES 
# print(df['content'][3])   # FOR TESTING PURPOSES

# Call the truncate_text function on the dataframe  
df = df.copy()    
df = truncate_text(df)

# print(df.head(10))  # FOR TESTING PURPOSES
# print(df['content'][3])  # FOR TESTING PURPOSES

# Use the code below only if importing embeddings from a file, rather than creating them in real time through the OpenAI API
# document_embeddings = load_embeddings(emb_filepath)

# Use code below if calculating the embeddings in real time via OpenAI API
document_embeddings = compute_doc_embeddings(df[:33])  # Can limit the size (eg, df[:10]) if you run into the limit on free-of-charge usage

# Inspect an example embedding; these embeddings have 4096 dimensions -- FOR TESTING ONLY
# example_entry = list(document_embeddings.items())[4]
# print(example_entry)
# print ("Length of example embedding =  ", len(example_entry[1]))

if USE_INTERFACE:
    demo = gr.Interface(
    fn=lambda query: answer_query_with_context(query, df, document_embeddings),
    inputs=gr.Textbox(lines=2,  label="Query", placeholder="Type Question Here..."),
    outputs=gr.Textbox(lines=2, label="Answer"),
    description="Example of a domain-specific chatbot, using ChatGPT with supplemental content added.<br>\
                  Here, the content relates to the investment outlook for 2023, according to Morgan Stanley, JPMorgan and Goldman Sachs.<br>\
                  Sample queries: What is Goldman's outlook for inflation? What about the bond market? What does JPMorgan think about 2023?<br>\
                  NOTE: High-level demo only. Supplemental content used here is limited to about 30 paragraphs, due to limits on free-of-charge usage of ChatGPT.<br>\
                  More robust domain-specific responses are possible.",
    title="Domain-Specific Chatbot",)
    # Launch the interface   
    demo.launch()
else:
    prompt = construct_prompt(
        'What is the outlook for inflation?',
        document_embeddings,
        df
    )

    # print("===\n", prompt) # FOR TESTING ONLY

    print(answer_query_with_context("What is Goldman's outlook for inflation?", df, document_embeddings))