Spaces:
Runtime error
Runtime error
| """ | |
| This program demonstrates how OpenAI's ChatGPT language model can be used to answer questions in specific domain areas. | |
| The program asks a user for a question in a prescribed domain area. The program then compares the user's query against | |
| pre-loaded domain content to identify the most useful sections of content. The program answers the question by leveraging | |
| ChatGPT's powerful general capabilities with the newly incorporated domain knowledge. Such an approach might be used, | |
| for example, to provide a customized chat box for an insurance company's customers, where the company's policy materials | |
| are brought in as domain content. For this example, I compiled the 2023 investment outlook summaries posted on the websites of | |
| Morgan Stanley (https://www.morganstanley.com/ideas/global-investment-strategy-outlook-2023), | |
| JPMorgan (https://www.jpmorgan.com/insights/research/market-outlook) and | |
| Goldman Sachs (https://www.goldmansachs.com/insights/pages/gs-research/macro-outlook-2023-this-cycle-is-different/report.pdf). | |
| Far more robust domain-specific responses are possible with further customization/retraining of ChatGPT. | |
| """ | |
| ################################# LOAD LIBRARIES/IMPORTS ######################################### | |
| # !pip install openai | |
| # ! pip install transformers | |
| # ! pip install gradio | |
| # ! pip install PyPDF2 | |
| # ! pip install python-docx | |
| # ! pip install pandas | |
| import docx | |
| import pandas as pd | |
| import numpy as np | |
| import openai | |
| import gradio as gr | |
| import pickle | |
| import os | |
| from transformers import GPT2TokenizerFast | |
| # import openai_secret_manager | |
| ################################# VARIABLES ######################################### | |
# Set to False to skip the Gradio interface and run a single pre-supplied question instead.
USE_INTERFACE = True

# Word document holding the domain content. Initial cleaning of that content can be
# done inside this program (e.g. in Python) or outside it (e.g. in Word), as needed.
filepath = '2023_investment_outlook.docx'
# emb_filepath = 'PATH HERE'  # Path to a file of saved content embeddings, if applicable

COMPLETIONS_MODEL = "text-davinci-003"

# The OpenAI API key is confidential and is read from the environment
# (register at OpenAI for keys). A missing variable raises KeyError here.
openai.api_key = os.environ["API-KEY"]

# The document and query embedding model names share one base model name.
MODEL_NAME = "curie"
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"

# Token budget for the context sections placed in the prompt. Original author's note:
# the API limits total tokens (prompt with question and domain content, plus the
# answer) to 2048 tokens, or about 1500 words.
MAX_SECTION_LEN = 1100

# Newline, asterisk, space: prefixed to every context section in the prompt.
SEPARATOR = "\n* "

# The GPT-2 tokenizer is used for all token counting in this file.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))

# Temperature 0.0 is used because it gives the most predictable, factual answer.
COMPLETIONS_API_PARAMS = dict(
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)
| ################################# FUNCTIONS ######################################### | |
def load_text(filepath):
    """
    Load a Microsoft Word document into a DataFrame with one row per non-empty paragraph.

    Input:
        filepath (str): the filepath to the Microsoft Word document.
    Returns:
        df (pandas.DataFrame): a DataFrame with a single 'content' column holding the
        text of each non-empty paragraph, in document order, indexed 0..n-1. A document
        without any non-empty paragraph yields an empty DataFrame that still has the
        'content' column.
    """
    doc = docx.Document(filepath)
    # Collect the texts first and build the frame in one step. Adding rows one
    # `.loc` assignment at a time is slow for long documents and never creates the
    # 'content' column when the document has no paragraphs at all.
    paragraphs = [p.text for p in doc.paragraphs if p.text != '']
    return pd.DataFrame({'content': paragraphs}, dtype=object)
def count_tokens(row):
    """Return how many token ids the module-level tokenizer produces for the string `row`."""
    token_ids = tokenizer.encode(row)
    return len(token_ids)
def truncate_text(df, max_tokens=590):
    """
    Truncate over-long entries of the 'content' column to at most `max_tokens` tokens.

    Rows whose 'tokens' value exceeds `max_tokens` are re-encoded with the module-level
    tokenizer, cut to the first `max_tokens` token ids and decoded back to text. Their
    'content' and 'tokens' cells are overwritten in place; other rows are left untouched.

    Input:
        df (pandas.DataFrame): a DataFrame containing the 'content' and 'tokens' columns.
        max_tokens (int, optional): the largest token count a row may keep. Defaults to 590.
    Returns:
        df (pandas.DataFrame): the same DataFrame object, with modified 'content' and
        'tokens' cells for the rows that were truncated.
    """
    content_col = df.columns.get_loc('content')
    tokens_col = df.columns.get_loc('tokens')
    # Address rows by position rather than by label, so the function also works when
    # the index is not the default 0..n-1 range (e.g. after filtering or slicing).
    over_limit = np.flatnonzero((df['tokens'] > max_tokens).to_numpy())
    for pos in over_limit:
        pos = int(pos)
        kept_tokens = tokenizer.encode(df.iat[pos, content_col])[:max_tokens]
        df.iat[pos, content_col] = tokenizer.decode(kept_tokens)
        df.iat[pos, tokens_col] = len(kept_tokens)
    return df
def get_embedding(text, model):
    """
    Request an embedding for `text` from the OpenAI Embedding endpoint.

    Args:
        text (str): The text for which to generate an embedding.
        model (str): The name of the OpenAI model to use for generating the embedding.
    Returns:
        The "embedding" field of the first item in the response's "data" list
        (presumably a list of floats -- confirm against the OpenAI API reference).
    """
    # The text is sent as a one-element batch, so the only result is at position 0.
    response = openai.Embedding.create(input=[text], model=model)
    first_item = response["data"][0]
    return first_item["embedding"]
def get_doc_embedding(text):
    """
    Embed a piece of document text with the document-side embeddings model.

    Args:
        text (str): The text for which to generate an embedding.
    Returns:
        Whatever get_embedding returns for `text` under DOC_EMBEDDINGS_MODEL.
    """
    return get_embedding(text, model=DOC_EMBEDDINGS_MODEL)
def get_query_embedding(text):
    """
    Embed a user query with the query-side embeddings model.

    Args:
        text (str): The text for which to generate an embedding.
    Returns:
        Whatever get_embedding returns for `text` under QUERY_EMBEDDINGS_MODEL.
    """
    return get_embedding(text, model=QUERY_EMBEDDINGS_MODEL)
def compute_doc_embeddings(df):
    """
    Generate an embedding for every row of a DataFrame with the document embeddings model.

    Args:
        df (pandas.DataFrame): The DataFrame for which to generate embeddings; each
            row's 'content' value is embedded.
    Returns:
        dict: A dictionary that maps each row's index label to the embedding of that row.
    """
    embeddings = {}
    for row_label, row in df.iterrows():
        # Newlines are flattened to spaces before the text is sent for embedding.
        flattened = row.content.replace("\n", " ")
        embeddings[row_label] = get_doc_embedding(flattened)
    return embeddings
def load_embeddings(fname):
    """
    Read pre-computed document embeddings and their keys from a CSV file.

    Only needed when embeddings are loaded from disk instead of being computed.

    Args:
        fname (str): The path to the CSV file. The file must have exactly these named columns:
            "title", "heading", "0", "1", ... up to the length of the embedding vectors.
    Returns:
        dict: A dictionary that maps (title, heading) tuples to the embedding vector of
        that row, given as a list of the values in columns "0", "1", ... in order.
    """
    frame = pd.read_csv(fname, header=0)
    # Every column other than the two key columns is named after a vector position.
    dimensions = [int(col) for col in frame.columns if col not in ("title", "heading")]
    vector_columns = [str(i) for i in range(max(dimensions) + 1)]

    embeddings = {}
    for _, row in frame.iterrows():
        embeddings[(row.title, row.heading)] = [row[col] for col in vector_columns]
    return embeddings
def vector_similarity(x, y):
    """
    Score how similar two vectors are by taking their dot product.

    Args:
        x (iterable): The first vector.
        y (iterable): The second vector.
    Returns:
        The dot product of the two vectors, as computed by numpy.dot.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)
def order_document_sections_by_query_similarity(query, contexts):
    """
    Rank the pre-calculated document embeddings by their similarity to a query.

    The query is embedded with the query embeddings model and compared against
    every document embedding using vector_similarity.

    Args:
        query (str): The query for which to find relevant document sections.
        contexts (dict): A dictionary mapping each document section's index to its embedding.
    Returns:
        list: A list of (similarity, index) tuples, one per document section, sorted in
        descending order of relevance.
    """
    query_embedding = get_query_embedding(query)
    scored_sections = []
    for doc_index, doc_embedding in contexts.items():
        score = vector_similarity(query_embedding, doc_embedding)
        scored_sections.append((score, doc_index))
    # Tuples sort by similarity first, so reversing puts the best match in front.
    scored_sections.sort(reverse=True)
    return scored_sections
def construct_prompt(question, context_embeddings, df):
    """
    Construct a prompt for answering a question using the most relevant document sections.

    Sections are taken in descending order of similarity to the question. Selection
    stops at the first section that would push the running token total (section
    tokens plus separator tokens) above MAX_SECTION_LEN; later sections are not
    considered even if they are shorter.

    Args:
        question (str): The question to answer.
        context_embeddings (dict): A dictionary mapping each document section's index
            to its embedding.
        df (pandas.DataFrame): A DataFrame containing the document sections, with
            'content' and 'tokens' columns and an index matching the keys of
            `context_embeddings`.
    Returns:
        str: The prompt, including the instruction header, the relevant context and the question.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    for _, section_index in ranked_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "Sorry, I don't know."\n\nContext:\n"""
    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"
def answer_query_with_context(
        query,
        df,
        document_embeddings,
        show_prompt: bool = False):
    """
    Answer a query using relevant context from a DataFrame.

    Args:
        query (str): The query to answer.
        df (pandas.DataFrame): A DataFrame containing the document sections.
        document_embeddings (dict): A dictionary mapping each document section's index
            to its embedding.
        show_prompt (bool, optional): If `True`, print the prompt before generating a response.
    Returns:
        str: The generated response to the query, with surrounding spaces and newlines stripped.
    """
    # The docstring must be the first statement of the function; placed after the
    # prompt construction it was a discarded string and __doc__ stayed None.
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )
    if show_prompt:
        print(prompt)
    response = openai.Completion.create(
        prompt=prompt,
        **COMPLETIONS_API_PARAMS
    )
    return response["choices"][0]["text"].strip(" \n")
| ######################### MAIN PROGRAM ######################################### | |
# Load the document text into a DataFrame, one row per non-empty paragraph.
df = load_text(filepath)

# Count the tokens of every paragraph, then cap over-long paragraphs.
df['tokens'] = df['content'].apply(count_tokens)
df = truncate_text(df)

# Use the line below only if importing embeddings from file, rather than creating
# them in real time through the OpenAI API (emb_filepath must then be defined).
# document_embeddings = load_embeddings(emb_filepath)

# Only the first paragraphs are embedded in real time via the OpenAI API; lower this
# (e.g. to 10) if you run into the limit on free-of-charge usage.
MAX_EMBEDDED_PARAGRAPHS = 33
document_embeddings = compute_doc_embeddings(df[:MAX_EMBEDDED_PARAGRAPHS])

if USE_INTERFACE:
    demo = gr.Interface(
        fn=lambda query: answer_query_with_context(query, df, document_embeddings),
        inputs=gr.Textbox(lines=2, label="Query", placeholder="Type Question Here..."),
        outputs=gr.Textbox(lines=2, label="Answer"),
        description=(
            "Example of a domain-specific chatbot, using ChatGPT with supplemental content added.<br>"
            "Here, the content relates to the investment outlook for 2023, according to Morgan Stanley, JPMorgan and Goldman Sachs.<br>"
            "Sample queries: What is Goldman's outlook for inflation? What about the bond market? What does JPMorgan think about 2023?<br>"
            "NOTE: High-level demo only. Supplemental content used here limited to about 30 paragraphs, due to limits on free-of-charge usage of ChatGPT.<br>"
            "More robust domain-specific responses are possible."
        ),
        title="Domain-Specific Chatbot",
    )
    # Launch the interface
    demo.launch()
else:
    # Without the interface, answer one pre-supplied question and show the result.
    # The answer was previously computed and discarded, and an unused prompt for a
    # different question cost an extra embedding request.
    answer = answer_query_with_context("What is Goldman's outlook for inflation?", df, document_embeddings)
    print(answer)