Spaces:
Build error
Build error
Create functions.py
Browse files- functions.py +103 -0
functions.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Retrieval-augmented Q&A helpers: embed a CSV of service descriptions once,
# then answer user questions against the best-matching service entry.
import openai
import numpy as np
import pandas as pd
import os

# Set your OpenAI API key
# NOTE(review): this file uses the pre-1.0 `openai` SDK surface (module-level
# api_key, openai.Embedding.create / openai.Completion.create below) — confirm
# the pinned library version matches.
openai.api_key = os.getenv("OPENAI_API_KEY")

COMPLETIONS_MODEL = "gpt-3.5-turbo-instruct" # "text-davinci-003" used earlier is deprecated.
# EMBEDDING_MODEL = "text-embedding-3-small" #"text-embedding-ada-002"
EMBEDDING_MODEL = "text-embedding-3-large"

# Shared keyword arguments for every completion request made in this module.
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}

# Knowledge base, loaded at import time. Downstream code reads the
# "description" and "link" columns and looks rows up by the "service" index.
df = pd.read_csv('services-links.csv')
df = df.set_index("service")
+
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """Return the embedding vector for *text* from the OpenAI Embeddings API."""
    response = openai.Embedding.create(
        model=model,
        input=text
    )
    # A single input string yields a single item in the "data" list.
    first_item = response["data"][0]
    return first_item["embedding"]
+
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Return the similarity between two vectors.

    OpenAI embeddings are normalized to unit length, so the dot product is
    equal to the cosine similarity.
    """
    # For 1-D operands, `@` is exactly the dot product.
    return np.asarray(x) @ np.asarray(y)
+
def select_document_section_by_query_similarity(query: str, contexts: dict[tuple[str, str], np.ndarray]) -> tuple[float, tuple[str, str]]:
    """
    Embed the supplied query and compare it against all of the pre-calculated
    document embeddings in ``contexts``.

    Returns the single best-matching ``(similarity, document_index)`` pair —
    i.e. the head of a relevance-descending ordering of the documents.
    (The previous docstring claimed a full sorted list was returned; only the
    top pair ever was, and the annotation now reflects that.)

    Raises ValueError if ``contexts`` is empty.
    """
    query_embedding = get_embedding(query)

    # max() over (similarity, index) tuples equals the first element of a
    # reverse-sorted list, but costs O(n) instead of O(n log n).
    return max(
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    )
+
def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    Return a dictionary mapping each row index to the embedding of that row's
    ``description`` column. Makes one API call per row.
    """
    embeddings = {}
    for row_index, row in df.iterrows():
        embeddings[row_index] = get_embedding(row.description)
    return embeddings
+
# Embed every service description once at import time.
# NOTE(review): this issues one API call per CSV row on module import —
# consider caching the vectors to disk if the catalogue grows.
document_embeddings = compute_doc_embeddings(df)
+
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> tuple[str, str]:
    """
    Fetch the service entry most relevant to *question* and build the
    completion prompt from it.

    Returns a ``(prompt, link)`` pair: the prompt instructs the model to
    answer only from the chosen service's description (falling back to a
    fixed "could not find an answer" message), and ``link`` is that
    service's catalogue URL.
    """
    _ , chosen_service = select_document_section_by_query_similarity(question, context_embeddings)

    # Newlines inside the stored description would break the prompt layout.
    service_description = df.loc[chosen_service].description.replace("\n", " ")
    header = "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "
    message = "I could not find an answer to your question, please reach out to Helpdesk."
    link = df.loc[chosen_service].link
    return header + message + "\n* " + "\n\nContext:\n" + service_description + "\n\n Q: " + question + "\n A:", link
+
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[(str, str), np.array],
    show_prompt: bool = False
) -> str:
    """
    Answer *query* using the most relevant service entry as context.

    Builds the prompt from the best-matching document, sends it to the
    completions endpoint, and appends a fixed Helpdesk footer plus the
    service-catalogue link to the model's reply.
    """
    prompt, link = construct_prompt(
        query,
        document_embeddings,
        df
    )

    if show_prompt:
        print(prompt)

    completion = openai.Completion.create(
        prompt=prompt,
        **COMPLETIONS_API_PARAMS
    )

    # Fixed footer appended to every answer: catalogue link + contact details.
    footer = "".join([
        "\n\nPlease check out the relevant HMC service catalogue for more details: " + link,
        """\n\nIf not satisfied with the answer, please email helpdesk@hmc.edu, call 909.607.7777 or visit the Helpdesk located on the Sprague first floor. """,
        """Helpdesk representatives are also available for a remote chat session during normal hours on Monday - Friday, 8:00 AM - 5:00 PM PST via https://helpdesk.hmc.edu""",
    ])

    return completion["choices"][0]["text"] + footer