# asklethain / lib /utils.py
# avysotsky's picture
# remove api keys
# 4270957
import os
from pathlib import Path
import openai
import tiktoken
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity
# Tokenizer used to count how many tokens a piece of text occupies.
encoding_name = "p50k_base"
encoding = tiktoken.get_encoding(encoding_name)

# Name of the embedding model used for query embeddings.
embedding_model = "text-embedding-ada-002"

# The API key comes from the environment; it is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Load the dataset from <this file's directory>/../data/lethain.pkl, independent
# of the process working directory. The functions below read its `embeddings`
# and `content` columns.
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
df = pd.read_pickle(Path(__file__).resolve().parent.parent / "data" / "lethain.pkl")
def search_reviews(df, query):
    """Rank the rows of *df* by embedding similarity to *query*.

    Args:
        df: DataFrame with an ``embeddings`` column holding one embedding
            per row.
        query: Text to embed and compare against every row.

    Returns:
        A new DataFrame containing all rows of *df* plus a ``similarity``
        column, sorted from most to least similar. *df* itself is left
        untouched, so repeated queries against a shared DataFrame do not
        overwrite each other's scores.
    """
    # Same model name as the module-level `embedding_model`; keep them in sync.
    query_embedding = get_embedding(query, engine="text-embedding-ada-002")

    similarity = df.embeddings.apply(
        lambda row_embedding: cosine_similarity(row_embedding, query_embedding)
    )

    # `assign` returns a copy, so the caller's DataFrame is not mutated.
    return df.assign(similarity=similarity).sort_values(
        "similarity", ascending=False
    )
def construct_prompt(question: str, df: pd.DataFrame) -> str:
    """Build the completion prompt for *question* from the most relevant rows.

    Rows of *df* are ranked with ``search_reviews`` and their ``content`` is
    appended as bulleted context, best match first, until the token budget
    is used up. The chosen row indexes are printed as diagnostics.

    Args:
        question: The user's question; also used as the search query.
        df: DataFrame with ``content`` and ``embeddings`` columns.

    Returns:
        The prompt text: header, selected context sections, then the
        question followed by an open ``A:`` for the model to complete.
    """
    MAX_SECTION_LEN = 500
    SEPARATOR = "\n* "
    separator_len = len(encoding.encode(SEPARATOR))

    ranked = search_reviews(df, question)

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    for section_index, content in ranked["content"].items():
        # Each section costs its own tokens plus the separator that prefixes
        # it; counting both keeps the context within the budget.
        chosen_sections_len += len(encoding.encode(content)) + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        # Flatten newlines so every section stays a single bullet.
        chosen_sections.append(SEPARATOR + content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))

    # NOTE(review): "You name is" looks like a typo for "Your name is"; left
    # unchanged because editing it alters the prompt sent to the model.
    header = """You name is Will Larson, you are CTO at Calm and a blogger about engineering leadership. Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"
def ask(question):
    """Answer *question* using the module-level dataset as context.

    A prompt is assembled by ``construct_prompt`` from the module-level
    ``df`` and sent to the OpenAI completion endpoint with temperature 0
    and a 300-token limit.

    Returns:
        The text of the first completion choice.
    """
    completion = openai.Completion.create(
        model="text-davinci-003",
        prompt=construct_prompt(question, df),
        max_tokens=300,
        temperature=0,
    )
    first_choice = completion["choices"][0]
    return first_choice["text"]