# BuildPlay / slack_processing/slack_data_prep.py
# Author: Jay Patel — initial commit (7a479d7)
import pandas as pd
import numpy as np
import openai, os, tiktoken, json
from datetime import datetime as dt
import re
from openai.embeddings_utils import cosine_similarity, get_embedding
from sklearn.metrics import classification_report, PrecisionRecallDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as coso
from collections import Counter
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from utilities.api_keys import APIKeys
from utilities.unique_queue import UniqueQueue
from slack_processing.theme import Theme
# OpenAI credentials are resolved at import time from the project key store.
openai.api_key = APIKeys().get_key('OPENAI_API_KEY')
# Set embedding model parameters
# Show full DataFrames when printing (debug-friendly; noisy on large frames).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
SAMPLE_SIZE = 500  # cap on number of recent posts to embed
MAX_TOKENS = 100  # posts whose theme exceeds this token count are skipped
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # tiktoken encoding matching the embedding model
SIMILARITY_THRESHOLD = 0.6  # minimum score to map a new topic onto a known one
# Input/output file locations (relative to repo root).
INPUT_PATH = "slack_processing/data/slack.json"
OUTPUT_THEME_PATH = "slack_processing/data/themes.json"
OUTPUT_UNKNOWN_THEME_PATH = "slack_processing/data/unknown_themes.json"
OUTPUT_THEME_EMBEDDINGS_PATH = "slack_processing/data/slack_with_theme_embeddings.json"
TOPIC_TEXT_PATH="slack_processing/data/topics.txt"
TOPIC_TEXT_OUTPUT_PATH="slack_processing/data/topics_with_synonyms.txt"
# Chat-completion settings for the three classification rounds in
# ProcessMessage: round 1 (match known topics) runs coolest, round 2
# (free-form summary) warmer, round 3 ("be creative") warmest.
TOPIC_TOKENS=50
TOPIC_MODEL ="gpt-3.5-turbo"
SYSTEM_MESSAGE="You are categorizing slack messages into 1-2 word topics."
NUM_RESULTS=5
TEMP=.15
TOP_P=.15
NUM_RESULTS1=5
TEMP1=.35
TOP_P1=.35
TOPIC_TOKENS2=50
NUM_RESULTS2=5
TEMP2=.65
TOP_P2=.65
# Module-level mutable state shared by the pipeline functions below.
df=pd.DataFrame()        # working DataFrame (reassigned by ProcessSlack/CreateEmbeddings)
themes = []              # theme dicts successfully classified
unknown_themes=[]        # theme dicts that could not be classified
game_topics = UniqueQueue()  # known topic vocabulary (canonicals + synonyms)
def InitializeTopics():
    """Load the topic vocabulary from TOPIC_TEXT_PATH into `game_topics`.

    Each line of the file is CSV-shaped: a canonical topic followed by
    optional synonyms.  Everything is lower-cased and whitespace-trimmed
    before being enqueued.  The loaded topics are printed for visibility.
    """
    global game_topics
    with open(TOPIC_TEXT_PATH, 'r') as file:
        for line in file:
            parts = [piece.strip() for piece in line.strip().lower().split(',')]
            canonical, aliases = parts[0], parts[1:]
            game_topics.enqueue(canonical, aliases)
    # Debug dump of the loaded vocabulary.
    # NOTE: reaches into UniqueQueue's private `_queue` attribute.
    for topic in game_topics._queue.queue:
        print(topic)
def CleanMessage(message):
    """Normalize a Slack message for topic classification.

    Unwraps `:emoji:` shortcodes to their bare name, removes http(s) and
    www URLs, and strips the punctuation characters `:,.!?`.

    Fix: the original URL pattern was r'www.\S+' — the unescaped dot acted
    as a wildcard, so any token beginning with "www" (e.g. "wwwhat") was
    deleted.  The dot is now escaped to match literal "www." prefixes only.
    """
    cleaned_message = re.sub(r':(\w+):', r'\1', message)
    cleaned_message = re.sub(r'http\S+|www\.\S+', '', cleaned_message)
    custom_punctuation = ':,.!?'
    translator = str.maketrans('', '', custom_punctuation)
    return cleaned_message.translate(translator)
def TruncateWords(topic, count):
    """Return at most the first `count` words of `topic`, title-cased."""
    kept = topic.split()[:count]
    return " ".join(kept).title()
def WriteThemes():
    """Persist the accumulated classified themes to OUTPUT_THEME_PATH as JSON.

    `themes` already holds plain dicts (built via Theme.to_dict()), so it is
    dumped directly — the original `[theme for theme in themes]` was a
    redundant shallow copy.
    """
    global themes
    with open(OUTPUT_THEME_PATH, "w") as json_file:
        json.dump(themes, json_file, indent=4)
def WriteUnknownThemes():
    """Persist unclassifiable themes to OUTPUT_UNKNOWN_THEME_PATH as JSON.

    `unknown_themes` already holds plain dicts, so it is dumped directly —
    the original list comprehension was a redundant shallow copy.
    """
    global unknown_themes
    with open(OUTPUT_UNKNOWN_THEME_PATH, "w") as json_file:
        json.dump(unknown_themes, json_file, indent=4)
def WriteTopics():
    """Write the current topic vocabulary to TOPIC_TEXT_OUTPUT_PATH.

    Dumps str(game_topics.all_words()) — i.e. the repr of the word list —
    to the text file.  Removed a leftover debug `print(dir(game_topics))`.
    """
    global game_topics
    with open(TOPIC_TEXT_OUTPUT_PATH, "w") as text_file:
        text_file.write(str(game_topics.all_words()))
def ProcessDateTime(date_time):
    """Format a datetime as e.g. '2023-07-11 9:21 AM'.

    Bug fix: the original overwrote its argument with a hard-coded
    dt(2023, 7, 11, 9, 21) — an apparent debug leftover — so every row got
    the same timestamp.  The incoming value is now actually formatted.
    Also replaces '%-I' (glibc-only, fails on Windows) with a portable
    leading-zero strip.

    Accepts a datetime-like object with strftime, or an ISO-8601 string
    (presumably what the JSON export carries — TODO confirm against
    slack.json).
    """
    if isinstance(date_time, str):
        date_time = dt.fromisoformat(date_time)
    hour = date_time.strftime("%I").lstrip("0")
    return f"{date_time.strftime('%Y-%m-%d')} {hour}:{date_time.strftime('%M %p')}"
def CompletionEngine(sys_message, user_message, num_tokens, num_results, temperature, topic_model, top_p):
    """Thin wrapper around openai.ChatCompletion.create.

    Sends a system + user message pair and returns the raw completion
    response with `num_results` choices.
    """
    chat_messages = [
        {"role": "system", "content": sys_message},
        {"role": "user", "content": user_message},
    ]
    return openai.ChatCompletion.create(
        model=topic_model,
        messages=chat_messages,
        max_tokens=num_tokens,
        n=num_results,
        temperature=temperature,
        stop=None,
        top_p=top_p,
    )
def ConcatenateMatchAndCanonicals(message):
    """Build the round-1 prompt: pick a topic from the known vocabulary."""
    global game_topics
    game_topics_str = ', '.join(game_topics.all_words())
    print("*** game_topics_str: ", game_topics_str)
    prompt_message = (
        f"Find a topic that represents this message '{message}' from this set of topics "
        f"{{{game_topics_str}}}. Your reply should be the topic. If you're not able to find a match, reply 'Unknown'"
    )
    print("*** prompt_message for first round is: ", prompt_message)
    return prompt_message
def ConcatenateMessageAndCanonicals(message):
    """Build the round-2 prompt: free-form 1-2 word summary of the message."""
    prompt_message = (
        f"Summarize this message '{message}' in 1-2 words. "
        "We're looking for a representative category to cluster messages. "
        "Identify subject or activity. "
        "Your reply should be one or two words representing the topic. "
        "If you're not able to summarize, reply 'Unknown'"
    )
    print("*** prompt_message for second round is: ", prompt_message)
    return prompt_message
def ConcatenateMessageAndTopics(message):
    """Build the round-3 prompt: highest-temperature 'be creative' summary."""
    prompt_message = (
        f"Be creative. We need 1-2 word summarization for this message: '{message}'. "
        "If you aren't able to summarize, Identify the subject or direct object. "
        "Your reply should be one or two words representing the topic. "
        "As an absolute last resort, reply 'Unknown'"
    )
    print("*** prompt_message for third round is: ", prompt_message)
    return prompt_message
def ProcessMessageWrapper(datetime, message, replies, person, id):
    """Classify one message, persist the resulting theme, and return it.

    Routes the Theme into either `themes` or `unknown_themes` (a theme with
    name 'Unknown' or zero similarity counts as unknown) and rewrites the
    corresponding JSON file immediately so progress survives a crash.
    """
    global themes, unknown_themes
    theme = ProcessMessage(datetime, message, replies, person, id)
    print(f"Theme id: {id}, theme:{theme.theme}, modifier:{theme.modifier}, person:{theme.person}, message:{theme.message}, similarity:{theme.themeSimilarity}")
    unresolved = theme.theme == 'Unknown' or theme.themeSimilarity == 0
    if unresolved:
        unknown_themes.append(theme.to_dict())
        WriteUnknownThemes()
    else:
        themes.append(theme.to_dict())
        WriteThemes()
    return theme
# Core three-round topic classification for a single Slack message.
def ProcessMessage(datetime, message, replies, person, id):
    """Classify `message` into a 1-2 word theme via up to three LLM rounds.

    Round 1 asks the model to pick a topic from the known vocabulary
    (`game_topics`); if every choice comes back 'Unknown', round 2 asks for
    a free-form 1-2 word summary, and round 3 retries with a higher
    temperature "be creative" prompt.  Free-form answers are reconciled
    against the vocabulary via WordNet / TF-IDF similarity; a sufficiently
    novel topic is enqueued into `game_topics`.  Returns a Theme object.

    NOTE(review): the `replies` parameter is accepted but never used.
    """
    global game_topics
    # topMatch stays True only when round 1 found a direct vocabulary match.
    topMatch = True
    #round 1, look for exact match
    completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMatchAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS, TEMP, TOPIC_MODEL,TOP_P)
    # Flatten all n completion choices, splitting on newlines and commas;
    # truncate each candidate to 3 title-cased words and de-duplicate.
    options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
    if all(choice == "Unknown" for choice in options):
        #round 2, look for 1-2 summary, like topics
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS1, TEMP1, TOPIC_MODEL,TOP_P1)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    if all(choice == "Unknown" for choice in options):
        #round 3, look for 1-2 summary, wild card
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndTopics(CleanMessage(message)),TOPIC_TOKENS2, NUM_RESULTS2, TEMP2, TOPIC_MODEL,TOP_P2)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    print("---options: ", options, " topMatch: ",topMatch)
    if not topMatch:
        # Free-form candidates: score each non-Unknown option against every
        # canonical topic with WordNet-based similarity.
        similarity_scores = []          # one row of scores per non-Unknown option
        generated_topics_indices = []   # maps rows back to indices in `options`
        counter=0
        exact_match=False
        most_similar_topic = "Unknown"
        unidentified_topic_count=0      # NOTE(review): incremented but never read
        theme_obj=None
        for generated_topic in options:
            if generated_topic != "Unknown":
                generated_topics_indices.append(counter)
                generated_tokens = word_tokenize(generated_topic)
                generated_tokens_str = " ".join(generated_tokens)
                topic_similarities = []
                for reference_topic in game_topics.all_canonicals():
                    reference_tokens = word_tokenize(reference_topic)
                    reference_tokens_str = " ".join(reference_tokens)
                    similarity_score, exact_match = ComputeSimilarity(generated_tokens_str, reference_tokens_str)
                    if exact_match:
                        # Token-set match with a canonical topic: stop searching.
                        most_similar_topic = reference_topic
                        most_similar_score = 1.0
                        break
                    topic_similarities.append(similarity_score)
                if exact_match:
                    break
                similarity_scores.append(topic_similarities)
            else:
                unidentified_topic_count+=1
            counter+=1
        if len(similarity_scores) > 0 and not exact_match:
            most_similar_score=0
            # Aggregate the similarity scores for each generated topic
            similarity_scores = np.array(similarity_scores)
            aggregated_scores = np.sum(similarity_scores, axis=1)
            most_similar_index = np.argmax(aggregated_scores)
            most_similar_topic_index = generated_topics_indices[most_similar_index]
            most_similar_topic = options[most_similar_topic_index].lower()
            # NOTE(review): this assigns a whole ROW of the 2-D score matrix,
            # not a scalar — `aggregated_scores[most_similar_index]` (or its
            # max) was probably intended; the row is later str()-ed into Theme.
            most_similar_score=similarity_scores[most_similar_index]
        if most_similar_topic != "Unknown":
            #check if it's in all topics
            if most_similar_topic in game_topics.all_words():
                # Already in the vocabulary: resolve synonyms to canonicals.
                if most_similar_topic in game_topics.all_synonyms():
                    most_similar_topic = game_topics.canonical_for_synonym(most_similar_topic)
                else:
                    most_similar_topic = game_topics.get_canonical(most_similar_topic)
                most_similar_score=1.0
            else:
                #not in all words, look for similar topics, see if it's like something in list
                highest_similarity = 0
                best_match = None
                for known_word in game_topics.all_words():
                    #compute similarity against all topics
                    similarity_score = float(CompareTopicToGameTopic(most_similar_topic, known_word))
                    print("\tsimilarity_score: "+ str(similarity_score)+ " for known_word: "+ known_word)
                    if similarity_score > highest_similarity:
                        highest_similarity = similarity_score
                        best_match = known_word.lower()
                print("\t>>>best_match found :", best_match, " with highest_similarity:", str(highest_similarity))
                print("if we found similar topic, use it")
                if highest_similarity > SIMILARITY_THRESHOLD:
                    # Close enough to an existing word: adopt its canonical form.
                    if(best_match in game_topics.all_synonyms()):
                        most_similar_topic = game_topics.canonical_for_synonym(best_match)
                    else:
                        most_similar_topic = game_topics.get_canonical(best_match)
                    most_similar_score=highest_similarity
                else:
                    # Genuinely new topic: grow the vocabulary.
                    game_topics.enqueue(most_similar_topic)
                    most_similar_score=1.0
            theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=str(most_similar_score))
            print(f"{id} From message:'{message}' to theme: {most_similar_topic}")
        else:
            # Every candidate was 'Unknown': emit a zero-similarity placeholder.
            theme_obj = Theme(datetime=datetime, theme='Unknown', modifier=0, person=person, postId=id, message=message, similarity=0)
    else:
        # Round-1 vocabulary match: trust the first option as-is.
        most_similar_topic = options[0].lower()
        theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=1)
        print("\n**_*_*_* in else: theme_obj: ", theme_obj.to_dict())
    # Persist the (possibly grown) vocabulary after every message.
    WriteTopics()
    return theme_obj
def CompareTopicToGameTopic(topic, game_topic):
    """Score how alike two topic strings are, in [0, 1].

    An exact string match short-circuits to 1.0.  Otherwise the score is a
    weighted blend: 20% Jaccard overlap of lower-cased token sets plus 80%
    TF-IDF cosine similarity between the two raw strings.
    """
    if topic == game_topic:
        return 1.0
    # Jaccard overlap of token sets.
    tokens_topic = set(word_tokenize(topic.lower()))
    tokens_game_topic = set(word_tokenize(game_topic.lower()))
    overlap_score = len(tokens_topic & tokens_game_topic) / len(tokens_topic | tokens_game_topic)
    # TF-IDF cosine similarity (sklearn), reduced to a scalar.
    tfidf_matrix = TfidfVectorizer().fit_transform([topic, game_topic])
    semantic_similarity = coso(tfidf_matrix[0], tfidf_matrix[1])[0, 0]
    final_score = 0.2 * overlap_score + 0.8 * semantic_similarity
    print("Tokens topic:", tokens_topic, "tokens game topic:", tokens_game_topic, " overlap score:", overlap_score," Semantic similarity:", str(semantic_similarity), " final score: " + str(final_score))
    return final_score
def ComputeSimilarity(tokens1, tokens2):
    """Return (similarity, exact_match) between two token strings.

    If the lower-cased token SETS are equal, returns (1.0, True).
    Otherwise, for every cross pair of tokens, takes the best Wu-Palmer
    similarity over same-POS WordNet synset pairs and averages those
    best scores; pairs with no synsets or no valid score are skipped.
    Returns (0, False) when nothing was comparable.
    """
    if set(word_tokenize(tokens1.lower())) == set(word_tokenize(tokens2.lower())):
        return 1.0, True
    left_tokens = word_tokenize(tokens1)
    right_tokens = word_tokenize(tokens2)
    score_sum = 0
    pair_count = 0
    for left in left_tokens:
        left_synsets = wordnet.synsets(left)
        for right in right_tokens:
            right_synsets = wordnet.synsets(right)
            if not left_synsets or not right_synsets:
                continue
            # Best same-POS Wu-Palmer score for this token pair, if any.
            best = None
            for synset_a in left_synsets:
                for synset_b in right_synsets:
                    if synset_a.pos() != synset_b.pos():
                        continue
                    value = synset_a.wup_similarity(synset_b)
                    if isinstance(value, float) and (best is None or value > best):
                        best = value
            if best is not None:
                score_sum += best
                pair_count += 1
    if pair_count > 0:
        return score_sum / pair_count, False
    return 0, False
def FetchSlack():
    """Load the raw Slack export from INPUT_PATH as a records-oriented DataFrame."""
    return pd.read_json(INPUT_PATH, orient="records")
def ProcessReactions(reactions, id):
    """Return the emoji name of the highest-count reaction on a post.

    `reactions` is expected to be a list of dicts with 'emoji' and 'count'
    keys; anything else (NaN, None, ...) yields "".  Surrounding colons are
    stripped from the winning emoji.  Ties keep the earliest reaction, and
    reactions with count 0 never win (so an all-zero list also yields "").
    """
    if not isinstance(reactions, list):
        return ""
    top_name = ""
    top_count = 0
    for entry in reactions:
        if entry['count'] > top_count:
            top_count = entry['count']
            top_name = entry['emoji'].strip(':')
    return top_name
def ProcessSlack():
    """Produce a theme for every Slack post, caching results on disk.

    If OUTPUT_THEME_PATH already exists, the cached themes are reloaded
    into the module-level `df`.  Otherwise the topic vocabulary is
    initialized, the raw export is read, timestamps are formatted, and each
    message is classified via ProcessMessageWrapper (which also persists
    themes as it goes).  Returns df[["person", "theme", "message"]].
    """
    global df
    if os.path.exists(OUTPUT_THEME_PATH):
        # Cached run: themes were already computed and persisted.
        df = pd.read_json(OUTPUT_THEME_PATH)
    else:
        InitializeTopics()
        df = pd.read_json(INPUT_PATH)
        df = df[["person", "datetime", "message", "replies", "id"]]
        # (Top-reaction extraction via ProcessReactions is currently disabled.)
        df["datetime"] = df["datetime"].apply(ProcessDateTime)
        df["theme"] = df.apply(lambda row: ProcessMessageWrapper(row["datetime"], row["message"], row["replies"], row["person"], row["id"]), axis=1)
    return df[["person", "theme", "message"]]
def CreateEmbeddings():
    """Attach OpenAI embeddings of each post's theme, caching to disk.

    Recomputes only when OUTPUT_THEME_EMBEDDINGS_PATH is missing; otherwise
    reloads the cached JSON into the module-level `df`.  Returns
    df[["person", "theme", "message", "embedding"]].

    NOTE(review): this relies on ProcessSlack() leaving a "datetime" column
    on the module-level `df` (its return value is ignored here), and it
    embeds the THEME text rather than the original message — confirm both
    are intended.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_EMBEDDINGS_PATH):
        ProcessSlack()
        #restrict sample to 500 most recent posts and remove samples that are too long
        top_n = SAMPLE_SIZE
        # Take 2x the sample first so there is slack left after token filtering.
        df = df.sort_values("datetime").tail(top_n * 2)
        df.drop("datetime", axis=1, inplace=True)
        encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
        # omit posts that are too long to embed
        df["n_tokens"] = df.theme.apply(lambda x: len(encoding.encode(str(x))))
        df = df[df.n_tokens <= MAX_TOKENS].tail(top_n)
        # Embedding components are stringified for JSON round-tripping.
        df["embedding"] = df.theme.apply(lambda x: [str(val) for val in get_embedding(str(x), engine=EMBEDDING_MODEL)])
        df.to_json(OUTPUT_THEME_EMBEDDINGS_PATH, orient="records", lines=False)
    else:
        df = pd.read_json(OUTPUT_THEME_EMBEDDINGS_PATH)
    return df[["person", "theme", "message", "embedding"]]