# BuildPlay / slack_processing/slack_data_prep.py
# Author: Jay Patel — initial commit (7a479d7)
import pandas as pd
import numpy as np
import openai, os, tiktoken, json
from datetime import datetime as dt
import re
from openai.embeddings_utils import cosine_similarity, get_embedding
from sklearn.metrics import classification_report, PrecisionRecallDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as coso
from collections import Counter
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from utilities.api_keys import APIKeys
from utilities.unique_queue import UniqueQueue
from slack_processing.theme import Theme
# OpenAI credentials are resolved at import time from the project key store.
openai.api_key = APIKeys().get_key('OPENAI_API_KEY')
# Set embedding model parameters
# Show full DataFrames when printing (debug-friendly; noisy on large frames).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
SAMPLE_SIZE = 500  # cap on number of recent posts to embed
MAX_TOKENS = 100  # posts whose theme exceeds this token count are skipped
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # tiktoken encoding matching the embedding model
SIMILARITY_THRESHOLD = 0.6  # minimum score to map a new topic onto a known one
# Input/output file locations (relative to repo root).
INPUT_PATH = "slack_processing/data/slack.json"
OUTPUT_THEME_PATH = "slack_processing/data/themes.json"
OUTPUT_UNKNOWN_THEME_PATH = "slack_processing/data/unknown_themes.json"
OUTPUT_THEME_EMBEDDINGS_PATH = "slack_processing/data/slack_with_theme_embeddings.json"
TOPIC_TEXT_PATH="slack_processing/data/topics.txt"
TOPIC_TEXT_OUTPUT_PATH="slack_processing/data/topics_with_synonyms.txt"
# Chat-completion settings for the three classification rounds in
# ProcessMessage: round 1 (match known topics) runs coolest, round 2
# (free-form summary) warmer, round 3 ("be creative") warmest.
TOPIC_TOKENS=50
TOPIC_MODEL ="gpt-3.5-turbo"
SYSTEM_MESSAGE="You are categorizing slack messages into 1-2 word topics."
NUM_RESULTS=5
TEMP=.15
TOP_P=.15
NUM_RESULTS1=5
TEMP1=.35
TOP_P1=.35
TOPIC_TOKENS2=50
NUM_RESULTS2=5
TEMP2=.65
TOP_P2=.65
# Module-level mutable state shared by the pipeline functions below.
df=pd.DataFrame()        # working DataFrame (reassigned by ProcessSlack/CreateEmbeddings)
themes = []              # theme dicts successfully classified
unknown_themes=[]        # theme dicts that could not be classified
game_topics = UniqueQueue()  # known topic vocabulary (canonicals + synonyms)
def InitializeTopics():
    """Load the topic vocabulary from TOPIC_TEXT_PATH into `game_topics`.

    Each line of the file is CSV-shaped: a canonical topic followed by
    optional synonyms.  Everything is lower-cased and whitespace-trimmed
    before being enqueued.  The loaded topics are printed for visibility.
    """
    global game_topics
    with open(TOPIC_TEXT_PATH, 'r') as file:
        for line in file:
            parts = [piece.strip() for piece in line.strip().lower().split(',')]
            canonical, aliases = parts[0], parts[1:]
            game_topics.enqueue(canonical, aliases)
    # Debug dump of the loaded vocabulary.
    # NOTE: reaches into UniqueQueue's private `_queue` attribute.
    for topic in game_topics._queue.queue:
        print(topic)
def CleanMessage(message):
    """Normalize a Slack message for topic classification.

    Unwraps `:emoji:` shortcodes to their bare name, removes http(s) and
    www URLs, and strips the punctuation characters `:,.!?`.

    Fix: the original URL pattern was r'www.\S+' — the unescaped dot acted
    as a wildcard, so any token beginning with "www" (e.g. "wwwhat") was
    deleted.  The dot is now escaped to match literal "www." prefixes only.
    """
    cleaned_message = re.sub(r':(\w+):', r'\1', message)
    cleaned_message = re.sub(r'http\S+|www\.\S+', '', cleaned_message)
    custom_punctuation = ':,.!?'
    translator = str.maketrans('', '', custom_punctuation)
    return cleaned_message.translate(translator)
def TruncateWords(topic, count):
    """Return at most the first `count` words of `topic`, title-cased."""
    kept = topic.split()[:count]
    return " ".join(kept).title()
def WriteThemes():
    """Persist the accumulated classified themes to OUTPUT_THEME_PATH as JSON.

    `themes` already holds plain dicts (built via Theme.to_dict()), so it is
    dumped directly — the original `[theme for theme in themes]` was a
    redundant shallow copy.
    """
    global themes
    with open(OUTPUT_THEME_PATH, "w") as json_file:
        json.dump(themes, json_file, indent=4)
def WriteUnknownThemes():
    """Persist unclassifiable themes to OUTPUT_UNKNOWN_THEME_PATH as JSON.

    `unknown_themes` already holds plain dicts, so it is dumped directly —
    the original list comprehension was a redundant shallow copy.
    """
    global unknown_themes
    with open(OUTPUT_UNKNOWN_THEME_PATH, "w") as json_file:
        json.dump(unknown_themes, json_file, indent=4)
def WriteTopics():
    """Write the current topic vocabulary to TOPIC_TEXT_OUTPUT_PATH.

    Dumps str(game_topics.all_words()) — i.e. the repr of the word list —
    to the text file.  Removed a leftover debug `print(dir(game_topics))`.
    """
    global game_topics
    with open(TOPIC_TEXT_OUTPUT_PATH, "w") as text_file:
        text_file.write(str(game_topics.all_words()))
def ProcessDateTime(date_time):
    """Format a datetime as e.g. '2023-07-11 9:21 AM'.

    Bug fix: the original overwrote its argument with a hard-coded
    dt(2023, 7, 11, 9, 21) — an apparent debug leftover — so every row got
    the same timestamp.  The incoming value is now actually formatted.
    Also replaces '%-I' (glibc-only, fails on Windows) with a portable
    leading-zero strip.

    Accepts a datetime-like object with strftime, or an ISO-8601 string
    (presumably what the JSON export carries — TODO confirm against
    slack.json).
    """
    if isinstance(date_time, str):
        date_time = dt.fromisoformat(date_time)
    hour = date_time.strftime("%I").lstrip("0")
    return f"{date_time.strftime('%Y-%m-%d')} {hour}:{date_time.strftime('%M %p')}"
def CompletionEngine(sys_message, user_message, num_tokens, num_results, temperature, topic_model, top_p):
    """Thin wrapper around openai.ChatCompletion.create.

    Sends a system + user message pair and returns the raw completion
    response with `num_results` choices.
    """
    chat_messages = [
        {"role": "system", "content": sys_message},
        {"role": "user", "content": user_message},
    ]
    return openai.ChatCompletion.create(
        model=topic_model,
        messages=chat_messages,
        max_tokens=num_tokens,
        n=num_results,
        temperature=temperature,
        stop=None,
        top_p=top_p,
    )
def ConcatenateMatchAndCanonicals(message):
    """Build the round-1 prompt: pick a topic from the known vocabulary."""
    global game_topics
    game_topics_str = ', '.join(game_topics.all_words())
    print("*** game_topics_str: ", game_topics_str)
    prompt_message = (
        f"Find a topic that represents this message '{message}' from this set of topics "
        f"{{{game_topics_str}}}. Your reply should be the topic. If you're not able to find a match, reply 'Unknown'"
    )
    print("*** prompt_message for first round is: ", prompt_message)
    return prompt_message
def ConcatenateMessageAndCanonicals(message):
    """Build the round-2 prompt: free-form 1-2 word summary of the message."""
    prompt_message = (
        f"Summarize this message '{message}' in 1-2 words. "
        "We're looking for a representative category to cluster messages. "
        "Identify subject or activity. "
        "Your reply should be one or two words representing the topic. "
        "If you're not able to summarize, reply 'Unknown'"
    )
    print("*** prompt_message for second round is: ", prompt_message)
    return prompt_message
def ConcatenateMessageAndTopics(message):
    """Build the round-3 prompt: highest-temperature 'be creative' summary."""
    prompt_message = (
        f"Be creative. We need 1-2 word summarization for this message: '{message}'. "
        "If you aren't able to summarize, Identify the subject or direct object. "
        "Your reply should be one or two words representing the topic. "
        "As an absolute last resort, reply 'Unknown'"
    )
    print("*** prompt_message for third round is: ", prompt_message)
    return prompt_message
def ProcessMessageWrapper(datetime, message, replies, person, id):
    """Classify one message, persist the resulting theme, and return it.

    Routes the Theme into either `themes` or `unknown_themes` (a theme with
    name 'Unknown' or zero similarity counts as unknown) and rewrites the
    corresponding JSON file immediately so progress survives a crash.
    """
    global themes, unknown_themes
    theme = ProcessMessage(datetime, message, replies, person, id)
    print(f"Theme id: {id}, theme:{theme.theme}, modifier:{theme.modifier}, person:{theme.person}, message:{theme.message}, similarity:{theme.themeSimilarity}")
    unresolved = theme.theme == 'Unknown' or theme.themeSimilarity == 0
    if unresolved:
        unknown_themes.append(theme.to_dict())
        WriteUnknownThemes()
    else:
        themes.append(theme.to_dict())
        WriteThemes()
    return theme
# Core three-round topic classification for a single Slack message.
def ProcessMessage(datetime, message, replies, person, id):
    """Classify `message` into a 1-2 word theme via up to three LLM rounds.

    Round 1 asks the model to pick a topic from the known vocabulary
    (`game_topics`); if every choice comes back 'Unknown', round 2 asks for
    a free-form 1-2 word summary, and round 3 retries with a higher
    temperature "be creative" prompt.  Free-form answers are reconciled
    against the vocabulary via WordNet / TF-IDF similarity; a sufficiently
    novel topic is enqueued into `game_topics`.  Returns a Theme object.

    NOTE(review): the `replies` parameter is accepted but never used.
    """
    global game_topics
    # topMatch stays True only when round 1 found a direct vocabulary match.
    topMatch = True
    #round 1, look for exact match
    completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMatchAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS, TEMP, TOPIC_MODEL,TOP_P)
    # Flatten all n completion choices, splitting on newlines and commas;
    # truncate each candidate to 3 title-cased words and de-duplicate.
    options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
    if all(choice == "Unknown" for choice in options):
        #round 2, look for 1-2 summary, like topics
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS1, TEMP1, TOPIC_MODEL,TOP_P1)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    if all(choice == "Unknown" for choice in options):
        #round 3, look for 1-2 summary, wild card
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndTopics(CleanMessage(message)),TOPIC_TOKENS2, NUM_RESULTS2, TEMP2, TOPIC_MODEL,TOP_P2)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    print("---options: ", options, " topMatch: ",topMatch)
    if not topMatch:
        # Free-form candidates: score each non-Unknown option against every
        # canonical topic with WordNet-based similarity.
        similarity_scores = []          # one row of scores per non-Unknown option
        generated_topics_indices = []   # maps rows back to indices in `options`
        counter=0
        exact_match=False
        most_similar_topic = "Unknown"
        unidentified_topic_count=0      # NOTE(review): incremented but never read
        theme_obj=None
        for generated_topic in options:
            if generated_topic != "Unknown":
                generated_topics_indices.append(counter)
                generated_tokens = word_tokenize(generated_topic)
                generated_tokens_str = " ".join(generated_tokens)
                topic_similarities = []
                for reference_topic in game_topics.all_canonicals():
                    reference_tokens = word_tokenize(reference_topic)
                    reference_tokens_str = " ".join(reference_tokens)
                    similarity_score, exact_match = ComputeSimilarity(generated_tokens_str, reference_tokens_str)
                    if exact_match:
                        # Token-set match with a canonical topic: stop searching.
                        most_similar_topic = reference_topic
                        most_similar_score = 1.0
                        break
                    topic_similarities.append(similarity_score)
                if exact_match:
                    break
                similarity_scores.append(topic_similarities)
            else:
                unidentified_topic_count+=1
            counter+=1
        if len(similarity_scores) > 0 and not exact_match:
            most_similar_score=0
            # Aggregate the similarity scores for each generated topic
            similarity_scores = np.array(similarity_scores)
            aggregated_scores = np.sum(similarity_scores, axis=1)
            most_similar_index = np.argmax(aggregated_scores)
            most_similar_topic_index = generated_topics_indices[most_similar_index]
            most_similar_topic = options[most_similar_topic_index].lower()
            # NOTE(review): this assigns a whole ROW of the 2-D score matrix,
            # not a scalar — `aggregated_scores[most_similar_index]` (or its
            # max) was probably intended; the row is later str()-ed into Theme.
            most_similar_score=similarity_scores[most_similar_index]
        if most_similar_topic != "Unknown":
            #check if it's in all topics
            if most_similar_topic in game_topics.all_words():
                # Already in the vocabulary: resolve synonyms to canonicals.
                if most_similar_topic in game_topics.all_synonyms():
                    most_similar_topic = game_topics.canonical_for_synonym(most_similar_topic)
                else:
                    most_similar_topic = game_topics.get_canonical(most_similar_topic)
                most_similar_score=1.0
            else:
                #not in all words, look for similar topics, see if it's like something in list
                highest_similarity = 0
                best_match = None
                for known_word in game_topics.all_words():
                    #compute similarity against all topics
                    similarity_score = float(CompareTopicToGameTopic(most_similar_topic, known_word))
                    print("\tsimilarity_score: "+ str(similarity_score)+ " for known_word: "+ known_word)
                    if similarity_score > highest_similarity:
                        highest_similarity = similarity_score
                        best_match = known_word.lower()
                print("\t>>>best_match found :", best_match, " with highest_similarity:", str(highest_similarity))
                print("if we found similar topic, use it")
                if highest_similarity > SIMILARITY_THRESHOLD:
                    # Close enough to an existing word: adopt its canonical form.
                    if(best_match in game_topics.all_synonyms()):
                        most_similar_topic = game_topics.canonical_for_synonym(best_match)
                    else:
                        most_similar_topic = game_topics.get_canonical(best_match)
                    most_similar_score=highest_similarity
                else:
                    # Genuinely new topic: grow the vocabulary.
                    game_topics.enqueue(most_similar_topic)
                    most_similar_score=1.0
            theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=str(most_similar_score))
            print(f"{id} From message:'{message}' to theme: {most_similar_topic}")
        else:
            # Every candidate was 'Unknown': emit a zero-similarity placeholder.
            theme_obj = Theme(datetime=datetime, theme='Unknown', modifier=0, person=person, postId=id, message=message, similarity=0)
    else:
        # Round-1 vocabulary match: trust the first option as-is.
        most_similar_topic = options[0].lower()
        theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=1)
        print("\n**_*_*_* in else: theme_obj: ", theme_obj.to_dict())
    # Persist the (possibly grown) vocabulary after every message.
    WriteTopics()
    return theme_obj
def CompareTopicToGameTopic(topic, game_topic):
    """Score how alike two topic strings are, in [0, 1].

    An exact string match short-circuits to 1.0.  Otherwise the score is a
    weighted blend: 20% Jaccard overlap of lower-cased token sets plus 80%
    TF-IDF cosine similarity between the two raw strings.
    """
    if topic == game_topic:
        return 1.0
    # Jaccard overlap of token sets.
    tokens_topic = set(word_tokenize(topic.lower()))
    tokens_game_topic = set(word_tokenize(game_topic.lower()))
    overlap_score = len(tokens_topic & tokens_game_topic) / len(tokens_topic | tokens_game_topic)
    # TF-IDF cosine similarity (sklearn), reduced to a scalar.
    tfidf_matrix = TfidfVectorizer().fit_transform([topic, game_topic])
    semantic_similarity = coso(tfidf_matrix[0], tfidf_matrix[1])[0, 0]
    final_score = 0.2 * overlap_score + 0.8 * semantic_similarity
    print("Tokens topic:", tokens_topic, "tokens game topic:", tokens_game_topic, " overlap score:", overlap_score," Semantic similarity:", str(semantic_similarity), " final score: " + str(final_score))
    return final_score
def ComputeSimilarity(tokens1, tokens2):
    """Return (similarity, exact_match) between two token strings.

    If the lower-cased token SETS are equal, returns (1.0, True).
    Otherwise, for every cross pair of tokens, takes the best Wu-Palmer
    similarity over same-POS WordNet synset pairs and averages those
    best scores; pairs with no synsets or no valid score are skipped.
    Returns (0, False) when nothing was comparable.
    """
    if set(word_tokenize(tokens1.lower())) == set(word_tokenize(tokens2.lower())):
        return 1.0, True
    left_tokens = word_tokenize(tokens1)
    right_tokens = word_tokenize(tokens2)
    score_sum = 0
    pair_count = 0
    for left in left_tokens:
        left_synsets = wordnet.synsets(left)
        for right in right_tokens:
            right_synsets = wordnet.synsets(right)
            if not left_synsets or not right_synsets:
                continue
            # Best same-POS Wu-Palmer score for this token pair, if any.
            best = None
            for synset_a in left_synsets:
                for synset_b in right_synsets:
                    if synset_a.pos() != synset_b.pos():
                        continue
                    value = synset_a.wup_similarity(synset_b)
                    if isinstance(value, float) and (best is None or value > best):
                        best = value
            if best is not None:
                score_sum += best
                pair_count += 1
    if pair_count > 0:
        return score_sum / pair_count, False
    return 0, False
def FetchSlack():
    """Load the raw Slack export from INPUT_PATH as a records-oriented DataFrame."""
    return pd.read_json(INPUT_PATH, orient="records")
def ProcessReactions(reactions, id):
    """Return the emoji name of the highest-count reaction on a post.

    `reactions` is expected to be a list of dicts with 'emoji' and 'count'
    keys; anything else (NaN, None, ...) yields "".  Surrounding colons are
    stripped from the winning emoji.  Ties keep the earliest reaction, and
    reactions with count 0 never win (so an all-zero list also yields "").
    """
    if not isinstance(reactions, list):
        return ""
    top_name = ""
    top_count = 0
    for entry in reactions:
        if entry['count'] > top_count:
            top_count = entry['count']
            top_name = entry['emoji'].strip(':')
    return top_name
def ProcessSlack():
    """Produce a theme for every Slack post, caching results on disk.

    If OUTPUT_THEME_PATH already exists, the cached themes are reloaded
    into the module-level `df`.  Otherwise the topic vocabulary is
    initialized, the raw export is read, timestamps are formatted, and each
    message is classified via ProcessMessageWrapper (which also persists
    themes as it goes).  Returns df[["person", "theme", "message"]].
    """
    global df
    if os.path.exists(OUTPUT_THEME_PATH):
        # Cached run: themes were already computed and persisted.
        df = pd.read_json(OUTPUT_THEME_PATH)
    else:
        InitializeTopics()
        df = pd.read_json(INPUT_PATH)
        df = df[["person", "datetime", "message", "replies", "id"]]
        # (Top-reaction extraction via ProcessReactions is currently disabled.)
        df["datetime"] = df["datetime"].apply(ProcessDateTime)
        df["theme"] = df.apply(lambda row: ProcessMessageWrapper(row["datetime"], row["message"], row["replies"], row["person"], row["id"]), axis=1)
    return df[["person", "theme", "message"]]
def CreateEmbeddings():
    """Attach OpenAI embeddings of each post's theme, caching to disk.

    Recomputes only when OUTPUT_THEME_EMBEDDINGS_PATH is missing; otherwise
    reloads the cached JSON into the module-level `df`.  Returns
    df[["person", "theme", "message", "embedding"]].

    NOTE(review): this relies on ProcessSlack() leaving a "datetime" column
    on the module-level `df` (its return value is ignored here), and it
    embeds the THEME text rather than the original message — confirm both
    are intended.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_EMBEDDINGS_PATH):
        ProcessSlack()
        #restrict sample to 500 most recent posts and remove samples that are too long
        top_n = SAMPLE_SIZE
        # Take 2x the sample first so there is slack left after token filtering.
        df = df.sort_values("datetime").tail(top_n * 2)
        df.drop("datetime", axis=1, inplace=True)
        encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
        # omit posts that are too long to embed
        df["n_tokens"] = df.theme.apply(lambda x: len(encoding.encode(str(x))))
        df = df[df.n_tokens <= MAX_TOKENS].tail(top_n)
        # Embedding components are stringified for JSON round-tripping.
        df["embedding"] = df.theme.apply(lambda x: [str(val) for val in get_embedding(str(x), engine=EMBEDDING_MODEL)])
        df.to_json(OUTPUT_THEME_EMBEDDINGS_PATH, orient="records", lines=False)
    else:
        df = pd.read_json(OUTPUT_THEME_EMBEDDINGS_PATH)
    return df[["person", "theme", "message", "embedding"]]