# --- Imports ---------------------------------------------------------------
# Standard library
import json
import os
import re
from collections import Counter
from datetime import datetime as dt

# Third-party
import numpy as np
import openai
import pandas as pd
import tiktoken
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from openai.embeddings_utils import cosine_similarity, get_embedding
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, PrecisionRecallDisplay
from sklearn.metrics.pairwise import cosine_similarity as coso

# Project-local
from slack_processing.theme import Theme
from utilities.api_keys import APIKeys
from utilities.unique_queue import UniqueQueue

# OpenAI credentials come from the project's key store.
openai.api_key = APIKeys().get_key('OPENAI_API_KEY')

# Show full DataFrames when printing (debugging convenience).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Set embedding model parameters
SAMPLE_SIZE = 500
MAX_TOKENS = 100
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"
SIMILARITY_THRESHOLD = 0.6

# Data file locations.
INPUT_PATH = "slack_processing/data/slack.json"
OUTPUT_THEME_PATH = "slack_processing/data/themes.json"
OUTPUT_UNKNOWN_THEME_PATH = "slack_processing/data/unknown_themes.json"
OUTPUT_THEME_EMBEDDINGS_PATH = "slack_processing/data/slack_with_theme_embeddings.json"
TOPIC_TEXT_PATH = "slack_processing/data/topics.txt"
TOPIC_TEXT_OUTPUT_PATH = "slack_processing/data/topics_with_synonyms.txt"

# Chat-completion settings for the topic model.
TOPIC_TOKENS = 50
TOPIC_MODEL = "gpt-3.5-turbo"
SYSTEM_MESSAGE = "You are categorizing slack messages into 1-2 word topics."
# Per-round completion tuning: each retry round raises temperature/top_p so
# later rounds produce progressively more creative answers.
NUM_RESULTS = 5
TEMP = .15
TOP_P = .15
NUM_RESULTS1 = 5
TEMP1 = .35
TOP_P1 = .35
TOPIC_TOKENS2 = 50
NUM_RESULTS2 = 5
TEMP2 = .65
TOP_P2 = .65

# Module-level state shared by the pipeline functions below.
df = pd.DataFrame()          # working DataFrame of slack posts
themes = []                  # accumulated theme dicts that got a real topic
unknown_themes = []          # accumulated theme dicts we could not classify
game_topics = UniqueQueue()  # canonical topics and their synonyms


def InitializeTopics():
    """Load canonical topics (plus optional synonyms) from TOPIC_TEXT_PATH.

    Each input line is 'canonical, synonym1, synonym2, ...'; everything is
    lower-cased before being enqueued into the global game_topics.
    """
    global game_topics
    topics_with_synonyms = []
    with open(TOPIC_TEXT_PATH, 'r') as file:
        for line in file:
            main_topic_and_synonyms = line.strip().lower().split(',')
            main_topic = main_topic_and_synonyms[0].strip()
            synonyms = [synonym.strip() for synonym in main_topic_and_synonyms[1:]]
            topics_with_synonyms.append((main_topic, synonyms))
    for main_topic, synonyms in topics_with_synonyms:
        game_topics.enqueue(main_topic, synonyms)
    # Echo the loaded vocabulary for visibility at startup.
    for topic in game_topics._queue.queue:
        print(topic)


def CleanMessage(message):
    """Strip slack emoji markup, URLs and selected punctuation from a message."""
    # ':smile:' -> 'smile'
    cleaned_message = re.sub(r':(\w+):', r'\1', message)
    # Drop URLs. BUG FIX: the dot after 'www' was unescaped ('www.\S+'), which
    # also matched tokens like 'wwwXfoo'; escape it to match real URLs only.
    cleaned_message = re.sub(r'http\S+|www\.\S+', '', cleaned_message)
    # Remove punctuation that tends to confuse the topic model.
    custom_punctuation = ':,.!?'
    translator = str.maketrans('', '', custom_punctuation)
    return cleaned_message.translate(translator)


def TruncateWords(topic, count):
    """Keep the first `count` words of `topic`, title-cased."""
    words = topic.split()
    return " ".join(words[:count]).title()


def WriteThemes():
    """Persist the accumulated classified themes to OUTPUT_THEME_PATH."""
    global themes
    with open(OUTPUT_THEME_PATH, "w") as json_file:
        json.dump(themes, json_file, indent=4)


def WriteUnknownThemes():
    """Persist the themes we failed to classify to OUTPUT_UNKNOWN_THEME_PATH."""
    global unknown_themes
    with open(OUTPUT_UNKNOWN_THEME_PATH, "w") as json_file:
        json.dump(unknown_themes, json_file, indent=4)


def WriteTopics():
    """Persist the current topic vocabulary (canonicals + synonyms).

    (Removed a leftover debug `print(dir(game_topics))` from the original.)
    """
    global game_topics
    with open(TOPIC_TEXT_OUTPUT_PATH, "w") as text_file:
        text_file.write(str(game_topics.all_words()))


def ProcessDateTime(date_time):
    """Format a post's timestamp as 'YYYY-MM-DD H:MM AM/PM'.

    BUG FIX: the original replaced its argument with a hard-coded
    dt(2023, 7, 11, 9, 21), so every post received the same timestamp.
    """
    # Accept strings / pandas Timestamps as well as datetime objects.
    # (pd.Timestamp subclasses datetime, so it passes the isinstance check.)
    if not isinstance(date_time, dt):
        date_time = pd.to_datetime(date_time)
    # NOTE: '%-I' (hour without leading zero) is glibc-specific and will fail
    # on Windows strftime implementations.
    return date_time.strftime("%Y-%m-%d %-I:%M %p")


def CompletionEngine(sys_message, user_message, num_tokens, num_results, temperature, topic_model, top_p):
    """Call the OpenAI chat-completion API and return the raw response object."""
    return openai.ChatCompletion.create(
        model=topic_model,
        messages=[
            {"role": "system", "content": sys_message},
            {"role": "user", "content": user_message},
        ],
        max_tokens=num_tokens,
        n=num_results,
        temperature=temperature,
        stop=None,
        top_p=top_p,
    )


def ConcatenateMatchAndCanonicals(message):
    """Round-1 prompt: ask for a direct match against the known topic set."""
    global game_topics
    game_topics_str = ', '.join(game_topics.all_words())
    print("*** game_topics_str: ", game_topics_str)
    prompt_message = f"Find a topic that represents this message '{message}' from this set of topics {{{game_topics_str}}}. Your reply should be the topic. If you're not able to find a match, reply 'Unknown'"
    print("*** prompt_message for first round is: ", prompt_message)
    return prompt_message


def ConcatenateMessageAndCanonicals(message):
    """Round-2 prompt: ask for a plain 1-2 word summary of the message."""
    prompt_message = f"Summarize this message '{message}' in 1-2 words. We're looking for a representative category to cluster messages. Identify subject or activity. Your reply should be one or two words representing the topic. If you're not able to summarize, reply 'Unknown'"
    print("*** prompt_message for second round is: ", prompt_message)
    return prompt_message


def ConcatenateMessageAndTopics(message):
    """Round-3 prompt: last-resort creative 1-2 word summary."""
    prompt_message = f"Be creative. We need 1-2 word summarization for this message: '{message}'. If you aren't able to summarize, Identify the subject or direct object. Your reply should be one or two words representing the topic. As an absolute last resort, reply 'Unknown'"
    print("*** prompt_message for third round is: ", prompt_message)
    return prompt_message


def ProcessMessageWrapper(datetime, message, replies, person, id):
    """Classify one post and route the result to the themed/unknown stores.

    Both JSON outputs are rewritten after every message so progress survives
    an interrupted run.
    """
    global themes, unknown_themes
    theme = ProcessMessage(datetime, message, replies, person, id)
    print(f"Theme id: {id}, theme:{theme.theme}, modifier:{theme.modifier}, person:{theme.person}, message:{theme.message}, similarity:{theme.themeSimilarity}")
    if theme.theme == 'Unknown' or theme.themeSimilarity == 0:
        unknown_themes.append(theme.to_dict())
        WriteUnknownThemes()
    else:
        themes.append(theme.to_dict())
        WriteThemes()
    return theme


def _ParseTopicOptions(completion):
    """Flatten a chat completion into a de-duplicated list of <=3-word topics.

    Each choice may contain several topics separated by newlines and/or commas.
    """
    return list(set(
        TruncateWords(topic.strip(), 3)
        for choice in completion.choices
        for concatenated_topics in choice.message.content.strip().split("\n")
        for topic in concatenated_topics.split(',')
    ))


def ProcessMessage(datetime, message, replies, person, id):
    """Classify one slack post into a topic via up to three prompt rounds.

    Round 1 asks for an exact match against the known topic set; rounds 2 and
    3 ask for free-form 1-2 word summaries at increasing temperature.
    Free-form answers are reconciled against the known topics with WordNet /
    TF-IDF similarity, and genuinely new topics grow game_topics.

    Returns a Theme object (every branch constructs one).
    """
    global game_topics
    topMatch = True
    cleaned = CleanMessage(message)  # hoisted: cleaned once for all rounds

    # Round 1: exact match against the canonical topic set.
    completion = CompletionEngine(SYSTEM_MESSAGE, ConcatenateMatchAndCanonicals(cleaned),
                                  TOPIC_TOKENS, NUM_RESULTS, TEMP, TOPIC_MODEL, TOP_P)
    options = _ParseTopicOptions(completion)
    if all(choice == "Unknown" for choice in options):
        # Round 2: plain 1-2 word summary, like the known topics.
        completion = CompletionEngine(SYSTEM_MESSAGE, ConcatenateMessageAndCanonicals(cleaned),
                                      TOPIC_TOKENS, NUM_RESULTS1, TEMP1, TOPIC_MODEL, TOP_P1)
        options = _ParseTopicOptions(completion)
        topMatch = False
    if all(choice == "Unknown" for choice in options):
        # Round 3: creative wild-card summary.
        completion = CompletionEngine(SYSTEM_MESSAGE, ConcatenateMessageAndTopics(cleaned),
                                      TOPIC_TOKENS2, NUM_RESULTS2, TEMP2, TOPIC_MODEL, TOP_P2)
        options = _ParseTopicOptions(completion)
        topMatch = False
    print("---options: ", options, " topMatch: ", topMatch)

    if not topMatch:
        similarity_scores = []
        generated_topics_indices = []
        exact_match = False
        most_similar_topic = "Unknown"
        theme_obj = None

        # Score every generated candidate against every canonical topic,
        # stopping immediately on a token-level exact match.
        for counter, generated_topic in enumerate(options):
            if generated_topic == "Unknown":
                continue
            generated_topics_indices.append(counter)
            generated_tokens_str = " ".join(word_tokenize(generated_topic))
            topic_similarities = []
            for reference_topic in game_topics.all_canonicals():
                reference_tokens_str = " ".join(word_tokenize(reference_topic))
                similarity_score, exact_match = ComputeSimilarity(generated_tokens_str, reference_tokens_str)
                if exact_match:
                    most_similar_topic = reference_topic
                    most_similar_score = 1.0
                    break
                topic_similarities.append(similarity_score)
            if exact_match:
                break
            similarity_scores.append(topic_similarities)

        if len(similarity_scores) > 0 and not exact_match:
            # Pick the candidate whose summed similarity across all canonical
            # topics is highest.
            similarity_scores = np.array(similarity_scores)
            aggregated_scores = np.sum(similarity_scores, axis=1)
            most_similar_index = np.argmax(aggregated_scores)
            most_similar_topic_index = generated_topics_indices[most_similar_index]
            most_similar_topic = options[most_similar_topic_index].lower()
            # BUG FIX: the original assigned the whole score row
            # (similarity_scores[most_similar_index]) here, so str() later
            # serialized an array repr. Keep the best single reference
            # similarity for the chosen candidate instead.
            most_similar_score = float(np.max(similarity_scores[most_similar_index]))

        if most_similar_topic != "Unknown":
            if most_similar_topic in game_topics.all_words():
                # Known word: normalize synonyms to their canonical form.
                if most_similar_topic in game_topics.all_synonyms():
                    most_similar_topic = game_topics.canonical_for_synonym(most_similar_topic)
                else:
                    most_similar_topic = game_topics.get_canonical(most_similar_topic)
                most_similar_score = 1.0
            else:
                # Not in the vocabulary: look for the closest known topic.
                highest_similarity = 0
                best_match = None
                for known_word in game_topics.all_words():
                    similarity_score = float(CompareTopicToGameTopic(most_similar_topic, known_word))
                    print("\tsimilarity_score: " + str(similarity_score) + " for known_word: " + known_word)
                    if similarity_score > highest_similarity:
                        highest_similarity = similarity_score
                        best_match = known_word.lower()
                        print("\t>>>best_match found :", best_match, " with highest_similarity:", str(highest_similarity))
                print("if we found similar topic, use it")
                if highest_similarity > SIMILARITY_THRESHOLD:
                    # Close enough to an existing topic: adopt the match.
                    if best_match in game_topics.all_synonyms():
                        most_similar_topic = game_topics.canonical_for_synonym(best_match)
                    else:
                        most_similar_topic = game_topics.get_canonical(best_match)
                    most_similar_score = highest_similarity
                else:
                    # Genuinely new topic: grow the vocabulary.
                    game_topics.enqueue(most_similar_topic)
                    most_similar_score = 1.0
            theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0,
                              person=person, postId=id, message=message,
                              similarity=str(most_similar_score))
            print(f"{id} From message:'{message}' to theme: {most_similar_topic}")
        else:
            theme_obj = Theme(datetime=datetime, theme='Unknown', modifier=0,
                              person=person, postId=id, message=message, similarity=0)
    else:
        # Round 1 produced a direct hit; trust the first option verbatim.
        most_similar_topic = options[0].lower()
        theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0,
                          person=person, postId=id, message=message, similarity=1)
        print("\n**_*_*_* in else: theme_obj: ", theme_obj.to_dict())
    WriteTopics()
    return theme_obj


def CompareTopicToGameTopic(topic, game_topic):
    """Blend token overlap (20%) with TF-IDF cosine similarity (80%).

    Returns 1.0 on an exact string match, otherwise a score in [0, 1].
    """
    # Exact match short-circuit.
    if topic == game_topic:
        return 1.0
    # Jaccard overlap on word tokens.
    tokens_topic = set(word_tokenize(topic.lower()))
    tokens_game_topic = set(word_tokenize(game_topic.lower()))
    overlap_score = len(tokens_topic & tokens_game_topic) / len(tokens_topic | tokens_game_topic)
    # TF-IDF cosine similarity (sklearn's pairwise cosine_similarity, despite
    # the original comment's mention of OpenAI).
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([topic, game_topic])
    semantic_similarity = coso(tfidf_matrix[0], tfidf_matrix[1])[0, 0]  # scalar value
    final_score = 0.2 * overlap_score + 0.8 * semantic_similarity
    print("Tokens topic:", tokens_topic, "tokens game topic:", tokens_game_topic,
          " overlap score:", overlap_score, " Semantic similarity:", str(semantic_similarity),
          " final score: " + str(final_score))
    return final_score


def ComputeSimilarity(tokens1, tokens2):
    """Average the best WordNet Wu-Palmer similarity over all token pairs.

    Returns (score, exact_match): exact_match is True (score 1.0) when both
    strings tokenize to the same set of words.
    """
    if set(word_tokenize(tokens1.lower())) == set(word_tokenize(tokens2.lower())):
        return 1.0, True
    tokens1 = word_tokenize(tokens1)
    tokens2 = word_tokenize(tokens2)
    total_similarity = 0
    num_comparisons = 0
    for token1 in tokens1:
        for token2 in tokens2:
            token1_synsets = wordnet.synsets(token1)
            token2_synsets = wordnet.synsets(token2)
            if not token1_synsets or not token2_synsets:
                continue
            # wup_similarity is only meaningful for synsets of the same POS;
            # it may also return None, hence the isinstance filter below.
            similarity_scores = [
                synset1.wup_similarity(synset2)
                for synset1 in token1_synsets
                for synset2 in token2_synsets
                if synset1.pos() == synset2.pos()
            ]
            valid_scores = [score for score in similarity_scores if isinstance(score, float)]
            if valid_scores:
                total_similarity += max(valid_scores)
                num_comparisons += 1
    if num_comparisons > 0:
        return total_similarity / num_comparisons, False
    # BUG FIX (minor): return a float, matching the other return paths.
    return 0.0, False


def FetchSlack():
    """Read the raw slack export into a DataFrame."""
    return pd.read_json(INPUT_PATH, orient='records')


def ProcessReactions(reactions, id):
    """Return the emoji name (colons stripped) with the highest reaction count.

    Returns "" when `reactions` is not a list (e.g. NaN) or is empty.
    """
    if not isinstance(reactions, list):
        return ""
    highestcount = 0
    highestcount_reaction = ""
    for reaction in reactions:
        stripped_reaction = reaction['emoji'].strip(':')
        if reaction['count'] > highestcount:
            highestcount = reaction['count']
            highestcount_reaction = stripped_reaction
    return highestcount_reaction


def ProcessSlack():
    """Build (or load from cache) the per-post theme classification.

    When OUTPUT_THEME_PATH already exists it is used as a cache; otherwise the
    raw export is classified post-by-post via ProcessMessageWrapper.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_PATH):
        InitializeTopics()
        # Read JSON data into DataFrame
        df = pd.read_json(INPUT_PATH)
        # Keep only the columns the pipeline needs. (NOTE(review): the
        # original comment claimed rows with missing values were dropped,
        # but no dropna() is performed.)
        df = df[["person", "datetime", "message", "replies", "id"]]
        # Filter down to top reaction, then create theme.
        #df["reaction"] = df.apply(lambda row: ProcessReactions(row["reactions"], row["id"]), axis=1)
        df["datetime"] = df.apply(lambda row: ProcessDateTime(row["datetime"]), axis=1)
        df["theme"] = df.apply(lambda row: ProcessMessageWrapper(row["datetime"], row["message"], row["replies"], row["person"], row["id"]), axis=1)
    else:
        df = pd.read_json(OUTPUT_THEME_PATH)
    return df[["person", "theme", "message"]]


def CreateEmbeddings():
    """Embed each post's theme with the OpenAI embedding model, caching to disk."""
    global df
    if not os.path.exists(OUTPUT_THEME_EMBEDDINGS_PATH):
        ProcessSlack()
        # Restrict sample to the most recent posts and drop over-long themes.
        top_n = SAMPLE_SIZE
        df = df.sort_values("datetime").tail(top_n * 2)
        df.drop("datetime", axis=1, inplace=True)
        encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
        # Omit posts whose theme is too long to embed.
        df["n_tokens"] = df.theme.apply(lambda x: len(encoding.encode(str(x))))
        df = df[df.n_tokens <= MAX_TOKENS].tail(top_n)
        # NOTE(review): the embedding floats are deliberately stringified here,
        # presumably for JSON round-tripping — confirm downstream consumers
        # expect lists of strings rather than floats.
        df["embedding"] = df.theme.apply(lambda x: [str(val) for val in get_embedding(str(x), engine=EMBEDDING_MODEL)])
        df.to_json(OUTPUT_THEME_EMBEDDINGS_PATH, orient="records", lines=False)
    else:
        df = pd.read_json(OUTPUT_THEME_EMBEDDINGS_PATH)
    return df[["person", "theme", "message", "embedding"]]