# NOTE(review): lines "Spaces:" / "Build error" were export artifacts, not part
# of this module; removed so the file parses as Python.
| import pandas as pd | |
| import numpy as np | |
| import openai, os, tiktoken, json | |
| from datetime import datetime as dt | |
| import re | |
| from openai.embeddings_utils import cosine_similarity, get_embedding | |
| from sklearn.metrics import classification_report, PrecisionRecallDisplay | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity as coso | |
| from collections import Counter | |
| from nltk.corpus import wordnet | |
| from nltk.tokenize import word_tokenize | |
| from utilities.api_keys import APIKeys | |
| from utilities.unique_queue import UniqueQueue | |
| from slack_processing.theme import Theme | |
# --- API key and pandas display options --------------------------------------
openai.api_key = APIKeys().get_key('OPENAI_API_KEY')
# Set embedding model parameters
pd.set_option('display.max_rows', None)      # never truncate printed DataFrames
pd.set_option('display.max_columns', None)
SAMPLE_SIZE = 500                 # cap on most-recent posts kept for embedding
MAX_TOKENS = 100                  # themes longer than this many tokens are omitted
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"
SIMILARITY_THRESHOLD = 0.6        # min score to map a new topic onto an existing one
# File locations: raw Slack export input, and cached outputs.
INPUT_PATH = "slack_processing/data/slack.json"
OUTPUT_THEME_PATH = "slack_processing/data/themes.json"
OUTPUT_UNKNOWN_THEME_PATH = "slack_processing/data/unknown_themes.json"
OUTPUT_THEME_EMBEDDINGS_PATH = "slack_processing/data/slack_with_theme_embeddings.json"
TOPIC_TEXT_PATH="slack_processing/data/topics.txt"
TOPIC_TEXT_OUTPUT_PATH="slack_processing/data/topics_with_synonyms.txt"
# ChatCompletion knobs for the three classification rounds; temperature/top_p
# escalate each round (round 1 conservative match, round 3 most creative).
TOPIC_TOKENS=50
TOPIC_MODEL ="gpt-3.5-turbo"
SYSTEM_MESSAGE="You are categorizing slack messages into 1-2 word topics."
NUM_RESULTS=5     # round 1
TEMP=.15
TOP_P=.15
NUM_RESULTS1=5    # round 2
TEMP1=.35
TOP_P1=.35
TOPIC_TOKENS2=50  # round 3
NUM_RESULTS2=5
TEMP2=.65
TOP_P2=.65
# Module-level mutable state shared by the processing functions below.
df=pd.DataFrame()            # working DataFrame
themes = []                  # successfully classified theme dicts
unknown_themes=[]            # dicts for messages that could not be classified
game_topics = UniqueQueue()  # topic vocabulary (canonical topics + synonyms)
def InitializeTopics():
    """Seed the global topic queue from TOPIC_TEXT_PATH.

    Each line of the file is `canonical, synonym, synonym, ...`; everything is
    lower-cased and whitespace-trimmed before being enqueued.
    """
    global game_topics
    parsed = []
    with open(TOPIC_TEXT_PATH, 'r') as file:
        for raw_line in file:
            pieces = [piece.strip() for piece in raw_line.strip().lower().split(',')]
            parsed.append((pieces[0], pieces[1:]))
    for canonical, synonym_list in parsed:
        game_topics.enqueue(canonical, synonym_list)
    # Debug dump of the loaded vocabulary.
    for topic in game_topics._queue.queue:
        print(topic)
def CleanMessage(message):
    """Normalize a Slack message before prompting.

    - unwraps ``:emoji:`` shortcodes to their bare name
    - strips http(s) and ``www.`` URLs
    - removes a small set of punctuation characters (``:,.!?``)
    """
    cleaned_message = re.sub(r':(\w+):', r'\1', message)
    # BUG FIX: the dot in `www.` was unescaped, so any word starting with
    # "www" plus one more character (e.g. "wwwave") was deleted as a URL.
    # Escape it so only literal "www." prefixes match.
    cleaned_message = re.sub(r'http\S+|www\.\S+', '', cleaned_message)
    custom_punctuation = ':,.!?'
    translator = str.maketrans('', '', custom_punctuation)
    return cleaned_message.translate(translator)
def TruncateWords(topic,count):
    """Return the first `count` whitespace-separated words of `topic`, title-cased."""
    kept = topic.split()[:count]
    return " ".join(kept).title()
def WriteThemes():
    """Persist the accumulated theme dicts to OUTPUT_THEME_PATH as indented JSON."""
    global themes
    # `themes` already holds plain dicts; dump it directly (the previous
    # `[theme for theme in themes]` copy was redundant).
    with open(OUTPUT_THEME_PATH, "w") as json_file:
        json.dump(themes, json_file, indent=4)
def WriteUnknownThemes():
    """Persist the unclassified theme dicts to OUTPUT_UNKNOWN_THEME_PATH as indented JSON."""
    global unknown_themes
    # `unknown_themes` already holds plain dicts; dump it directly (the
    # previous list-comprehension copy was redundant).
    with open(OUTPUT_UNKNOWN_THEME_PATH, "w") as json_file:
        json.dump(unknown_themes, json_file, indent=4)
def WriteTopics():
    """Persist the current topic vocabulary (as its str repr) to TOPIC_TEXT_OUTPUT_PATH."""
    global game_topics
    # Removed leftover debug statement `print(dir(game_topics))`.
    with open(TOPIC_TEXT_OUTPUT_PATH, "w") as text_file:
        text_file.write(str(game_topics.all_words()))
def ProcessDateTime(date_time):
    """Format a datetime-like value as 'YYYY-MM-DD H:MM AM/PM'.

    BUG FIX: the original discarded its argument and always returned the
    hard-coded timestamp dt(2023, 7, 11, 9, 21) — an apparent debug leftover.
    `pd.to_datetime` accepts strings, Timestamps, and datetimes alike.

    NOTE(review): '%-I' (hour without zero padding) is a glibc extension and
    fails on Windows strftime — confirm target platform.
    """
    parsed = pd.to_datetime(date_time)
    return parsed.strftime("%Y-%m-%d %-I:%M %p")
def CompletionEngine(sys_message, user_message, num_tokens, num_results, temperature, topic_model, top_p):
    """Thin wrapper around openai.ChatCompletion.create for this module's prompts."""
    chat_messages = [
        {"role": "system", "content": sys_message},
        {"role": "user", "content": user_message},
    ]
    return openai.ChatCompletion.create(
        model=topic_model,
        messages=chat_messages,
        max_tokens=num_tokens,
        n=num_results,
        temperature=temperature,
        stop=None,
        top_p=top_p,
    )
def ConcatenateMatchAndCanonicals(message):
    """Build the round-1 prompt: match `message` against the known topic set."""
    global game_topics
    known = ', '.join(game_topics.all_words())
    print("*** game_topics_str: ", known)
    prompt = (
        f"Find a topic that represents this message '{message}' from this set "
        f"of topics {{{known}}}. Your reply should be the topic. If you're not "
        f"able to find a match, reply 'Unknown'"
    )
    print("*** prompt_message for first round is: ", prompt)
    return prompt
def ConcatenateMessageAndCanonicals(message):
    """Build the round-2 prompt: free-form 1-2 word summarization of `message`."""
    prompt = (
        f"Summarize this message '{message}' in 1-2 words. We're looking for "
        "a representative category to cluster messages. Identify subject or "
        "activity. Your reply should be one or two words representing the "
        "topic. If you're not able to summarize, reply 'Unknown'"
    )
    print("*** prompt_message for second round is: ", prompt)
    return prompt
def ConcatenateMessageAndTopics(message):
    """Build the round-3 (last-resort, most creative) summarization prompt."""
    prompt = (
        f"Be creative. We need 1-2 word summarization for this message: "
        f"'{message}'. If you aren't able to summarize, Identify the subject "
        "or direct object. Your reply should be one or two words representing "
        "the topic. As an absolute last resort, reply 'Unknown'"
    )
    print("*** prompt_message for third round is: ", prompt)
    return prompt
def ProcessMessageWrapper(datetime, message, replies, person, id):
    """Classify one message, then persist it to the known or unknown theme list."""
    global themes, unknown_themes
    theme = ProcessMessage(datetime, message, replies, person, id)
    print(f"Theme id: {id}, theme:{theme.theme}, modifier:{theme.modifier}, person:{theme.person}, message:{theme.message}, similarity:{theme.themeSimilarity}")
    unresolved = theme.theme == 'Unknown' or theme.themeSimilarity == 0
    if unresolved:
        unknown_themes.append(theme.to_dict())
        WriteUnknownThemes()
    else:
        themes.append(theme.to_dict())
        WriteThemes()
    return theme
# Update the process_message function
def ProcessMessage(datetime, message, replies, person, id):
    """Derive a theme for one Slack message via up to three ChatCompletion rounds.

    Round 1 asks the model to match the cleaned message against the known topic
    set; if every candidate is 'Unknown', round 2 asks for a free 1-2 word
    summary; if that also fails, round 3 retries with a more creative prompt.
    Round-2/3 candidates are reconciled against `game_topics` using WordNet and
    TF-IDF similarity, and genuinely new topics are enqueued into the vocabulary.

    Returns a Theme object (its theme may be 'Unknown').
    NOTE(review): `replies` is accepted but never used in this function.
    """
    global game_topics
    topMatch = True  # stays True only if round 1 produced a direct match
    #round 1, look for exact match
    completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMatchAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS, TEMP, TOPIC_MODEL,TOP_P)
    # Flatten all n choices: split each on newlines then commas, cap each
    # candidate at 3 title-cased words; set() dedupes but makes order
    # nondeterministic.
    options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
    if all(choice == "Unknown" for choice in options):
        #round 2, look for 1-2 summary, like topics
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS1, TEMP1, TOPIC_MODEL,TOP_P1)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    if all(choice == "Unknown" for choice in options):
        #round 3, look for 1-2 summary, wild card
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndTopics(CleanMessage(message)),TOPIC_TOKENS2, NUM_RESULTS2, TEMP2, TOPIC_MODEL,TOP_P2)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    print("---options: ", options, " topMatch: ",topMatch)
    if not topMatch:
        # Rounds 2/3 produced free-form candidates: score each one against the
        # canonical topics and pick the most similar.
        similarity_scores = []          # one row of WordNet scores per non-Unknown candidate
        generated_topics_indices = []   # maps rows of similarity_scores back into `options`
        counter=0
        exact_match=False
        most_similar_topic = "Unknown"
        unidentified_topic_count=0
        theme_obj=None
        for generated_topic in options:
            if generated_topic != "Unknown":
                generated_topics_indices.append(counter)
                generated_tokens = word_tokenize(generated_topic)
                generated_tokens_str = " ".join(generated_tokens)
                topic_similarities = []
                for reference_topic in game_topics.all_canonicals():
                    reference_tokens = word_tokenize(reference_topic)
                    reference_tokens_str = " ".join(reference_tokens)
                    similarity_score, exact_match = ComputeSimilarity(generated_tokens_str, reference_tokens_str)
                    if exact_match:
                        # Token-set match with a canonical topic: stop searching.
                        most_similar_topic = reference_topic
                        most_similar_score = 1.0
                        break
                    topic_similarities.append(similarity_score)
                if exact_match:
                    break
                similarity_scores.append(topic_similarities)
            else:
                unidentified_topic_count+=1
            counter+=1
        if len(similarity_scores) > 0 and not exact_match:
            most_similar_score=0
            # Aggregate the similarity scores for each generated topic
            similarity_scores = np.array(similarity_scores)
            aggregated_scores = np.sum(similarity_scores, axis=1)
            most_similar_index = np.argmax(aggregated_scores)
            most_similar_topic_index = generated_topics_indices[most_similar_index]
            most_similar_topic = options[most_similar_topic_index].lower()
            # NOTE(review): this indexes the 2-D score matrix, so the value is a
            # row (array), not a scalar — aggregated_scores[most_similar_index]
            # was probably intended.  Every branch below overwrites it, so it
            # only leaks through when most_similar_topic == "Unknown".
            most_similar_score=similarity_scores[most_similar_index]
        if most_similar_topic != "Unknown":
            #check if it's in all topics
            if most_similar_topic in game_topics.all_words():
                if most_similar_topic in game_topics.all_synonyms():
                    most_similar_topic = game_topics.canonical_for_synonym(most_similar_topic)
                else:
                    most_similar_topic = game_topics.get_canonical(most_similar_topic)
                most_similar_score=1.0
            else:
                #not in all words, look for similar topics, see if it's like something in list
                highest_similarity = 0
                best_match = None
                for known_word in game_topics.all_words():
                    #compute similarity against all topics
                    similarity_score = float(CompareTopicToGameTopic(most_similar_topic, known_word))
                    print("\tsimilarity_score: "+ str(similarity_score)+ " for known_word: "+ known_word)
                    if similarity_score > highest_similarity:
                        highest_similarity = similarity_score
                        best_match = known_word.lower()
                        print("\t>>>best_match found :", best_match, " with highest_similarity:", str(highest_similarity))
                print("if we found similar topic, use it")
                if highest_similarity > SIMILARITY_THRESHOLD:
                    if(best_match in game_topics.all_synonyms()):
                        most_similar_topic = game_topics.canonical_for_synonym(best_match)
                    else:
                        most_similar_topic = game_topics.get_canonical(best_match)
                    most_similar_score=highest_similarity
                else:
                    # Nothing similar enough: treat the candidate as a new topic.
                    game_topics.enqueue(most_similar_topic)
                    most_similar_score=1.0
            theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=str(most_similar_score))
            print(f"{id} From message:'{message}' to theme: {most_similar_topic}")
        else:
            theme_obj = Theme(datetime=datetime, theme='Unknown', modifier=0, person=person, postId=id, message=message, similarity=0)
    else:
        # Round 1 matched directly: take the first candidate as-is.
        most_similar_topic = options[0].lower()
        theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=1)
        print("\n**_*_*_* in else: theme_obj: ", theme_obj.to_dict())
    WriteTopics()
    return theme_obj
def CompareTopicToGameTopic(topic, game_topic):
    """Score how similar a candidate topic is to a known topic, in [0, 1]."""
    # Exact Match
    if topic == game_topic:
        return 1.0
    # Token Overlap: Jaccard index over lowercased word tokens.
    tokens_topic = set(word_tokenize(topic.lower()))
    tokens_game_topic = set(word_tokenize(game_topic.lower()))
    overlap_score = len(tokens_topic & tokens_game_topic) / len(tokens_topic | tokens_game_topic)
    # Semantic similarity via sklearn TF-IDF cosine (`coso` is
    # sklearn.metrics.pairwise.cosine_similarity, not OpenAI's
    # embeddings_utils.cosine_similarity, despite that import at file top).
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([topic, game_topic])
    semantic_similarity = coso(tfidf_matrix[0], tfidf_matrix[1])[0, 0]  # Get the scalar value
    # Combine Scores: 20% lexical overlap + 80% TF-IDF cosine.
    final_score = 0.2 * overlap_score + 0.8 * semantic_similarity
    print("Tokens topic:", tokens_topic, "tokens game topic:", tokens_game_topic, " overlap score:", overlap_score," Semantic similarity:", str(semantic_similarity), " final score: " + str(final_score))
    return final_score
def ComputeSimilarity(tokens1, tokens2):
    """WordNet-based similarity between two token strings.

    Returns (score, exact_match).  exact_match is True (score 1.0) when the
    lowercased token sets are identical.  Otherwise score is the average, over
    token cross-pairs, of the best Wu-Palmer similarity among same-POS synset
    pairs — or 0 when no comparable pair exists.
    """
    # Identical bags of words (case-insensitive) count as an exact match.
    if set(word_tokenize(tokens1.lower())) == set(word_tokenize(tokens2.lower())):
        return 1.0, True
    tokens1 = word_tokenize(tokens1)
    tokens2 = word_tokenize(tokens2)
    total_similarity = 0
    num_comparisons = 0
    for token1 in tokens1:
        for token2 in tokens2:
            token1_synsets = wordnet.synsets(token1)
            token2_synsets = wordnet.synsets(token2)
            # Skip tokens WordNet does not know about.
            if not token1_synsets or not token2_synsets:
                continue
            # Wu-Palmer similarity is only meaningful for same-POS synsets.
            similarity_scores = [
                synset1.wup_similarity(synset2)
                for synset1 in token1_synsets
                for synset2 in token2_synsets
                if synset1.pos() == synset2.pos()
            ]
            # wup_similarity may return None; keep only real float scores.
            valid_scores = [score for score in similarity_scores if isinstance(score, float)]
            if valid_scores:
                max_similarity = max(valid_scores)
                total_similarity += max_similarity
                num_comparisons += 1
    if num_comparisons > 0:
        return total_similarity / num_comparisons, False
    else:
        return 0, False
def FetchSlack():
    """Load the raw Slack export (records orientation) into a DataFrame."""
    return pd.read_json(INPUT_PATH, orient='records')
def ProcessReactions(reactions, id):
    """Return the emoji name (colons stripped) of the most-used reaction.

    Returns "" when `reactions` is not a list, is empty, or no reaction has a
    positive count.  `id` is unused but kept for the caller's signature.
    """
    if not isinstance(reactions, list):
        return ""
    best_count = 0
    best_emoji = ""
    for entry in reactions:
        emoji = entry['emoji'].strip(':')
        if entry['count'] > best_count:
            best_count = entry['count']
            best_emoji = emoji
    return best_emoji
def ProcessSlack():
    """Load slack.json, tag every message with a theme, and return person/theme/message.

    If OUTPUT_THEME_PATH already exists it is loaded as a cache instead of
    re-running the (expensive) LLM classification.
    NOTE(review): the cached branch assumes themes.json round-trips into the
    same person/theme/message columns as the freshly built frame — confirm
    against Theme.to_dict().
    """
    global df
    if not os.path.exists(OUTPUT_THEME_PATH):
        InitializeTopics()
        # Read JSON data into DataFrame
        df = pd.read_json(INPUT_PATH)
        # Keep selected columns and drop rows with missing values
        df = df[["person", "datetime", "message","replies", "id"]]
        # Filter down to top reaction, then create theme.
        #df["reaction"] = df.apply(lambda row: ProcessReactions(row["reactions"],row["id"]), axis=1)
        df["datetime"] = df.apply(lambda row: ProcessDateTime(row["datetime"]), axis=1)
        # Classify each message; ProcessMessageWrapper also persists themes to disk.
        df["theme"] = df.apply(lambda row: ProcessMessageWrapper(row["datetime"], row["message"], row["replies"], row["person"], row["id"]), axis=1)
    else:
        df = pd.read_json(OUTPUT_THEME_PATH)
    return df[["person", "theme", "message"]]
def CreateEmbeddings():
    """Embed each message's theme with the OpenAI embedding model (cached on disk).

    Returns a DataFrame with person / theme / message / embedding columns.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_EMBEDDINGS_PATH):
        ProcessSlack()
        #restrict sample to 500 most recent posts and remove samples that are too long
        top_n = SAMPLE_SIZE
        # NOTE(review): "datetime" holds formatted strings ("%Y-%m-%d %-I:%M %p"),
        # so this sort is lexicographic — unpadded hours can order incorrectly.
        # Also, ProcessSlack's cached branch loads a frame that may lack a
        # "datetime" column entirely — confirm.
        df = df.sort_values("datetime").tail(top_n * 2)
        df.drop("datetime", axis=1, inplace=True)
        encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
        # omit posts that are too long to embed
        df["n_tokens"] = df.theme.apply(lambda x: len(encoding.encode(str(x))))
        df = df[df.n_tokens <= MAX_TOKENS].tail(top_n)
        # Embedding floats are stored as strings for JSON round-tripping.
        df["embedding"] = df.theme.apply(lambda x: [str(val) for val in get_embedding(str(x), engine=EMBEDDING_MODEL)])
        df.to_json(OUTPUT_THEME_EMBEDDINGS_PATH, orient="records", lines=False)
    else:
        df = pd.read_json(OUTPUT_THEME_EMBEDDINGS_PATH)
    return df[["person", "theme", "message", "embedding"]]