File size: 16,588 Bytes
7a479d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
import pandas as pd
import numpy as np
import openai, os, tiktoken, json
from datetime import datetime as dt
import re
from openai.embeddings_utils import cosine_similarity, get_embedding
from sklearn.metrics import classification_report, PrecisionRecallDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as coso
from collections import Counter
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from utilities.api_keys import APIKeys
from utilities.unique_queue import UniqueQueue
from slack_processing.theme import Theme

openai.api_key = APIKeys().get_key('OPENAI_API_KEY')

# Set embedding model parameters
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

SAMPLE_SIZE = 500
MAX_TOKENS = 100
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  
SIMILARITY_THRESHOLD = 0.6

INPUT_PATH = "slack_processing/data/slack.json"
OUTPUT_THEME_PATH = "slack_processing/data/themes.json"
OUTPUT_UNKNOWN_THEME_PATH = "slack_processing/data/unknown_themes.json"

OUTPUT_THEME_EMBEDDINGS_PATH = "slack_processing/data/slack_with_theme_embeddings.json"
TOPIC_TEXT_PATH="slack_processing/data/topics.txt"
TOPIC_TEXT_OUTPUT_PATH="slack_processing/data/topics_with_synonyms.txt"

TOPIC_TOKENS=50
TOPIC_MODEL ="gpt-3.5-turbo"
SYSTEM_MESSAGE="You are categorizing slack messages into 1-2 word topics."

NUM_RESULTS=5
TEMP=.15
TOP_P=.15
NUM_RESULTS1=5
TEMP1=.35
TOP_P1=.35
TOPIC_TOKENS2=50
NUM_RESULTS2=5
TEMP2=.65
TOP_P2=.65

df=pd.DataFrame()
themes = []
unknown_themes=[]
game_topics = UniqueQueue()

def InitializeTopics():
    """Load canonical topics and their synonyms from TOPIC_TEXT_PATH into game_topics.

    Each line of the file is 'canonical, synonym1, synonym2, ...' (case-insensitive).
    """
    global game_topics
    with open(TOPIC_TEXT_PATH, 'r') as file:
        parsed_lines = [
            [part.strip() for part in line.strip().lower().split(',')]
            for line in file
        ]
    for entry in parsed_lines:
        # First field is the canonical topic; the remainder are its synonyms.
        game_topics.enqueue(entry[0], entry[1:])
    # Dump the loaded vocabulary for debugging.
    for topic in game_topics._queue.queue:
        print(topic)

def CleanMessage(message):
    """Normalize a Slack message: unwrap emoji codes, drop URLs, strip punctuation.

    ':smile:' becomes 'smile'; http(s)/www links are removed entirely; the
    characters ':,.!?' are deleted in a single translate pass.
    """
    cleaned_message = re.sub(r':(\w+):', r'\1', message)
    # BUG FIX: the dot in 'www.' was unescaped, so any 'www'-prefixed word
    # (e.g. 'wwwabc') was stripped as a URL. Escape it to match a literal dot.
    cleaned_message = re.sub(r'http\S+|www\.\S+', '', cleaned_message)
    custom_punctuation = ':,.!?'
    translator = str.maketrans('', '', custom_punctuation)
    return cleaned_message.translate(translator)

def TruncateWords(topic, count):
    """Return the first `count` whitespace-separated words of `topic`, title-cased."""
    return " ".join(topic.split()[:count]).title()

def WriteThemes():
    """Dump the accumulated theme dicts to OUTPUT_THEME_PATH as pretty-printed JSON."""
    global themes
    with open(OUTPUT_THEME_PATH, "w") as json_file:
        json.dump(list(themes), json_file, indent=4)

def WriteUnknownThemes():
    """Dump the unclassified theme dicts to OUTPUT_UNKNOWN_THEME_PATH as JSON."""
    global unknown_themes
    with open(OUTPUT_UNKNOWN_THEME_PATH, "w") as json_file:
        json.dump(list(unknown_themes), json_file, indent=4)

def WriteTopics():
    """Persist the current topic vocabulary (canonicals + synonyms) to disk.

    BUG FIX: removed a leftover debug `print(dir(game_topics))` that spammed
    stdout on every message processed.
    """
    global game_topics
    with open(TOPIC_TEXT_OUTPUT_PATH, "w") as text_file:
        text_file.write(str(game_topics.all_words()))

def ProcessDateTime(date_time):
    """Format a datetime-like value as e.g. '2023-07-11 9:21 AM'.

    BUG FIX: the original overwrote the argument with a hard-coded
    dt(2023, 7, 11, 9, 21), so every row received the same timestamp; the
    actual value is now formatted.

    NOTE(review): assumes date_time supports strftime (datetime / pandas
    Timestamp) — confirm what pd.read_json yields here. '%-I' (hour without
    zero padding) is POSIX-only and fails on Windows.
    """
    return date_time.strftime("%Y-%m-%d %-I:%M %p")

def CompletionEngine(sys_message, user_message, num_tokens, num_results, temperature, topic_model, top_p):
    """Thin wrapper over the OpenAI chat-completion API.

    Sends a system + user message pair and returns the raw ChatCompletion
    response containing `num_results` candidate choices.
    """
    chat_messages = [
        {"role": "system", "content": sys_message},
        {"role": "user", "content": user_message},
    ]
    return openai.ChatCompletion.create(
        model=topic_model,
        messages=chat_messages,
        max_tokens=num_tokens,
        n=num_results,
        temperature=temperature,
        stop=None,
        top_p=top_p,
    )

def ConcatenateMatchAndCanonicals(message):
    """Build the round-1 prompt: match the message against the known topic set."""
    global game_topics
    vocabulary = ', '.join(game_topics.all_words())
    print("*** game_topics_str: ", vocabulary)
    prompt = f"Find a topic that represents this message '{message}' from this set of topics {{{vocabulary}}}. Your reply should be the topic. If you're not able to find a match, reply 'Unknown'"
    print("*** prompt_message for first round is: ", prompt)
    return prompt

def ConcatenateMessageAndCanonicals(message):
    """Build the round-2 prompt: ask for a free-form 1-2 word summary."""
    prompt = f"Summarize this message '{message}' in 1-2 words. We're looking for a representative category to cluster messages. Identify subject or activity. Your reply should be one or two words representing the topic. If you're not able to summarize, reply 'Unknown'"
    print("*** prompt_message for second round is: ", prompt)
    return prompt

def ConcatenateMessageAndTopics(message):
    """Build the round-3 prompt: last-resort creative 1-2 word summary."""
    prompt = f"Be creative. We need 1-2 word summarization for this message: '{message}'. If you aren't able to summarize, Identify the subject or direct object. Your reply should be one or two words representing the topic. As an absolute last resort, reply 'Unknown'"
    print("*** prompt_message for third round is: ", prompt)
    return prompt

def ProcessMessageWrapper(datetime, message, replies, person, id):
    """Classify one message, then persist the result into the matching bucket.

    Themes that came back 'Unknown' (or with zero similarity) go to
    unknown_themes; everything else goes to themes. Both lists are flushed
    to disk after every message.
    """
    global themes, unknown_themes
    theme = ProcessMessage(datetime, message, replies, person, id)
    print(f"Theme id: {id}, theme:{theme.theme}, modifier:{theme.modifier}, person:{theme.person}, message:{theme.message}, similarity:{theme.themeSimilarity}")
    unresolved = theme.theme == 'Unknown' or theme.themeSimilarity == 0
    if unresolved:
        unknown_themes.append(theme.to_dict())
        WriteUnknownThemes()
    else:
        themes.append(theme.to_dict())
        WriteThemes()
    return theme
    
# Update the process_message function
def ProcessMessage(datetime, message, replies, person, id):    
    global game_topics
    topMatch = True
    #round 1, look for exact match
    completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMatchAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS, TEMP, TOPIC_MODEL,TOP_P)
    options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
    if all(choice == "Unknown" for choice in options):
        #round 2, look for 1-2 summary, like topics
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS1, TEMP1, TOPIC_MODEL,TOP_P1)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
        if all(choice == "Unknown" for choice in options):
            #round 3, look for 1-2 summary, wild card
            completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndTopics(CleanMessage(message)),TOPIC_TOKENS2, NUM_RESULTS2, TEMP2, TOPIC_MODEL,TOP_P2)
            options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))        
            topMatch = False
    print("---options: ", options, " topMatch: ",topMatch)
    if not topMatch:
        similarity_scores = []
        generated_topics_indices = [] 
        counter=0
        exact_match=False
        most_similar_topic = "Unknown"  
        unidentified_topic_count=0
        theme_obj=None
        for generated_topic in options:  
            if generated_topic != "Unknown":
                generated_topics_indices.append(counter) 
                generated_tokens = word_tokenize(generated_topic)
                generated_tokens_str = " ".join(generated_tokens)
                topic_similarities = []
                for reference_topic in game_topics.all_canonicals():
                    reference_tokens = word_tokenize(reference_topic)
                    reference_tokens_str = " ".join(reference_tokens)
                    similarity_score, exact_match = ComputeSimilarity(generated_tokens_str, reference_tokens_str)
                    if exact_match:
                        most_similar_topic = reference_topic
                        most_similar_score = 1.0
                        break
                    topic_similarities.append(similarity_score)          
                if exact_match:
                    break  
                similarity_scores.append(topic_similarities)   
            else:
                unidentified_topic_count+=1            
            counter+=1
        if len(similarity_scores) > 0 and not exact_match:
            most_similar_score=0
            # Aggregate the similarity scores for each generated topic
            similarity_scores = np.array(similarity_scores)
            aggregated_scores = np.sum(similarity_scores, axis=1)
            most_similar_index = np.argmax(aggregated_scores)
            most_similar_topic_index = generated_topics_indices[most_similar_index]
            most_similar_topic = options[most_similar_topic_index].lower()
            most_similar_score=similarity_scores[most_similar_index]

        if most_similar_topic != "Unknown":   
            #check if it's in all topics
            if most_similar_topic in game_topics.all_words():
                if most_similar_topic in game_topics.all_synonyms():
                    most_similar_topic = game_topics.canonical_for_synonym(most_similar_topic)                
                else:                
                    most_similar_topic = game_topics.get_canonical(most_similar_topic)
                most_similar_score=1.0
            else:  
                #not in all words, look for similar topics, see if it's like something in list 
                highest_similarity = 0
                best_match = None
                for known_word in game_topics.all_words():
                    #compute similarity against all topics
                    similarity_score = float(CompareTopicToGameTopic(most_similar_topic, known_word))
                    print("\tsimilarity_score: "+ str(similarity_score)+ " for known_word: "+ known_word)
                    if similarity_score > highest_similarity:
                        highest_similarity = similarity_score
                        best_match = known_word.lower()
                        print("\t>>>best_match found :", best_match, " with highest_similarity:", str(highest_similarity))

                print("if we found similar topic, use it")
                if highest_similarity > SIMILARITY_THRESHOLD:
                    if(best_match in game_topics.all_synonyms()):
                        most_similar_topic = game_topics.canonical_for_synonym(best_match)
                    else:
                        most_similar_topic = game_topics.get_canonical(best_match)
                    most_similar_score=highest_similarity
                else:
                    game_topics.enqueue(most_similar_topic)
                    most_similar_score=1.0

            theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=str(most_similar_score))   
            print(f"{id} From message:'{message}' to theme: {most_similar_topic}")
        else:
            theme_obj = Theme(datetime=datetime, theme='Unknown', modifier=0, person=person, postId=id, message=message, similarity=0)   
    else:
        most_similar_topic = options[0].lower()
        theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=1)   
        print("\n**_*_*_* in else: theme_obj: ", theme_obj.to_dict())
    WriteTopics()
    return theme_obj

def CompareTopicToGameTopic(topic, game_topic):
    """Score how alike two topic strings are (1.0 on exact string match).

    Blends Jaccard token overlap (weight 0.2) with TF-IDF cosine similarity
    (weight 0.8) computed over just the two strings.
    """
    if topic == game_topic:
        return 1.0

    # Jaccard overlap of the lowercased token sets.
    tokens_topic = set(word_tokenize(topic.lower()))
    tokens_game_topic = set(word_tokenize(game_topic.lower()))
    overlap_score = len(tokens_topic & tokens_game_topic) / len(tokens_topic | tokens_game_topic)

    # TF-IDF cosine similarity between the two raw strings.
    tfidf_matrix = TfidfVectorizer().fit_transform([topic, game_topic])
    semantic_similarity = coso(tfidf_matrix[0], tfidf_matrix[1])[0, 0]

    # Weighted blend of the two signals.
    final_score = 0.2 * overlap_score + 0.8 * semantic_similarity
    print("Tokens topic:", tokens_topic, "tokens game topic:", tokens_game_topic, " overlap score:", overlap_score, " Semantic similarity:", str(semantic_similarity), " final score: " + str(final_score))
    return final_score

def ComputeSimilarity(tokens1, tokens2):
    """Average best WordNet Wu-Palmer similarity across all token pairs.

    Returns (score, exact) where exact is True iff the two strings contain
    identical token sets (case-insensitive); otherwise exact is False and
    score is the mean of the best same-POS synset similarity per token pair
    (0 when no pair produced a valid score).
    """
    if set(word_tokenize(tokens1.lower())) == set(word_tokenize(tokens2.lower())):
        return 1.0, True
    left_tokens = word_tokenize(tokens1)
    right_tokens = word_tokenize(tokens2)
    total = 0
    comparisons = 0
    for left in left_tokens:
        left_synsets = wordnet.synsets(left)
        if not left_synsets:
            continue
        for right in right_tokens:
            right_synsets = wordnet.synsets(right)
            if not right_synsets:
                continue
            # Compare only synsets sharing a part of speech; wup_similarity
            # may return None, so keep floats only.
            pair_scores = [
                a.wup_similarity(b)
                for a in left_synsets
                for b in right_synsets
                if a.pos() == b.pos()
            ]
            valid = [s for s in pair_scores if isinstance(s, float)]
            if valid:
                total += max(valid)
                comparisons += 1
    return (total / comparisons, False) if comparisons > 0 else (0, False)
    
def FetchSlack():
    """Load the raw Slack export as a records-oriented DataFrame."""
    return pd.read_json(INPUT_PATH, orient="records")

def ProcessReactions(reactions, id):
    """Return the emoji (surrounding colons stripped) with the highest count.

    Non-list input (e.g. NaN from pandas) yields ''; ties keep the earliest
    reaction; reactions with count <= 0 never win. `id` is unused but kept
    for the DataFrame.apply call signature.
    """
    if not isinstance(reactions, list):
        return ""
    best_count = 0
    best_emoji = ""
    for reaction in reactions:
        if reaction['count'] > best_count:
            best_count = reaction['count']
            best_emoji = reaction['emoji'].strip(':')
    return best_emoji

def ProcessSlack():
    """Build (or reload from cache) the per-message theme DataFrame.

    When OUTPUT_THEME_PATH already exists the cached themes are loaded;
    otherwise every message in the raw export is classified (which also
    writes the theme files as a side effect of ProcessMessageWrapper).
    """
    global df
    if os.path.exists(OUTPUT_THEME_PATH):
        # Cached result from a previous run.
        df = pd.read_json(OUTPUT_THEME_PATH)
    else:
        InitializeTopics()
        df = pd.read_json(INPUT_PATH)
        df = df[["person", "datetime", "message", "replies", "id"]]
        df["datetime"] = df.apply(lambda row: ProcessDateTime(row["datetime"]), axis=1)
        df["theme"] = df.apply(
            lambda row: ProcessMessageWrapper(
                row["datetime"], row["message"], row["replies"], row["person"], row["id"]
            ),
            axis=1,
        )
    return df[["person", "theme", "message"]]

def CreateEmbeddings():
    """Attach ada-002 embeddings to the themed messages (cached on disk).

    On a cache hit the saved embeddings file is loaded; otherwise the theme
    pipeline runs, the sample is trimmed to the SAMPLE_SIZE most recent short
    posts, each theme is embedded, and the result is written to disk.
    """
    global df
    if os.path.exists(OUTPUT_THEME_EMBEDDINGS_PATH):
        df = pd.read_json(OUTPUT_THEME_EMBEDDINGS_PATH)
        return df[["person", "theme", "message", "embedding"]]

    ProcessSlack()
    top_n = SAMPLE_SIZE
    # Keep twice the target count of the most recent posts, then filter.
    df = df.sort_values("datetime").tail(top_n * 2)
    df.drop("datetime", axis=1, inplace=True)
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    # Omit posts whose theme is too long to embed.
    df["n_tokens"] = df.theme.apply(lambda x: len(encoding.encode(str(x))))
    df = df[df.n_tokens <= MAX_TOKENS].tail(top_n)
    # Embeddings are stored as lists of strings (matches the original format).
    df["embedding"] = df.theme.apply(
        lambda x: [str(val) for val in get_embedding(str(x), engine=EMBEDDING_MODEL)]
    )
    df.to_json(OUTPUT_THEME_EMBEDDINGS_PATH, orient="records", lines=False)
    return df[["person", "theme", "message", "embedding"]]