import pandas as pd
import numpy as np
import openai, os, tiktoken, json
from datetime import datetime as dt
import re
from openai.embeddings_utils import cosine_similarity, get_embedding
from sklearn.metrics import classification_report, PrecisionRecallDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as coso
from collections import Counter
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from utilities.api_keys import APIKeys
from utilities.unique_queue import UniqueQueue
from slack_processing.theme import Theme
# Configure the OpenAI client with the key loaded from the project's key store.
openai.api_key = APIKeys().get_key('OPENAI_API_KEY')
# Set embedding model parameters
# Show full DataFrames when printing (no row/column truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Maximum number of most-recent posts to embed.
SAMPLE_SIZE = 500
# Posts whose theme encodes to more than this many tokens are skipped.
MAX_TOKENS = 100
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"
# Minimum score for matching a generated topic to a known topic (see ProcessMessage).
SIMILARITY_THRESHOLD = 0.6
# Input/output file locations.
INPUT_PATH = "slack_processing/data/slack.json"
OUTPUT_THEME_PATH = "slack_processing/data/themes.json"
OUTPUT_UNKNOWN_THEME_PATH = "slack_processing/data/unknown_themes.json"
OUTPUT_THEME_EMBEDDINGS_PATH = "slack_processing/data/slack_with_theme_embeddings.json"
TOPIC_TEXT_PATH="slack_processing/data/topics.txt"
TOPIC_TEXT_OUTPUT_PATH="slack_processing/data/topics_with_synonyms.txt"
# Chat-completion parameters for the three topic-extraction rounds in
# ProcessMessage; temperature/top_p rise per round (0.15 / 0.35 / 0.65).
TOPIC_TOKENS=50
TOPIC_MODEL ="gpt-3.5-turbo"
SYSTEM_MESSAGE="You are categorizing slack messages into 1-2 word topics."
NUM_RESULTS=5
TEMP=.15
TOP_P=.15
NUM_RESULTS1=5
TEMP1=.35
TOP_P1=.35
TOPIC_TOKENS2=50
NUM_RESULTS2=5
TEMP2=.65
TOP_P2=.65
# Module-level state shared (and mutated) by the processing functions below.
df=pd.DataFrame()
themes = []
unknown_themes=[]
game_topics = UniqueQueue()
def InitializeTopics():
    """Load topics and their synonyms from TOPIC_TEXT_PATH into the global game_topics queue.

    Each line of the file is 'canonical, synonym1, synonym2, ...'; everything
    is lower-cased and whitespace-trimmed before being enqueued. The loaded
    topics are printed for inspection.
    """
    global game_topics
    parsed_topics = []
    with open(TOPIC_TEXT_PATH, 'r') as file:
        for raw_line in file:
            pieces = raw_line.strip().lower().split(',')
            canonical = pieces[0].strip()
            aliases = [alias.strip() for alias in pieces[1:]]
            parsed_topics.append((canonical, aliases))
    for canonical, aliases in parsed_topics:
        game_topics.enqueue(canonical, aliases)
    # Dump the loaded queue contents for debugging/inspection.
    for topic in game_topics._queue.queue:
        print(topic)
def CleanMessage(message):
    """Normalize a Slack message for topic extraction.

    Unwraps :emoji: shortcodes to their bare name, removes http(s)/www URLs,
    and strips a small set of punctuation characters in one pass.

    Parameters:
        message: raw Slack message text.
    Returns:
        The cleaned message string.
    """
    # Replace :emoji_name: shortcodes with the bare name.
    cleaned_message = re.sub(r':(\w+):', r'\1', message)
    # Remove URLs. Bug fix: the dot in 'www.' is now escaped — the original
    # pattern 'www.\S+' treated '.' as a wildcard, so any word starting with
    # 'www' followed by any character would be deleted.
    cleaned_message = re.sub(r'http\S+|www\.\S+', '', cleaned_message)
    # Strip the listed punctuation in a single C-level pass.
    custom_punctuation = ':,.!?'
    translator = str.maketrans('', '', custom_punctuation)
    cleaned_message = cleaned_message.translate(translator)
    return cleaned_message
def TruncateWords(topic, count):
    """Keep at most `count` whitespace-separated words of `topic`, title-cased."""
    kept_words = topic.split()[:count]
    return " ".join(kept_words).title()
def WriteThemes():
    """Persist the global `themes` list (theme dicts) to OUTPUT_THEME_PATH as JSON.

    Fix: the original built `[theme for theme in themes]` — an identity copy
    that served no purpose; json.dump serializes the list directly.
    """
    global themes
    with open(OUTPUT_THEME_PATH, "w") as json_file:
        json.dump(themes, json_file, indent=4)
def WriteUnknownThemes():
    """Persist the global `unknown_themes` list to OUTPUT_UNKNOWN_THEME_PATH as JSON.

    Fix: dropped the redundant identity comprehension copy of the list
    (matches the same fix in WriteThemes); json.dump serializes it directly.
    """
    global unknown_themes
    with open(OUTPUT_UNKNOWN_THEME_PATH, "w") as json_file:
        json.dump(unknown_themes, json_file, indent=4)
def WriteTopics():
    """Persist all known topic words (canonicals + synonyms) to TOPIC_TEXT_OUTPUT_PATH.

    Fix: removed the leftover debugging statement `print(dir(game_topics))`,
    which dumped the queue object's attribute list to stdout on every call.
    """
    global game_topics
    with open(TOPIC_TEXT_OUTPUT_PATH, "w") as text_file:
        text_file.write(str(game_topics.all_words()))
def ProcessDateTime(date_time):
    """Format a datetime-like value as 'YYYY-MM-DD H:MM AM/PM'.

    Bug fix: the original immediately overwrote the `date_time` argument with
    a hard-coded dt(2023, 7, 11, 9, 21) — an apparent leftover test stub — so
    every row in the pipeline received the same timestamp. The incoming value
    is now actually formatted. The redundant f'{formatted_time}' wrapper was
    also dropped (strftime already returns a str).

    Parameters:
        date_time: a datetime / pandas.Timestamp-like object supporting strftime.
    Returns:
        The formatted timestamp string.
    """
    # NOTE: '%-I' (hour without zero padding) is a glibc extension and is not
    # portable to Windows strftime (which uses '%#I').
    return date_time.strftime("%Y-%m-%d %-I:%M %p")
def CompletionEngine(sys_message, user_message, num_tokens, num_results, temperature, topic_model, top_p):
    """Issue a chat completion request and return the raw OpenAI response object.

    Sends a two-message conversation (system prompt + user prompt) and asks
    for `num_results` alternative completions capped at `num_tokens` each.
    """
    chat_messages = [
        {"role": "system", "content": sys_message},
        {"role": "user", "content": user_message},
    ]
    return openai.ChatCompletion.create(
        model=topic_model,
        messages=chat_messages,
        max_tokens=num_tokens,
        n=num_results,
        temperature=temperature,
        stop=None,
        top_p=top_p,
    )
def ConcatenateMatchAndCanonicals(message):
    """Build the round-1 prompt: match `message` against the known topic set."""
    global game_topics
    known_topics = ', '.join(game_topics.all_words())
    print("*** game_topics_str: ", known_topics)
    prompt = (
        f"Find a topic that represents this message '{message}' "
        f"from this set of topics {{{known_topics}}}. "
        "Your reply should be the topic. "
        "If you're not able to find a match, reply 'Unknown'"
    )
    print("*** prompt_message for first round is: ", prompt)
    return prompt
def ConcatenateMessageAndCanonicals(message):
    """Build the round-2 prompt: ask the model for a 1-2 word summary of `message`."""
    prompt = (
        "Summarize this message '" + message + "' in 1-2 words. "
        "We're looking for a representative category to cluster messages. "
        "Identify subject or activity. Your reply should be one or two words "
        "representing the topic. If you're not able to summarize, reply 'Unknown'"
    )
    print("*** prompt_message for second round is: ", prompt)
    return prompt
def ConcatenateMessageAndTopics(message):
    """Build the round-3 (last-resort) prompt: creative 1-2 word summary of `message`."""
    prompt = (
        "Be creative. We need 1-2 word summarization for this message: "
        f"'{message}'. If you aren't able to summarize, Identify the subject "
        "or direct object. Your reply should be one or two words representing "
        "the topic. As an absolute last resort, reply 'Unknown'"
    )
    print("*** prompt_message for third round is: ", prompt)
    return prompt
def ProcessMessageWrapper(datetime, message, replies, person, id):
    """Classify one Slack post and record the result in the matching global list.

    Themes that resolved to 'Unknown' (or scored zero similarity) are appended
    to `unknown_themes`; everything else goes to `themes`. Each append is
    immediately flushed to its JSON file so progress survives interruption.

    Returns the Theme object produced by ProcessMessage.
    """
    global themes, unknown_themes
    theme = ProcessMessage(datetime, message, replies, person, id)
    print(f"Theme id: {id}, theme:{theme.theme}, modifier:{theme.modifier}, person:{theme.person}, message:{theme.message}, similarity:{theme.themeSimilarity}")
    unresolved = theme.theme == 'Unknown' or theme.themeSimilarity == 0
    if unresolved:
        unknown_themes.append(theme.to_dict())
        WriteUnknownThemes()
    else:
        themes.append(theme.to_dict())
        WriteThemes()
    return theme
# Update the process_message function
def ProcessMessage(datetime, message, replies, person, id):
    """Classify one Slack message into a topic using up to three LLM rounds.

    Round 1 asks the model to pick an existing topic from game_topics; if every
    candidate comes back 'Unknown', round 2 asks for a free-form 1-2 word
    summary, and round 3 retries with a more permissive ("creative") prompt at
    higher temperature. Free-form candidates (topMatch == False) are reconciled
    against the known topic list via WordNet/TF-IDF similarity; genuinely new
    topics are enqueued into game_topics.

    Parameters:
        datetime: pre-formatted timestamp string for the post.
        message: raw Slack message text.
        replies: replies payload (accepted but not used in this function).
        person: author of the post.
        id: post identifier.
    Returns:
        A Theme object carrying the chosen topic and a similarity score.
    """
    global game_topics
    topMatch = True
    #round 1, look for exact match
    completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMatchAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS, TEMP, TOPIC_MODEL,TOP_P)
    # Flatten every choice's content (newline- and comma-separated) into a
    # deduplicated list of candidate topics, each truncated to 3 title-cased words.
    options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
    if all(choice == "Unknown" for choice in options):
        #round 2, look for 1-2 summary, like topics
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS1, TEMP1, TOPIC_MODEL,TOP_P1)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    if all(choice == "Unknown" for choice in options):
        #round 3, look for 1-2 summary, wild card
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndTopics(CleanMessage(message)),TOPIC_TOKENS2, NUM_RESULTS2, TEMP2, TOPIC_MODEL,TOP_P2)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    print("---options: ", options, " topMatch: ",topMatch)
    if not topMatch:
        # Free-form candidates: score each non-Unknown candidate against every
        # canonical topic; an exact token-set match short-circuits the search.
        similarity_scores = []
        generated_topics_indices = []
        counter=0
        exact_match=False
        most_similar_topic = "Unknown"
        unidentified_topic_count=0
        theme_obj=None
        for generated_topic in options:
            if generated_topic != "Unknown":
                generated_topics_indices.append(counter)
                generated_tokens = word_tokenize(generated_topic)
                generated_tokens_str = " ".join(generated_tokens)
                topic_similarities = []
                for reference_topic in game_topics.all_canonicals():
                    reference_tokens = word_tokenize(reference_topic)
                    reference_tokens_str = " ".join(reference_tokens)
                    similarity_score, exact_match = ComputeSimilarity(generated_tokens_str, reference_tokens_str)
                    if exact_match:
                        most_similar_topic = reference_topic
                        most_similar_score = 1.0
                        break
                    topic_similarities.append(similarity_score)
                if exact_match:
                    break
                similarity_scores.append(topic_similarities)
            else:
                unidentified_topic_count+=1
            counter+=1
        if len(similarity_scores) > 0 and not exact_match:
            most_similar_score=0
            # Aggregate the similarity scores for each generated topic
            similarity_scores = np.array(similarity_scores)
            aggregated_scores = np.sum(similarity_scores, axis=1)
            most_similar_index = np.argmax(aggregated_scores)
            most_similar_topic_index = generated_topics_indices[most_similar_index]
            most_similar_topic = options[most_similar_topic_index].lower()
            # NOTE(review): this assigns a whole ROW of the 2-D score matrix
            # (an ndarray) to most_similar_score, which is later stringified
            # into Theme.similarity. Likely intended:
            # aggregated_scores[most_similar_index] or np.max(...). Confirm.
            most_similar_score=similarity_scores[most_similar_index]
        if most_similar_topic != "Unknown":
            #check if it's in all topics
            if most_similar_topic in game_topics.all_words():
                # Resolve synonyms to their canonical spelling.
                if most_similar_topic in game_topics.all_synonyms():
                    most_similar_topic = game_topics.canonical_for_synonym(most_similar_topic)
                else:
                    most_similar_topic = game_topics.get_canonical(most_similar_topic)
                most_similar_score=1.0
            else:
                #not in all words, look for similar topics, see if it's like something in list
                highest_similarity = 0
                best_match = None
                for known_word in game_topics.all_words():
                    #compute similarity against all topics
                    similarity_score = float(CompareTopicToGameTopic(most_similar_topic, known_word))
                    print("\tsimilarity_score: "+ str(similarity_score)+ " for known_word: "+ known_word)
                    if similarity_score > highest_similarity:
                        highest_similarity = similarity_score
                        best_match = known_word.lower()
                        print("\t>>>best_match found :", best_match, " with highest_similarity:", str(highest_similarity))
                print("if we found similar topic, use it")
                if highest_similarity > SIMILARITY_THRESHOLD:
                    # Close enough to an existing topic: adopt its canonical form.
                    if(best_match in game_topics.all_synonyms()):
                        most_similar_topic = game_topics.canonical_for_synonym(best_match)
                    else:
                        most_similar_topic = game_topics.get_canonical(best_match)
                    most_similar_score=highest_similarity
                else:
                    # Nothing similar enough: register it as a brand-new topic.
                    game_topics.enqueue(most_similar_topic)
                    most_similar_score=1.0
            theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=str(most_similar_score))
            print(f"{id} From message:'{message}' to theme: {most_similar_topic}")
        else:
            # No usable candidate from any round.
            theme_obj = Theme(datetime=datetime, theme='Unknown', modifier=0, person=person, postId=id, message=message, similarity=0)
    else:
        # Round 1 matched an existing topic directly.
        # NOTE(review): `options` comes from a set, so options[0] is an
        # arbitrary element — if round 1 returned a mix of real topics and
        # 'Unknown', this may pick 'Unknown'. Confirm intended behavior.
        most_similar_topic = options[0].lower()
        theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=1)
        print("\n**_*_*_* in else: theme_obj: ", theme_obj.to_dict())
    # Persist the (possibly grown) topic list after every message.
    WriteTopics()
    return theme_obj
def CompareTopicToGameTopic(topic, game_topic):
    """Score how well `topic` matches `game_topic` on a 0-1 scale.

    Identical strings short-circuit to 1.0. Otherwise the score is a weighted
    blend: 20% Jaccard overlap of lower-cased token sets, 80% TF-IDF cosine
    similarity between the raw strings (computed with sklearn, not the OpenAI
    helper — the original comment was misleading on this point).
    """
    if topic == game_topic:
        return 1.0
    # Jaccard overlap of lower-cased token sets.
    tokens_topic = set(word_tokenize(topic.lower()))
    tokens_game_topic = set(word_tokenize(game_topic.lower()))
    shared = tokens_topic & tokens_game_topic
    combined = tokens_topic | tokens_game_topic
    overlap_score = len(shared) / len(combined)
    # TF-IDF cosine similarity between the two raw strings.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([topic, game_topic])
    semantic_similarity = coso(tfidf_matrix[0], tfidf_matrix[1])[0, 0]  # scalar
    final_score = 0.2 * overlap_score + 0.8 * semantic_similarity
    print("Tokens topic:", tokens_topic, "tokens game topic:", tokens_game_topic, " overlap score:", overlap_score," Semantic similarity:", str(semantic_similarity), " final score: " + str(final_score))
    return final_score
def ComputeSimilarity(tokens1, tokens2):
    """Compute an average WordNet (Wu-Palmer) similarity between two phrases.

    Returns (score, exact_match). exact_match is True (score 1.0) when the
    lower-cased token sets of the two phrases are identical. Otherwise the
    score is the mean, over all comparable token pairs, of the best same-POS
    synset similarity; pairs where either token has no synsets are skipped,
    and (0, False) is returned when nothing could be compared.
    """
    if set(word_tokenize(tokens1.lower())) == set(word_tokenize(tokens2.lower())):
        return 1.0, True
    words_a = word_tokenize(tokens1)
    words_b = word_tokenize(tokens2)
    score_sum = 0
    pair_count = 0
    for word_a in words_a:
        for word_b in words_b:
            synsets_a = wordnet.synsets(word_a)
            synsets_b = wordnet.synsets(word_b)
            if not synsets_a or not synsets_b:
                continue
            # Best Wu-Palmer similarity across same-POS synset pairs;
            # wup_similarity can return None, so filter to real floats.
            candidates = [
                sa.wup_similarity(sb)
                for sa in synsets_a
                for sb in synsets_b
                if sa.pos() == sb.pos()
            ]
            usable = [s for s in candidates if isinstance(s, float)]
            if usable:
                score_sum += max(usable)
                pair_count += 1
    if pair_count > 0:
        return score_sum / pair_count, False
    return 0, False
def FetchSlack():
    """Load the raw Slack export (records-oriented JSON) from INPUT_PATH into a DataFrame."""
    return pd.read_json(INPUT_PATH, orient='records')
def ProcessReactions(reactions, id):
    """Return the emoji name (colons stripped) of the most-used reaction on a post.

    Returns "" when `reactions` is not a list, is empty, or no reaction has a
    positive count. On ties, the first reaction encountered wins. `id` is
    accepted for the caller's row-apply signature but is not used.
    """
    if not isinstance(reactions, list):
        return ""
    best_emoji = ""
    best_count = 0
    for entry in reactions:
        # Strictly greater: ties keep the earlier reaction.
        if entry['count'] > best_count:
            best_count = entry['count']
            best_emoji = entry['emoji'].strip(':')
    return best_emoji
def ProcessSlack():
    """Build (or reload) the themed Slack DataFrame in the global `df`.

    If OUTPUT_THEME_PATH does not exist yet, this runs the full pipeline:
    loads topics, reads the raw export, formats timestamps, and classifies
    every message (ProcessMessageWrapper also persists results as a side
    effect). Otherwise the previously generated themes JSON is loaded instead.

    Returns:
        The person/theme/message columns of the resulting DataFrame.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_PATH):
        InitializeTopics()
        # Read JSON data into DataFrame
        df = pd.read_json(INPUT_PATH)
        # Keep only the columns the pipeline needs.
        # NOTE(review): the original comment claimed rows with missing values
        # are dropped, but no dropna() is called here — confirm intent.
        df = df[["person", "datetime", "message","replies", "id"]]
        # Filter down to top reaction, then create theme.
        #df["reaction"] = df.apply(lambda row: ProcessReactions(row["reactions"],row["id"]), axis=1)
        df["datetime"] = df.apply(lambda row: ProcessDateTime(row["datetime"]), axis=1)
        # Classify each row; the wrapper appends/writes themes as it goes.
        df["theme"] = df.apply(lambda row: ProcessMessageWrapper(row["datetime"], row["message"], row["replies"], row["person"], row["id"]), axis=1)
    else:
        df = pd.read_json(OUTPUT_THEME_PATH)
    return df[["person", "theme", "message"]]
def CreateEmbeddings():
    """Embed the themed Slack posts and cache the result, using the global `df`.

    If OUTPUT_THEME_EMBEDDINGS_PATH is missing, runs ProcessSlack(), keeps the
    most recent posts, drops themes that exceed MAX_TOKENS when tokenized with
    EMBEDDING_ENCODING, fetches an embedding per theme via the OpenAI API, and
    writes the enriched frame to disk. Otherwise the cached JSON is loaded.

    Returns:
        The person/theme/message/embedding columns of the resulting DataFrame.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_EMBEDDINGS_PATH):
        ProcessSlack()
        #restrict sample to 500 most recent posts and remove samples that are too long
        top_n = SAMPLE_SIZE
        # Take 2x the target so the token-length filter below still leaves
        # enough rows to fill the final tail(top_n).
        # NOTE(review): this sorts the module-level df by "datetime", but
        # ProcessSlack's else-branch loads OUTPUT_THEME_PATH — confirm that
        # file actually carries a "datetime" column.
        df = df.sort_values("datetime").tail(top_n * 2)
        df.drop("datetime", axis=1, inplace=True)
        encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
        # omit posts that are too long to embed
        df["n_tokens"] = df.theme.apply(lambda x: len(encoding.encode(str(x))))
        df = df[df.n_tokens <= MAX_TOKENS].tail(top_n)
        # Embedding values are stringified before JSON serialization.
        df["embedding"] = df.theme.apply(lambda x: [str(val) for val in get_embedding(str(x), engine=EMBEDDING_MODEL)])
        df.to_json(OUTPUT_THEME_EMBEDDINGS_PATH, orient="records", lines=False)
    else:
        df = pd.read_json(OUTPUT_THEME_EMBEDDINGS_PATH)
    return df[["person", "theme", "message", "embedding"]]