import pandas as pd
import numpy as np
import openai, os, tiktoken, json
from datetime import datetime as dt
import re
from openai.embeddings_utils import cosine_similarity, get_embedding
from sklearn.metrics import classification_report, PrecisionRecallDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as coso
from collections import Counter
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from utilities.api_keys import APIKeys
from utilities.unique_queue import UniqueQueue
from slack_processing.theme import Theme
# Configure the OpenAI client with the key loaded from the project's key store.
openai.api_key = APIKeys().get_key('OPENAI_API_KEY')
# Set embedding model parameters
# Show full DataFrames when printing (no row/column truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Maximum number of most-recent posts to embed.
SAMPLE_SIZE = 500
# Posts whose theme encodes to more than this many tokens are skipped.
MAX_TOKENS = 100
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"
# Minimum score for matching a generated topic to a known topic (see ProcessMessage).
SIMILARITY_THRESHOLD = 0.6
# Input/output file locations.
INPUT_PATH = "slack_processing/data/slack.json"
OUTPUT_THEME_PATH = "slack_processing/data/themes.json"
OUTPUT_UNKNOWN_THEME_PATH = "slack_processing/data/unknown_themes.json"
OUTPUT_THEME_EMBEDDINGS_PATH = "slack_processing/data/slack_with_theme_embeddings.json"
TOPIC_TEXT_PATH="slack_processing/data/topics.txt"
TOPIC_TEXT_OUTPUT_PATH="slack_processing/data/topics_with_synonyms.txt"
# Chat-completion parameters for the three topic-extraction rounds in
# ProcessMessage; temperature/top_p rise per round (0.15 / 0.35 / 0.65).
TOPIC_TOKENS=50
TOPIC_MODEL ="gpt-3.5-turbo"
SYSTEM_MESSAGE="You are categorizing slack messages into 1-2 word topics."
NUM_RESULTS=5
TEMP=.15
TOP_P=.15
NUM_RESULTS1=5
TEMP1=.35
TOP_P1=.35
TOPIC_TOKENS2=50
NUM_RESULTS2=5
TEMP2=.65
TOP_P2=.65
# Module-level state shared (and mutated) by the processing functions below.
df=pd.DataFrame()
themes = []
unknown_themes=[]
game_topics = UniqueQueue()
def InitializeTopics():
    """Load topics and their synonyms from TOPIC_TEXT_PATH into the global game_topics queue.

    Each line of the file is 'canonical, synonym1, synonym2, ...'; everything
    is lower-cased and whitespace-trimmed before being enqueued. The loaded
    topics are printed for inspection.
    """
    global game_topics
    parsed_topics = []
    with open(TOPIC_TEXT_PATH, 'r') as file:
        for raw_line in file:
            pieces = raw_line.strip().lower().split(',')
            canonical = pieces[0].strip()
            aliases = [alias.strip() for alias in pieces[1:]]
            parsed_topics.append((canonical, aliases))
    for canonical, aliases in parsed_topics:
        game_topics.enqueue(canonical, aliases)
    # Dump the loaded queue contents for debugging/inspection.
    for topic in game_topics._queue.queue:
        print(topic)
def CleanMessage(message):
    """Normalize a Slack message for topic extraction.

    Unwraps :emoji: shortcodes to their bare name, removes http(s)/www URLs,
    and strips a small set of punctuation characters in one pass.

    Parameters:
        message: raw Slack message text.
    Returns:
        The cleaned message string.
    """
    # Replace :emoji_name: shortcodes with the bare name.
    cleaned_message = re.sub(r':(\w+):', r'\1', message)
    # Remove URLs. Bug fix: the dot in 'www.' is now escaped — the original
    # pattern 'www.\S+' treated '.' as a wildcard, so any word starting with
    # 'www' followed by any character would be deleted.
    cleaned_message = re.sub(r'http\S+|www\.\S+', '', cleaned_message)
    # Strip the listed punctuation in a single C-level pass.
    custom_punctuation = ':,.!?'
    translator = str.maketrans('', '', custom_punctuation)
    cleaned_message = cleaned_message.translate(translator)
    return cleaned_message
def TruncateWords(topic, count):
    """Keep at most `count` whitespace-separated words of `topic`, title-cased."""
    kept_words = topic.split()[:count]
    return " ".join(kept_words).title()
def WriteThemes():
    """Persist the global `themes` list (theme dicts) to OUTPUT_THEME_PATH as JSON.

    Fix: the original built `[theme for theme in themes]` — an identity copy
    that served no purpose; json.dump serializes the list directly.
    """
    global themes
    with open(OUTPUT_THEME_PATH, "w") as json_file:
        json.dump(themes, json_file, indent=4)
def WriteUnknownThemes():
    """Persist the global `unknown_themes` list to OUTPUT_UNKNOWN_THEME_PATH as JSON.

    Fix: dropped the redundant identity comprehension copy of the list
    (matches the same fix in WriteThemes); json.dump serializes it directly.
    """
    global unknown_themes
    with open(OUTPUT_UNKNOWN_THEME_PATH, "w") as json_file:
        json.dump(unknown_themes, json_file, indent=4)
def WriteTopics():
    """Persist all known topic words (canonicals + synonyms) to TOPIC_TEXT_OUTPUT_PATH.

    Fix: removed the leftover debugging statement `print(dir(game_topics))`,
    which dumped the queue object's attribute list to stdout on every call.
    """
    global game_topics
    with open(TOPIC_TEXT_OUTPUT_PATH, "w") as text_file:
        text_file.write(str(game_topics.all_words()))
def ProcessDateTime(date_time):
    """Format a datetime-like value as 'YYYY-MM-DD H:MM AM/PM'.

    Bug fix: the original immediately overwrote the `date_time` argument with
    a hard-coded dt(2023, 7, 11, 9, 21) — an apparent leftover test stub — so
    every row in the pipeline received the same timestamp. The incoming value
    is now actually formatted. The redundant f'{formatted_time}' wrapper was
    also dropped (strftime already returns a str).

    Parameters:
        date_time: a datetime / pandas.Timestamp-like object supporting strftime.
    Returns:
        The formatted timestamp string.
    """
    # NOTE: '%-I' (hour without zero padding) is a glibc extension and is not
    # portable to Windows strftime (which uses '%#I').
    return date_time.strftime("%Y-%m-%d %-I:%M %p")
def CompletionEngine(sys_message, user_message, num_tokens, num_results, temperature, topic_model, top_p):
    """Issue a chat completion request and return the raw OpenAI response object.

    Sends a two-message conversation (system prompt + user prompt) and asks
    for `num_results` alternative completions capped at `num_tokens` each.
    """
    chat_messages = [
        {"role": "system", "content": sys_message},
        {"role": "user", "content": user_message},
    ]
    return openai.ChatCompletion.create(
        model=topic_model,
        messages=chat_messages,
        max_tokens=num_tokens,
        n=num_results,
        temperature=temperature,
        stop=None,
        top_p=top_p,
    )
def ConcatenateMatchAndCanonicals(message):
    """Build the round-1 prompt: match `message` against the known topic set."""
    global game_topics
    known_topics = ', '.join(game_topics.all_words())
    print("*** game_topics_str: ", known_topics)
    prompt = (
        f"Find a topic that represents this message '{message}' "
        f"from this set of topics {{{known_topics}}}. "
        "Your reply should be the topic. "
        "If you're not able to find a match, reply 'Unknown'"
    )
    print("*** prompt_message for first round is: ", prompt)
    return prompt
def ConcatenateMessageAndCanonicals(message):
    """Build the round-2 prompt: ask the model for a 1-2 word summary of `message`."""
    prompt = (
        "Summarize this message '" + message + "' in 1-2 words. "
        "We're looking for a representative category to cluster messages. "
        "Identify subject or activity. Your reply should be one or two words "
        "representing the topic. If you're not able to summarize, reply 'Unknown'"
    )
    print("*** prompt_message for second round is: ", prompt)
    return prompt
def ConcatenateMessageAndTopics(message):
    """Build the round-3 (last-resort) prompt: creative 1-2 word summary of `message`."""
    prompt = (
        "Be creative. We need 1-2 word summarization for this message: "
        f"'{message}'. If you aren't able to summarize, Identify the subject "
        "or direct object. Your reply should be one or two words representing "
        "the topic. As an absolute last resort, reply 'Unknown'"
    )
    print("*** prompt_message for third round is: ", prompt)
    return prompt
def ProcessMessageWrapper(datetime, message, replies, person, id):
    """Classify one Slack post and record the result in the matching global list.

    Themes that resolved to 'Unknown' (or scored zero similarity) are appended
    to `unknown_themes`; everything else goes to `themes`. Each append is
    immediately flushed to its JSON file so progress survives interruption.

    Returns the Theme object produced by ProcessMessage.
    """
    global themes, unknown_themes
    theme = ProcessMessage(datetime, message, replies, person, id)
    print(f"Theme id: {id}, theme:{theme.theme}, modifier:{theme.modifier}, person:{theme.person}, message:{theme.message}, similarity:{theme.themeSimilarity}")
    unresolved = theme.theme == 'Unknown' or theme.themeSimilarity == 0
    if unresolved:
        unknown_themes.append(theme.to_dict())
        WriteUnknownThemes()
    else:
        themes.append(theme.to_dict())
        WriteThemes()
    return theme
# Update the process_message function
def ProcessMessage(datetime, message, replies, person, id):
    """Classify one Slack message into a topic using up to three LLM rounds.

    Round 1 asks the model to pick an existing topic from game_topics; if every
    candidate comes back 'Unknown', round 2 asks for a free-form 1-2 word
    summary, and round 3 retries with a more permissive ("creative") prompt at
    higher temperature. Free-form candidates (topMatch == False) are reconciled
    against the known topic list via WordNet/TF-IDF similarity; genuinely new
    topics are enqueued into game_topics.

    Parameters:
        datetime: pre-formatted timestamp string for the post.
        message: raw Slack message text.
        replies: replies payload (accepted but not used in this function).
        person: author of the post.
        id: post identifier.
    Returns:
        A Theme object carrying the chosen topic and a similarity score.
    """
    global game_topics
    topMatch = True
    #round 1, look for exact match
    completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMatchAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS, TEMP, TOPIC_MODEL,TOP_P)
    # Flatten every choice's content (newline- and comma-separated) into a
    # deduplicated list of candidate topics, each truncated to 3 title-cased words.
    options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
    if all(choice == "Unknown" for choice in options):
        #round 2, look for 1-2 summary, like topics
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndCanonicals(CleanMessage(message)),TOPIC_TOKENS, NUM_RESULTS1, TEMP1, TOPIC_MODEL,TOP_P1)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    if all(choice == "Unknown" for choice in options):
        #round 3, look for 1-2 summary, wild card
        completion=CompletionEngine(SYSTEM_MESSAGE,ConcatenateMessageAndTopics(CleanMessage(message)),TOPIC_TOKENS2, NUM_RESULTS2, TEMP2, TOPIC_MODEL,TOP_P2)
        options = list(set(TruncateWords(topic.strip(),3) for choice in completion.choices for concatenated_topics in choice.message.content.strip().split("\n") for topic in concatenated_topics.split(',')))
        topMatch = False
    print("---options: ", options, " topMatch: ",topMatch)
    if not topMatch:
        # Free-form candidates: score each non-Unknown candidate against every
        # canonical topic; an exact token-set match short-circuits the search.
        similarity_scores = []
        generated_topics_indices = []
        counter=0
        exact_match=False
        most_similar_topic = "Unknown"
        unidentified_topic_count=0
        theme_obj=None
        for generated_topic in options:
            if generated_topic != "Unknown":
                generated_topics_indices.append(counter)
                generated_tokens = word_tokenize(generated_topic)
                generated_tokens_str = " ".join(generated_tokens)
                topic_similarities = []
                for reference_topic in game_topics.all_canonicals():
                    reference_tokens = word_tokenize(reference_topic)
                    reference_tokens_str = " ".join(reference_tokens)
                    similarity_score, exact_match = ComputeSimilarity(generated_tokens_str, reference_tokens_str)
                    if exact_match:
                        most_similar_topic = reference_topic
                        most_similar_score = 1.0
                        break
                    topic_similarities.append(similarity_score)
                if exact_match:
                    break
                similarity_scores.append(topic_similarities)
            else:
                unidentified_topic_count+=1
            counter+=1
        if len(similarity_scores) > 0 and not exact_match:
            most_similar_score=0
            # Aggregate the similarity scores for each generated topic
            similarity_scores = np.array(similarity_scores)
            aggregated_scores = np.sum(similarity_scores, axis=1)
            most_similar_index = np.argmax(aggregated_scores)
            most_similar_topic_index = generated_topics_indices[most_similar_index]
            most_similar_topic = options[most_similar_topic_index].lower()
            # NOTE(review): this assigns a whole ROW of the 2-D score matrix
            # (an ndarray) to most_similar_score, which is later stringified
            # into Theme.similarity. Likely intended:
            # aggregated_scores[most_similar_index] or np.max(...). Confirm.
            most_similar_score=similarity_scores[most_similar_index]
        if most_similar_topic != "Unknown":
            #check if it's in all topics
            if most_similar_topic in game_topics.all_words():
                # Resolve synonyms to their canonical spelling.
                if most_similar_topic in game_topics.all_synonyms():
                    most_similar_topic = game_topics.canonical_for_synonym(most_similar_topic)
                else:
                    most_similar_topic = game_topics.get_canonical(most_similar_topic)
                most_similar_score=1.0
            else:
                #not in all words, look for similar topics, see if it's like something in list
                highest_similarity = 0
                best_match = None
                for known_word in game_topics.all_words():
                    #compute similarity against all topics
                    similarity_score = float(CompareTopicToGameTopic(most_similar_topic, known_word))
                    print("\tsimilarity_score: "+ str(similarity_score)+ " for known_word: "+ known_word)
                    if similarity_score > highest_similarity:
                        highest_similarity = similarity_score
                        best_match = known_word.lower()
                        print("\t>>>best_match found :", best_match, " with highest_similarity:", str(highest_similarity))
                print("if we found similar topic, use it")
                if highest_similarity > SIMILARITY_THRESHOLD:
                    # Close enough to an existing topic: adopt its canonical form.
                    if(best_match in game_topics.all_synonyms()):
                        most_similar_topic = game_topics.canonical_for_synonym(best_match)
                    else:
                        most_similar_topic = game_topics.get_canonical(best_match)
                    most_similar_score=highest_similarity
                else:
                    # Nothing similar enough: register it as a brand-new topic.
                    game_topics.enqueue(most_similar_topic)
                    most_similar_score=1.0
            theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=str(most_similar_score))
            print(f"{id} From message:'{message}' to theme: {most_similar_topic}")
        else:
            # No usable candidate from any round.
            theme_obj = Theme(datetime=datetime, theme='Unknown', modifier=0, person=person, postId=id, message=message, similarity=0)
    else:
        # Round 1 matched an existing topic directly.
        # NOTE(review): `options` comes from a set, so options[0] is an
        # arbitrary element — if round 1 returned a mix of real topics and
        # 'Unknown', this may pick 'Unknown'. Confirm intended behavior.
        most_similar_topic = options[0].lower()
        theme_obj = Theme(datetime=datetime, theme=most_similar_topic, modifier=0, person=person, postId=id, message=message, similarity=1)
        print("\n**_*_*_* in else: theme_obj: ", theme_obj.to_dict())
    # Persist the (possibly grown) topic list after every message.
    WriteTopics()
    return theme_obj
def CompareTopicToGameTopic(topic, game_topic):
    """Score how well `topic` matches `game_topic` on a 0-1 scale.

    Identical strings short-circuit to 1.0. Otherwise the score is a weighted
    blend: 20% Jaccard overlap of lower-cased token sets, 80% TF-IDF cosine
    similarity between the raw strings (computed with sklearn, not the OpenAI
    helper — the original comment was misleading on this point).
    """
    if topic == game_topic:
        return 1.0
    # Jaccard overlap of lower-cased token sets.
    tokens_topic = set(word_tokenize(topic.lower()))
    tokens_game_topic = set(word_tokenize(game_topic.lower()))
    shared = tokens_topic & tokens_game_topic
    combined = tokens_topic | tokens_game_topic
    overlap_score = len(shared) / len(combined)
    # TF-IDF cosine similarity between the two raw strings.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([topic, game_topic])
    semantic_similarity = coso(tfidf_matrix[0], tfidf_matrix[1])[0, 0]  # scalar
    final_score = 0.2 * overlap_score + 0.8 * semantic_similarity
    print("Tokens topic:", tokens_topic, "tokens game topic:", tokens_game_topic, " overlap score:", overlap_score," Semantic similarity:", str(semantic_similarity), " final score: " + str(final_score))
    return final_score
def ComputeSimilarity(tokens1, tokens2):
    """Compute an average WordNet (Wu-Palmer) similarity between two phrases.

    Returns (score, exact_match). exact_match is True (score 1.0) when the
    lower-cased token sets of the two phrases are identical. Otherwise the
    score is the mean, over all comparable token pairs, of the best same-POS
    synset similarity; pairs where either token has no synsets are skipped,
    and (0, False) is returned when nothing could be compared.
    """
    if set(word_tokenize(tokens1.lower())) == set(word_tokenize(tokens2.lower())):
        return 1.0, True
    words_a = word_tokenize(tokens1)
    words_b = word_tokenize(tokens2)
    score_sum = 0
    pair_count = 0
    for word_a in words_a:
        for word_b in words_b:
            synsets_a = wordnet.synsets(word_a)
            synsets_b = wordnet.synsets(word_b)
            if not synsets_a or not synsets_b:
                continue
            # Best Wu-Palmer similarity across same-POS synset pairs;
            # wup_similarity can return None, so filter to real floats.
            candidates = [
                sa.wup_similarity(sb)
                for sa in synsets_a
                for sb in synsets_b
                if sa.pos() == sb.pos()
            ]
            usable = [s for s in candidates if isinstance(s, float)]
            if usable:
                score_sum += max(usable)
                pair_count += 1
    if pair_count > 0:
        return score_sum / pair_count, False
    return 0, False
def FetchSlack():
    """Load the raw Slack export (records-oriented JSON) from INPUT_PATH into a DataFrame."""
    return pd.read_json(INPUT_PATH, orient='records')
def ProcessReactions(reactions, id):
    """Return the emoji name (colons stripped) of the most-used reaction on a post.

    Returns "" when `reactions` is not a list, is empty, or no reaction has a
    positive count. On ties, the first reaction encountered wins. `id` is
    accepted for the caller's row-apply signature but is not used.
    """
    if not isinstance(reactions, list):
        return ""
    best_emoji = ""
    best_count = 0
    for entry in reactions:
        # Strictly greater: ties keep the earlier reaction.
        if entry['count'] > best_count:
            best_count = entry['count']
            best_emoji = entry['emoji'].strip(':')
    return best_emoji
def ProcessSlack():
    """Build (or reload) the themed Slack DataFrame in the global `df`.

    If OUTPUT_THEME_PATH does not exist yet, this runs the full pipeline:
    loads topics, reads the raw export, formats timestamps, and classifies
    every message (ProcessMessageWrapper also persists results as a side
    effect). Otherwise the previously generated themes JSON is loaded instead.

    Returns:
        The person/theme/message columns of the resulting DataFrame.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_PATH):
        InitializeTopics()
        # Read JSON data into DataFrame
        df = pd.read_json(INPUT_PATH)
        # Keep only the columns the pipeline needs.
        # NOTE(review): the original comment claimed rows with missing values
        # are dropped, but no dropna() is called here — confirm intent.
        df = df[["person", "datetime", "message","replies", "id"]]
        # Filter down to top reaction, then create theme.
        #df["reaction"] = df.apply(lambda row: ProcessReactions(row["reactions"],row["id"]), axis=1)
        df["datetime"] = df.apply(lambda row: ProcessDateTime(row["datetime"]), axis=1)
        # Classify each row; the wrapper appends/writes themes as it goes.
        df["theme"] = df.apply(lambda row: ProcessMessageWrapper(row["datetime"], row["message"], row["replies"], row["person"], row["id"]), axis=1)
    else:
        df = pd.read_json(OUTPUT_THEME_PATH)
    return df[["person", "theme", "message"]]
def CreateEmbeddings():
    """Embed the themed Slack posts and cache the result, using the global `df`.

    If OUTPUT_THEME_EMBEDDINGS_PATH is missing, runs ProcessSlack(), keeps the
    most recent posts, drops themes that exceed MAX_TOKENS when tokenized with
    EMBEDDING_ENCODING, fetches an embedding per theme via the OpenAI API, and
    writes the enriched frame to disk. Otherwise the cached JSON is loaded.

    Returns:
        The person/theme/message/embedding columns of the resulting DataFrame.
    """
    global df
    if not os.path.exists(OUTPUT_THEME_EMBEDDINGS_PATH):
        ProcessSlack()
        #restrict sample to 500 most recent posts and remove samples that are too long
        top_n = SAMPLE_SIZE
        # Take 2x the target so the token-length filter below still leaves
        # enough rows to fill the final tail(top_n).
        # NOTE(review): this sorts the module-level df by "datetime", but
        # ProcessSlack's else-branch loads OUTPUT_THEME_PATH — confirm that
        # file actually carries a "datetime" column.
        df = df.sort_values("datetime").tail(top_n * 2)
        df.drop("datetime", axis=1, inplace=True)
        encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
        # omit posts that are too long to embed
        df["n_tokens"] = df.theme.apply(lambda x: len(encoding.encode(str(x))))
        df = df[df.n_tokens <= MAX_TOKENS].tail(top_n)
        # Embedding values are stringified before JSON serialization.
        df["embedding"] = df.theme.apply(lambda x: [str(val) for val in get_embedding(str(x), engine=EMBEDDING_MODEL)])
        df.to_json(OUTPUT_THEME_EMBEDDINGS_PATH, orient="records", lines=False)
    else:
        df = pd.read_json(OUTPUT_THEME_EMBEDDINGS_PATH)
    return df[["person", "theme", "message", "embedding"]]