File size: 6,500 Bytes
45c7ceb
9b4c514
45c7ceb
e3ee58f
 
 
 
 
c6d1832
 
e3ee58f
 
45c7ceb
e3ee58f
 
2e31929
45c7ceb
e3ee58f
 
 
45c7ceb
e3ee58f
 
 
 
 
 
 
45c7ceb
 
 
e3ee58f
2e31929
c6d1832
8b56a5d
c6d1832
 
 
 
 
 
 
 
 
e3ee58f
2e31929
 
 
 
 
 
e3ee58f
45c7ceb
e3ee58f
 
 
 
 
 
 
 
 
 
 
 
 
45c7ceb
e3ee58f
 
 
 
 
 
 
 
 
 
 
 
 
 
45c7ceb
 
c6d1832
45c7ceb
 
e3ee58f
45c7ceb
 
 
 
 
e3ee58f
45c7ceb
e3ee58f
45c7ceb
e3ee58f
45c7ceb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e31929
 
 
 
 
 
 
 
45c7ceb
 
 
 
 
 
2e31929
45c7ceb
 
e3ee58f
 
 
45c7ceb
e3ee58f
 
 
45c7ceb
e3ee58f
 
2e31929
e3ee58f
c6d1832
e3ee58f
45c7ceb
 
 
e3ee58f
45c7ceb
e3ee58f
 
45c7ceb
e3ee58f
 
2e31929
 
e3ee58f
45c7ceb
e3ee58f
 
9b4c514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3ee58f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import openai
import pandas as pd
import numpy as np
import streamlit as st
import re
import string
import torch
import spacy
from spacy.cli import download
import importlib.util
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, OpenAI


import os
from dotenv import load_dotenv
# Runs at import time: pull variables from a .env file into the process environment.
load_dotenv()
# API key passed to the OpenAI client in create_openai_model(); os.getenv
# returns None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


#############################################
# Convert OpenAI Representation to CustomName
#############################################

###################################
# HELPER FUNCTIONS
###################################

## ---------- LOAD SPACY MODEL ---------

def load_spacy_model(model_name="en_core_web_sm"):
    """
    Load a spaCy pipeline, downloading its package first when it is missing.

    The returned pipeline is the one this module uses for sentence
    tokenization and as the source of its base stopword list.

    Args:
        model_name: Name of the spaCy model package to load.

    Returns:
        The object returned by ``spacy.load(model_name)``.
    """
    # A spaCy model ships as an importable package, so an import spec lookup
    # tells us whether it is installed without actually loading it.
    model_installed = importlib.util.find_spec(model_name) is not None
    if not model_installed:
        print(f"Model '{model_name}' not found. Downloading now...")
        download(model_name)

    return spacy.load(model_name)

## -------- SENTENCE TOKENIZER -------

def spacy_sent_tokenize(text):
    """
    Split ``text`` into sentences with the module-level spaCy pipeline ``nlp``.

    Args:
        text: The text to segment.

    Returns:
        A list with one string per detected sentence, each stripped of
        leading and trailing whitespace.
    """
    return [sentence.text.strip() for sentence in nlp(text).sents]

## -------- LOAD TRANSFORMER MODEL -------

def load_embedding_model(model_name):
    """
    Instantiate a SentenceTransformer on the best available torch device.

    Device preference is CUDA first, then MPS, then CPU. The chosen device is
    printed before the model is created.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        A ``SentenceTransformer`` created with ``device`` set to the chosen
        device string.
    """
    # Start from the CPU fallback and upgrade if an accelerator is available.
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"

    print(f"Using device: {device}")
    return SentenceTransformer(model_name, device=device)


## ------------ GENERATE EMBEDDINGS ------------

def encode_content_documents(embedding_model, content_documents, batch_size=20):
    """
    Encode documents in fixed-size batches and stack the results row-wise.

    A tqdm progress bar labelled "Encoding Batches" advances once per batch;
    the model's own progress bar is switched off.

    Args:
        embedding_model: Object exposing ``encode(docs, convert_to_numpy=...,
            show_progress_bar=...)``.
        content_documents: Sliceable sequence of documents to encode.
        batch_size: Number of documents passed to each ``encode`` call.

    Returns:
        The per-batch outputs combined with ``np.vstack``.
    """
    batch_starts = range(0, len(content_documents), batch_size)
    encoded_chunks = []

    with tqdm(total=len(batch_starts), desc="Encoding Batches") as progress:
        for start in batch_starts:
            chunk = content_documents[start:start + batch_size]
            encoded_chunks.append(
                embedding_model.encode(
                    chunk,
                    convert_to_numpy=True,
                    show_progress_bar=False,
                )
            )
            progress.update(1)

    return np.vstack(encoded_chunks)

## ------- BUILDING STOPWORDS LIST ------

# Module-wide spaCy pipeline (default model); loaded once at import time and
# reused by spacy_sent_tokenize().
nlp = load_spacy_model()
# Base stopword list: a mutable copy of the pipeline's built-in stop-word set.
stopwords = [*nlp.Defaults.stop_words]


# Domain-specific terms to filter out in addition to spaCy's stop words.
# A trailing comma follows every entry: without one, Python silently
# concatenates adjacent string literals ('thank' 'children' becomes
# 'thankchildren'), which drops both words from the list.
# NOTE(review): multi-word entries such as 'thank you' probably never match,
# since CountVectorizer appears to filter stop words token by token before
# building n-grams -- confirm against the scikit-learn docs.
custom_stopwords = [
    'thank you', 'thankyou', 'thanks', 'thank',
    'children', 'child', 'students',
    'twinkl',
    'funding',
]

# Append the domain-specific terms to the base list in place.
stopwords += custom_stopwords


## --------- INSTANTIATE OPENAI MODEL ---------

def create_openai_model():
            """
            Build the BERTopic ``OpenAI`` representation model used to label topics.

            Creates an ``openai.OpenAI`` client from the module-level
            ``OPENAI_API_KEY`` and wraps it in ``bertopic.representation.OpenAI``
            configured with the "gpt-4.1-nano" model, exponential backoff, chat
            mode, a per-topic user prompt and a fixed system prompt.

            Returns:
                The configured ``OpenAI`` representation model instance.
            """
            client = openai.OpenAI(api_key=OPENAI_API_KEY)
            # Per-topic user prompt. [DOCUMENTS] and [KEYWORDS] are placeholders,
            # presumably substituted by BERTopic for each topic -- confirm against
            # the BERTopic docs. The literal keeps its leading indentation, so any
            # re-indentation of this block changes the text sent to the API.
            prompt = """
            This topic contains the following documents:
            [DOCUMENTS]

            The topic is described by the following keywords: [KEYWORDS]

            Based on the information above, extract a short yet descriptive topic label.
            """
            
            # The system prompt sets the labelling context: grant applications
            # submitted for Twinkl giveaways, with labels aimed at the Community
            # Collections team.
            openai_model = OpenAI(
                    client,
                    model="gpt-4.1-nano",
                    exponential_backoff=True,
                    chat=True, prompt=prompt,
                    system_prompt= """ **Task**: As a topic modeling expert, your responsibility is to generate concise yet comprehensive topic labels from rows in a BertTopic `topic_info` dataframe. These topics have been derived from grant application forms submitted by schools, tutors, or other institutions participating in Twinkl giveaways.\n\n**Objective**: Your goal is to create labels for the extracted topics that accurately and clearly describe each topic within the specified context. These labels should be easily interpretable by the members of the Community Collections team.\n\n**Instructions**: \n\n1. **Understand the Context**: The topics relate to grant applications and are relevant to educational institutions participating in Twinkl giveaways.\n\n2. **Generate Labels**:\n- Create labels that are short yet capture the essence of each topic.\n- Ensure that the labels are contextually appropriate and provide clarity.\n- Focus on making the labels easily understandable for the Community Collections team. \n\n3. **Considerations**:\n- Each label should succinctly convey the main idea of the topic.\n- Avoid overly technical language unless necessary for precision.\n- Ensure the labels align with the overall educational and grant-related context."""
            )
            return openai_model

## ---------- AI LABELS TO TOPIC NAME ----------

def ai_labels_to_custom_name(model):
    """
    Use the "OpenAI" aspect of a fitted topic model as its custom topic labels.

    For every topic in ``model.topic_aspects_["OpenAI"]`` the first element
    of each entry is taken and the elements are joined with " | ". Topic -1 is
    always labelled "Outlier Topic". The resulting mapping is passed to
    ``model.set_topic_labels``.

    Args:
        model: Object exposing ``topic_aspects_`` (with an "OpenAI" key) and
            ``set_topic_labels``.

    Returns:
        None. The model is updated through ``set_topic_labels``.
    """
    custom_labels = {}
    for topic_id, aspect_entries in model.topic_aspects_["OpenAI"].items():
        # Transpose the entries; column 0 holds the label strings.
        label_parts = list(zip(*aspect_entries))[0]
        custom_labels[topic_id] = " | ".join(label_parts)

    # Overrides (or adds) the label for the outlier topic id.
    custom_labels[-1] = "Outlier Topic"
    model.set_topic_labels(custom_labels)


#############################
# BERTOPIC MODELING
#############################

def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):
    """
    Fit a BERTopic model on ``docs`` using precomputed ``embeddings``.

    Two topic representations are configured: "Main" (KeyBERTInspired) and
    "OpenAI" (from ``create_openai_model()``). The vectorizer is a
    CountVectorizer over unigrams and bigrams using the module-level
    ``stopwords`` list, and ``nr_topics`` is set to 'auto'.

    Args:
        docs: Documents passed to ``fit_transform``.
        embeddings: Precomputed embeddings passed to ``fit_transform``
            alongside ``docs``.
        _embedding_model: Forwarded to BERTopic as ``embedding_model``.
        _umap_model: Forwarded to BERTopic as ``umap_model``.
        _hdbscan_model: Forwarded to BERTopic as ``hdbscan_model``.

    Returns:
        A ``(topic_model, topics, probs)`` tuple: the fitted BERTopic instance
        followed by the two values returned by ``fit_transform``.
    """
    representation_model = {
        "Main": KeyBERTInspired(),
        "OpenAI": create_openai_model(),
    }

    topic_model = BERTopic(
        embedding_model=_embedding_model,
        umap_model=_umap_model,
        hdbscan_model=_hdbscan_model,
        vectorizer_model=CountVectorizer(stop_words=stopwords, ngram_range=(1, 2)),
        representation_model=representation_model,
        nr_topics='auto',
        verbose=True,
    )

    topics, probs = topic_model.fit_transform(docs, embeddings)
    return topic_model, topics, probs




##################################
# TOPIC TO DATAFRAME MAPPING
#################################


def attach_topics(
    df, mappings, sentence_topics, label_map, col="topics", drop_outlier=True
):
    """
    Return ``df`` with an extra column listing the topic labels of each row.

    ``mappings`` and ``sentence_topics`` are parallel sequences: entry *k*
    says that topic ``sentence_topics[k]`` was assigned to the ``df`` row
    whose index label is ``mappings[k]``.

    Args:
        df: DataFrame to annotate; the result comes from ``df.assign``.
        mappings: Row label (matched against ``df.index``) for each assignment.
        sentence_topics: Topic id for each assignment.
        label_map: Dict from topic id to label; ids missing from it fall back
            to ``str(id)``.
        col: Name of the column to create.
        drop_outlier: When true, assignments with topic id -1 are discarded.

    Returns:
        A DataFrame in which ``col`` holds, per row, the sorted list of
        distinct labels, or an empty list when the row has no assignments.
    """
    assignments = pd.DataFrame({"row": mappings, "topic": sentence_topics})
    if drop_outlier:
        assignments = assignments.query("topic != -1")

    def _labels_for(topic_ids):
        # The set removes duplicate labels; sorting makes the order deterministic.
        return sorted(
            {label_map.get(topic_id, str(topic_id)) for topic_id in topic_ids}
        )

    labels_by_row = assignments.groupby("row")["topic"].agg(_labels_for)

    # Rows without any assignment come back from reindex as NaN; turn those
    # into empty lists so every cell in the new column is a list.
    aligned = labels_by_row.reindex(df.index)
    column_values = aligned.apply(
        lambda cell: cell if isinstance(cell, list) else []
    )
    return df.assign(**{col: column_values})