lynn-twinkl committed
Commit e3ee58f · Parent(s): ddeb431
Testing bertopic
notebooks/app_pipeline.ipynb ADDED
@@ -0,0 +1,321 @@
# 1. Configuration

In [5]:
import pandas as pd
import altair as alt
import joblib
from io import BytesIO
import os
import sys

# Add project root (one level up from notebooks/) to sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

# ---- FUNCTIONS ----

from src.extract_usage import extract_usage
from src.necessity_index import compute_necessity, index_scaler, qcut_labels
from src.column_detection import detect_freeform_col
from src.shortlist import shortlist_applications
from src.twinkl_originals import find_book_candidates
from src.preprocess_text import normalise_text
from typing import Tuple

In [18]:
def load_heartfelt_predictor():
    # Compute absolute path from notebook location
    project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
    model_path = os.path.join(project_root, "src", "models", "heartfelt_pipeline.joblib")
    return joblib.load(model_path)
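load_heartfelt_predictor() simply deserialises whatever estimator was saved to src/models/heartfelt_pipeline.joblib. A minimal smoke test, assuming the artifact is an sklearn-style text pipeline whose predict() accepts raw strings (which is how the cells below use it):

    # Hypothetical check, not part of the notebook
    clf = load_heartfelt_predictor()
    print(type(clf))  # expected: an sklearn Pipeline or similar estimator
    print(clf.predict(["we really need new books for our nurture group"]))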
In [21]:
def load_and_process(raw_csv) -> Tuple[pd.DataFrame, str]:
    """
    Load CSV from raw bytes, detect freeform column, compute necessity scores,
    and extract usage items. Returns processed DataFrame and freeform column name.
    """
    # Read uploaded data
    df_orig = pd.read_csv(raw_csv)

    # Detect freeform column
    freeform_col = detect_freeform_col(df_orig)

    df_orig = df_orig[df_orig[freeform_col].notna()]

    # Word count
    df_orig['word_count'] = df_orig[freeform_col].fillna('').str.split().str.len()

    # Compute necessity scores
    scored = df_orig.join(df_orig[freeform_col].apply(compute_necessity))
    scored['necessity_index'] = index_scaler(scored['necessity_index'].values)
    scored['priority'] = qcut_labels(scored['necessity_index'])

    # Find Twinkl Originals candidates
    scored['book_candidates'] = find_book_candidates(scored, freeform_col)

    # Label heartfelt applications
    scored['clean_text'] = scored[freeform_col].map(normalise_text)
    model = load_heartfelt_predictor()
    scored['is_heartfelt'] = model.predict(scored['clean_text'].astype(str))

    # Usage extraction
    # docs = df_orig[freeform_col].to_list()  <---- disabled AI-powered extraction for testing
    # scored['Usage'] = extract_usage(docs)

    return scored, freeform_col
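Because pd.read_csv() accepts either a path or a file-like object, load_and_process() works both with the raw uploaded bytes mentioned in its docstring and with the plain CSV path used in the next cell. A sketch of the bytes route (the byte string here is just a stand-in for, say, a Streamlit upload; BytesIO is already imported above):

    # Hypothetical: pass raw CSV bytes instead of a path
    raw_bytes = open('data/feb-march-data.csv', 'rb').read()
    scored_df, text_col = load_and_process(BytesIO(raw_bytes))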
In [22]:
df, freeform_col = load_and_process('data/feb-march-data.csv')

(stderr)
/Users/lynn/Documents/Twinkl/grant-applications-app/src/twinkl_originals.py:15: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  is_primary = series.str.contains(pattern_level, case=False, na=False)
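The UserWarning comes from pandas: the pattern passed to Series.str.contains() in src/twinkl_originals.py contains a capturing group, which str.contains() matches but never returns. Matching is unaffected; switching to a non-capturing group silences it. An illustration with a made-up pattern (the real pattern_level lives in twinkl_originals.py):

    import pandas as pd
    s = pd.Series(["Year 2 class", "sixth form"])
    s.str.contains(r"(year|reception)", case=False, na=False)    # warns: pattern has a match group
    s.str.contains(r"(?:year|reception)", case=False, na=False)  # same result, no warning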
In [23]:
df.head(3)

Out[23]:
         Id Date/Time Requested              Giveaway Title     Customer Name  \
0  304399.0    01/03/2025 00:52  March Community Collection    Susan Bushnell
1  305004.0    02/03/2025 19:52  March Community Collection  Sarah Arabestani
2  305493.0    05/03/2025 14:34  March Community Collection     Rebecca Asker

                    Email Address               School Name  \
0   susan.bushnell@googlemail.com  Southfield Junior School
1  sarah.a@sandringhamnursery.com       Sandringham Nursery
2             mrsrasker@gmail.com     Newhaven PRU Outreach

        Postal Address Address Line 2 Address City  Postcode  ... Unnamed: 11  \
0      Shrivenham Road      Highworth      Swindon   SN6 7BZ  ...
1  16 Sandringham Road        Penylan      Cardiff  CF23 5BJ  ...
2     Newhaven Gardens            NaN    Greenwich    SE96HR  ...

   word_count  necessity_index  urgency_score  severity_score  \
0          69          0.25000            0.0             0.0
1          46          0.06250            0.0             0.0
2          86          0.09375            0.0             0.0

   vulnerability_score priority book_candidates  \
0                  0.0   medium           False
1                  0.0      low           False
2                  1.0      low           False

                                          clean_text  is_heartfelt
0  i would love to use it to spread the love of r...          True
1  we would like to introduce early years yoga an...         False
2  £500 would enable us to set up a small sensor...           True

[3 rows x 21 columns]
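altair is imported in the configuration cell but not used yet. One plausible next cell, sketched here rather than taken from the notebook, is a quick look at how the scored applications distribute across the priority bands:

    # Hedged sketch: chart the priority distribution of the scored frame
    alt.Chart(df).mark_bar().encode(
        x='priority:N',
        y='count()',
    )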
In [ ]:
# 2. 

Notebook metadata: kernel "Python 3 (ipykernel)", Python 3.12.10, nbformat 4.
src/models/topicModeling_contentRequests.py ADDED
@@ -0,0 +1,305 @@
import streamlit as st
import re
import string
import torch
import spacy

from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
import contractions
from tqdm import tqdm


from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
import openai
import numpy as np

import os
from dotenv import load_dotenv
load_dotenv(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../", ".env")))

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

#################################
# OpenAI Topic Representation
#################################
def create_openai_model():
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    prompt = """
    I have a topic that contains the following documents:
    [DOCUMENTS]

    The topic is described by the following keywords: [KEYWORDS]

    Based on the information above, extract a short yet descriptive topic label of at most 4 words. The labels should be interpretable enough for stakeholders who don't have access to the raw data. Make sure it is in the following format:

    topic: <topic label>
    """
    openai_model = OpenAI(client, model="gpt-4o-mini", exponential_backoff=True, chat=True, prompt=prompt)
    return openai_model

#############################################
# Convert OpenAI Representation to CustomName
#############################################

def ai_labeles_to_custom_name(model):
    chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in model.topic_aspects_["OpenAI"].items()}
    chatgpt_topic_labels[-1] = "Outlier Topic"
    model.set_topic_labels(chatgpt_topic_labels)
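Note that ai_labeles_to_custom_name() reads model.topic_aspects_["OpenAI"], which only exists if an aspect literally named "OpenAI" was registered when the model was built; bertopic_model() further down does not currently do that. A sketch of how create_openai_model() and this helper would fit together, assuming docs is a list of documents:

    # Sketch only; not how bertopic_model() below is currently configured
    representation_model = {
        "Main": KeyBERTInspired(),
        "OpenAI": create_openai_model(),   # aspect key must be "OpenAI" for the helper above
    }
    topic_model = BERTopic(representation_model=representation_model, verbose=True)
    topics, probs = topic_model.fit_transform(docs)
    ai_labeles_to_custom_name(topic_model)  # custom topic labels now come from the OpenAI aspect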
"""
-----------------------------------
Lemmatization & Stopword Removal
-----------------------------------
"""
def topicModeling_preprocessing(df, spacy_model="en_core_web_lg"):

    base_stopwords = set(stopwords.words('english'))

    custom_stopwords = {
        'material', 'materials', 'resources', 'resource', 'activity',
        'activities', 'sheet', 'sheets', 'worksheet', 'worksheets',
        'teacher', 'teachers', 'teach', 'high school', 'highschool',
        'middle school', 'grade', 'grades', 'hs', 'level', 'age', 'ages',
        'older', 'older kid', 'kid', 'student', "1st", "2nd", "3rd", "4th", '5th', '6th',
        '7th', '8th', '9th'
    }

    stopword_set = base_stopwords.union(custom_stopwords)

    stopword_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in stopword_set) + r')\b'

    nlp = spacy.load(spacy_model)

    def clean_lemmatize_text(text):
        if not isinstance(text, str):
            return None

        text = contractions.fix(text)
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(stopword_pattern, '', text)

        doc = nlp(text)
        tokens = [token.lemma_ for token in doc]

        clean_text = " ".join(tokens).strip()
        clean_text = re.sub(r'\s+', ' ', clean_text)

        return clean_text if clean_text else None

    df['processedForModeling'] = df['preprocessedBasic'].apply(clean_lemmatize_text)

    # Drop rows where cleaned text is empty or None
    df = df.dropna(subset=['processedForModeling'])

    return df
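topicModeling_preprocessing() expects a 'preprocessedBasic' text column and returns the frame with a new 'processedForModeling' column, dropping rows whose cleaned text ends up empty. A small hypothetical usage example (the column values are invented):

    import pandas as pd
    requests_df = pd.DataFrame({"preprocessedBasic": [
        "We'd love more phonics resources for our year one class",
        None,
    ]})
    requests_df = topicModeling_preprocessing(requests_df, spacy_model="en_core_web_lg")
    print(requests_df['processedForModeling'].tolist())  # the None row is dropped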
"""
--------------------------
Load Transformer Model
--------------------------
"""

def load_embedding_model(model_name):
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    print(f"Using device: {device}")
    return SentenceTransformer(model_name, device=device)


"""
-------------------------
Batch Embedding Creation
-------------------------
"""

def encode_content_documents(embedding_model, content_documents, batch_size=20):
    embeddings_batches = []
    total_batches = range(0, len(content_documents), batch_size)

    with tqdm(total=len(total_batches), desc="Encoding Batches") as pbar:
        for i in total_batches:
            batch_docs = content_documents[i:i + batch_size]
            batch_embeddings = embedding_model.encode(batch_docs, convert_to_numpy=True, show_progress_bar=False)
            embeddings_batches.append(batch_embeddings)
            pbar.update(1)

    return np.vstack(embeddings_batches)
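These two helpers are meant to be chained: pick a device-aware SentenceTransformer, then embed the corpus once so BERTopic does not have to re-embed it during fit. A sketch, where the model name is an assumption (the script never pins one) and requests_df comes from the preprocessing step above:

    embedding_model = load_embedding_model("all-MiniLM-L6-v2")   # assumed model choice
    content_documents = requests_df['processedForModeling'].tolist()
    embeddings = encode_content_documents(embedding_model, content_documents, batch_size=20)
    print(embeddings.shape)  # (n_documents, embedding_dim)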
"""
-----------------------------
Topic Modeling with BERTopic
-----------------------------
"""

try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

# Named so it does not shadow the nltk.corpus.stopwords module, which
# topicModeling_preprocessing() still calls at runtime.
vectorizer_stopwords = list(stopwords.words('english')) + [
    'activities',
    'activity',
    'class',
    'classroom',
    'material',
    'materials',
    'membership',
    'memberships',
    'pupil',
    'pupils',
    'resource',
    'resources',
    'sheet',
    'sheets',
    'student',
    'students',
    'subscription',
    'subscriptions',
    'subscribe',
    'subscribed',
    'recommend',
    'recommendation',
    'teach',
    'teacher',
    'teachers',
    'tutor',
    'tutors',
    'twinkl',
    'twinkls',
    'twinkle',
    'worksheet',
    'worksheets',
]

######### --------------- BERTOPIC ----------------- #############
def bertopic_model(docs, embeddings, _embedding_model, _umap_model, _hdbscan_model):

    main_representation_model = KeyBERTInspired()
    aspect_representation_model1 = MaximalMarginalRelevance(diversity=.3)

    # OpenAI representation model (built here but not currently included in
    # representation_model below)
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    prompt = """
    I have a topic that contains the following documents:
    [DOCUMENTS]

    The topic is described by the following keywords: [KEYWORDS]

    Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:

    topic: <topic label>
    """
    openai_model = OpenAI(client, model="gpt-4o-mini", exponential_backoff=True, chat=True, prompt=prompt)

    representation_model = {
        "Main": main_representation_model,
        "Secondary Representation": aspect_representation_model1,
    }

    vectorizer_model = CountVectorizer(min_df=2, max_df=0.60, stop_words=vectorizer_stopwords)

    seed_topic_list = [
        ["autism", "special needs", "special education needs", "special education", "adhd", "autistic", "dyslexia", "dyslexic", "sen"],
    ]

    topic_model = BERTopic(
        verbose=True,
        embedding_model=_embedding_model,
        umap_model=_umap_model,
        hdbscan_model=_hdbscan_model,
        vectorizer_model=vectorizer_model,
        #seed_topic_list=seed_topic_list,
        representation_model=representation_model,
    )

    topics, probs = topic_model.fit_transform(docs, embeddings)
    return topic_model, topics, probs
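bertopic_model() leaves the dimensionality-reduction and clustering models to the caller. A sketch of a typical call; the UMAP/HDBSCAN settings are illustrative defaults, not values taken from this repository:

    from umap import UMAP
    from hdbscan import HDBSCAN

    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=15, metric="euclidean", prediction_data=True)

    topic_model, topics, probs = bertopic_model(
        content_documents, embeddings, embedding_model, umap_model, hdbscan_model
    )
    print(topic_model.get_topic_info().head())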
##################################
# TOPIC MERGING
##################################

def merge_specific_topics(topic_model, sentences,
                          cancellation_keywords=["cancel", "cancellation", "cancelled", "canceled"],
                          thanks_keywords=["thank", "thanks", "thank you", "thankyou", "ty", "thx"],
                          expensive_keywords=["can't afford", "price", "expensive", "cost"]):

    topic_info = topic_model.get_topic_info()

    # Identify cancellation-related topics by checking if any cancellation keyword appears in the topic name.
    cancellation_regex = '|'.join(cancellation_keywords)
    cancellation_topics = topic_info[
        topic_info['Name'].str.contains(cancellation_regex, case=False, na=False)
    ]['Topic'].tolist()

    # Identify thank-you-related topics similarly.
    thanks_regex = '|'.join(thanks_keywords)
    thanks_topics = topic_info[
        topic_info['Name'].str.contains(thanks_regex, case=False, na=False)
    ]['Topic'].tolist()

    # Identify expensive-related topics.
    expensive_regex = '|'.join(expensive_keywords)
    expensive_topics = topic_info[
        topic_info['Name'].str.contains(expensive_regex, case=False, na=False)
    ]['Topic'].tolist()

    # Exclude the outlier topic (-1) if it appears.
    cancellation_topics = [t for t in cancellation_topics if t != -1]
    thanks_topics = [t for t in thanks_topics if t != -1]
    expensive_topics = [t for t in expensive_topics if t != -1]

    # Create a list of topic groups to merge
    topics_to_merge = []

    if len(cancellation_topics) > 1:
        print(f"Merging cancellation topics: {cancellation_topics}")
        topics_to_merge.append(cancellation_topics)

    if len(thanks_topics) > 1:
        print(f"Merging thank-you topics: {thanks_topics}")
        topics_to_merge.append(thanks_topics)

    if len(expensive_topics) > 1:
        print(f"Merging expensive topics: {expensive_topics}")
        topics_to_merge.append(expensive_topics)

    # Call merge_topics
    if topics_to_merge:
        topic_model.merge_topics(sentences, topics_to_merge)

    return topic_model
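merge_specific_topics() folds each group of housekeeping topics (cancellations, thank-yous, pricing complaints) into a single topic and updates the model in place, so per-document assignments should be re-read afterwards. A sketch continuing the example above:

    topic_model = merge_specific_topics(topic_model, content_documents)
    sentence_topics = topic_model.topics_   # per-document topic ids after merging
    topic_label_map = dict(zip(topic_model.get_topic_info()['Topic'],
                               topic_model.get_topic_info()['Name']))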
##################################
# Topic to Dataframe Mapping
##################################

def update_df_with_topics(df, mapping, sentence_topics, topic_label_map):
    topics_by_row = {}
    for i, row_idx in enumerate(mapping):
        topic = sentence_topics[i]
        topics_by_row.setdefault(row_idx, set()).add(topic)

    updated_df = df.copy()

    def map_topics(row_idx):
        topic_ids = topics_by_row.get(row_idx, set())
        topic_names = [topic_label_map.get(t, str(t)) for t in topic_ids if t != -1]
        return ", ".join(sorted(topic_names))

    updated_df['Topics'] = updated_df.index.map(map_topics)
    return updated_df
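update_df_with_topics() assumes mapping[i] is the DataFrame row index that modelled document i came from, so several documents (e.g. sentences) can point back to the same row and each row collects the union of their topic labels. With one document per row, as in the sketches above, the mapping is just the row index order:

    # Hedged sketch of the final step
    mapping = list(requests_df.index)
    labelled_df = update_df_with_topics(requests_df, mapping, sentence_topics, topic_label_map)
    print(labelled_df['Topics'].head())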