romybeaute commited on
Commit
735caac
·
1 Parent(s): 9209170

Modified core functions to correspond fully to the app

Browse files
mosaic_core/analysis.py DELETED
@@ -1,248 +0,0 @@
1
- #Move these functions: clean_text, get_embeddings, run_bertopic, run_umap, extract_topics from the app.py
2
-
3
- """
4
- File: mosaic_core/analysis.py
5
- Description: Core logic extracted from MOSAIC app.
6
- Pure Python implementation (no Streamlit dependencies).
7
- """
8
-
9
- import pandas as pd
10
- import numpy as np
11
- import nltk
12
- import json
13
- import re
14
- import os
15
- from pathlib import Path
16
- from typing import List, Tuple, Dict, Any, Optional
17
-
18
- # NLP / ML Imports
19
- from sentence_transformers import SentenceTransformer
20
- from bertopic import BERTopic
21
- from sklearn.feature_extraction.text import CountVectorizer
22
- from umap import UMAP
23
- from hdbscan import HDBSCAN
24
- from huggingface_hub import InferenceClient
25
-
26
- # =====================================================================
27
- # Constants (Copied from app.py)
28
- # =====================================================================
29
-
30
- SYSTEM_PROMPT = """You are an expert phenomenologist analysing first-person experiential reports or microphenomenological interviews.
31
- Your task is to assign a concise label to a cluster of similar reports by identifying the
32
- shared lived experiential structure or process they describe.
33
- The label must:
34
- 1. Describe what changes in experience itself.
35
- 2. Capture the underlying experiential process.
36
- 3. Be concise and noun-phrase-like.
37
- Constraints: Output ONLY the label (no explanation). 3–8 words.
38
- """
39
-
40
- USER_TEMPLATE = """Here is a cluster of participant reports describing a specific phenomenon:
41
-
42
- {documents}
43
-
44
- Top keywords associated with this cluster:
45
- {keywords}
46
-
47
- Task: Return a single scientifically precise label (3–7 words). Output ONLY the label.
48
- """
49
-
50
- # =====================================================================
51
- # 1. Preprocessing & Embedding Logic
52
- # =====================================================================
53
-
54
- def load_embedding_model(model_name: str):
55
- print(f"Loading embedding model '{model_name}'...")
56
- return SentenceTransformer(model_name)
57
-
58
- def _pick_text_column(df: pd.DataFrame) -> Optional[str]:
59
- """Helper to find the text column."""
60
- ACCEPTABLE_TEXT_COLUMNS = [
61
- "reflection_answer_english", "reflection_answer", "text", "report",
62
- ]
63
- for col in ACCEPTABLE_TEXT_COLUMNS:
64
- if col in df.columns:
65
- return col
66
- return None
67
-
68
- def preprocess_and_embed(
69
- csv_path: str,
70
- model_name: str = "BAAI/bge-small-en-v1.5",
71
- text_col: Optional[str] = None,
72
- split_sentences: bool = True,
73
- min_words: int = 3,
74
- device: str = "cpu"
75
- ) -> Tuple[List[str], np.ndarray]:
76
- """
77
- Equivalent to 'generate_and_save_embeddings' but returns data instead of saving to disk.
78
- """
79
- # 1. Load CSV
80
- df = pd.read_csv(csv_path)
81
-
82
- # 2. Pick Column
83
- if text_col is None:
84
- text_col = _pick_text_column(df)
85
-
86
- if text_col is None or text_col not in df.columns:
87
- raise ValueError(f"Could not find a valid text column in {csv_path}")
88
-
89
- # 3. Clean NaN/Empty
90
- df.dropna(subset=[text_col], inplace=True)
91
- df[text_col] = df[text_col].astype(str)
92
- reports = [r for r in df[text_col] if r.strip()]
93
-
94
- # 4. Tokenize / Split
95
- docs = []
96
- try:
97
- nltk.data.find("tokenizers/punkt")
98
- except LookupError:
99
- nltk.download("punkt")
100
-
101
- if split_sentences:
102
- for r in reports:
103
- # Simple wrapper to avoid crashes
104
- sents = nltk.sent_tokenize(r)
105
- docs.extend(sents)
106
- else:
107
- docs = reports
108
-
109
- # 5. Filter min_words
110
- if min_words > 0:
111
- docs = [d for d in docs if len(d.split()) >= min_words]
112
-
113
- print(f"Preprocessing complete. {len(docs)} documents prepared.")
114
-
115
- # 6. Embed
116
- model = load_embedding_model(model_name)
117
-
118
- encode_device = "cpu"
119
- if device.lower() == "gpu":
120
- import torch
121
- if torch.cuda.is_available():
122
- encode_device = "cuda"
123
- elif torch.backends.mps.is_available():
124
- encode_device = "mps"
125
-
126
- print(f"Encoding on {encode_device}...")
127
- embeddings = model.encode(
128
- docs,
129
- show_progress_bar=True,
130
- batch_size=32,
131
- device=encode_device,
132
- convert_to_numpy=True
133
- )
134
-
135
- return docs, np.asarray(embeddings, dtype=np.float32)
136
-
137
- # =====================================================================
138
- # 2. Topic Modeling Logic
139
- # =====================================================================
140
-
141
- def run_topic_model(
142
- docs: List[str],
143
- embeddings: np.ndarray,
144
- config: Dict[str, Any]
145
- ):
146
- """
147
- Equivalent to 'perform_topic_modeling'.
148
- Config expects keys: umap_params, hdbscan_params, vectorizer_params, bt_params
149
- """
150
- # Unpack config (with defaults matching your app)
151
- umap_params = config.get("umap_params", {"n_neighbors": 15, "n_components": 5, "min_dist": 0.0})
152
- hdbscan_params = config.get("hdbscan_params", {"min_cluster_size": 10, "min_samples": 5})
153
- vec_params = config.get("vectorizer_params", {})
154
- bt_params = config.get("bt_params", {"nr_topics": "auto", "top_n_words": 10})
155
-
156
- # Handle ngram_range tuple conversion
157
- if "ngram_range" in vec_params and isinstance(vec_params["ngram_range"], list):
158
- vec_params["ngram_range"] = tuple(vec_params["ngram_range"])
159
-
160
- # Instantiate models
161
- umap_model = UMAP(random_state=42, metric="cosine", **umap_params)
162
- hdbscan_model = HDBSCAN(metric="euclidean", prediction_data=True, **hdbscan_params)
163
-
164
- vectorizer_model = None
165
- if config.get("use_vectorizer", True):
166
- vectorizer_model = CountVectorizer(**vec_params)
167
-
168
- nr_topics = bt_params.get("nr_topics", "auto")
169
- if nr_topics != "auto":
170
- nr_topics = int(nr_topics)
171
-
172
- # Run BERTopic
173
- topic_model = BERTopic(
174
- umap_model=umap_model,
175
- hdbscan_model=hdbscan_model,
176
- vectorizer_model=vectorizer_model,
177
- top_n_words=bt_params.get("top_n_words", 10),
178
- nr_topics=nr_topics,
179
- verbose=True
180
- )
181
-
182
- topics, probs = topic_model.fit_transform(docs, embeddings)
183
-
184
- # Calculate UMAP reduction for visualization (2D)
185
- reduced_2d = UMAP(
186
- n_neighbors=15, n_components=2, min_dist=0.0, metric="cosine", random_state=42
187
- ).fit_transform(embeddings)
188
-
189
- return topic_model, reduced_2d, topics
190
-
191
- # =====================================================================
192
- # 3. LLM Logic
193
- # =====================================================================
194
-
195
- def _clean_label(x: str) -> str:
196
- """Helper to clean LLM output"""
197
- x = (x or "").strip()
198
- x = x.splitlines()[0].strip()
199
- x = x.strip(' "\'`')
200
- return x or "Unlabelled"
201
-
202
- def generate_llm_labels(
203
- topic_model: BERTopic,
204
- hf_token: str,
205
- model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct",
206
- max_topics: int = 40,
207
- max_docs_per_topic: int = 8
208
- ) -> Dict[int, str]:
209
- """
210
- Equivalent to 'generate_labels_via_chat_completion' but without Streamlit progress bars.
211
- """
212
- client = InferenceClient(model=model_id, token=hf_token)
213
-
214
- topic_info = topic_model.get_topic_info()
215
- topic_info = topic_info[topic_info.Topic != -1].head(max_topics)
216
-
217
- labels = {}
218
- print(f"Generating labels for {len(topic_info)} topics...")
219
-
220
- for tid in topic_info.Topic.tolist():
221
- # Get keywords
222
- words = topic_model.get_topic(tid) or []
223
- keywords = ", ".join([w for (w, _) in words[:10]])
224
-
225
- # Get docs
226
- reps = (topic_model.get_representative_docs(tid) or [])[:max_docs_per_topic]
227
- docs_block = "\n".join([f"- {r}" for r in reps]) if reps else "(No docs)"
228
-
229
- # Prompt
230
- user_prompt = USER_TEMPLATE.format(documents=docs_block, keywords=keywords)
231
-
232
- try:
233
- out = client.chat_completion(
234
- model=model_id,
235
- messages=[
236
- {"role": "system", "content": SYSTEM_PROMPT},
237
- {"role": "user", "content": user_prompt},
238
- ],
239
- max_tokens=24,
240
- temperature=0.2
241
- )
242
- raw = out.choices[0].message.content
243
- labels[int(tid)] = _clean_label(raw)
244
- except Exception as e:
245
- print(f"Error on topic {tid}: {e}")
246
- labels[int(tid)] = f"Topic {tid} (Error)"
247
-
248
- return labels
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mosaic_core/core_functions.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core analysis functions for MOSAIC topic modeling.
3
+
4
+ This module provides preprocessing, embedding, topic modeling, and LLM labeling
5
+ for phenomenological text analysis. No Streamlit dependencies.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ import logging
11
+ import re
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+ import nltk
18
+ from sentence_transformers import SentenceTransformer
19
+ from bertopic import BERTopic
20
+ from sklearn.feature_extraction.text import CountVectorizer
21
+ from umap import UMAP
22
+ from hdbscan import HDBSCAN
23
+ from huggingface_hub import InferenceClient
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ # --- Text column detection ---
29
+
30
# Candidate text-column names, checked in priority order.
TEXT_COLUMN_CANDIDATES = [
    "reflection_answer_english",
    "reflection_answer",
    "text",
    "report",
]


def pick_text_column(df):
    """Return the first candidate column present in *df*, or None if none match."""
    matches = (name for name in TEXT_COLUMN_CANDIDATES if name in df.columns)
    return next(matches, None)
44
+
45
+
46
def list_text_columns(df):
    """Return every column name of *df* as a plain list."""
    return [column for column in df.columns]
49
+
50
+
51
+ # --- String utilities ---
52
+
53
def slugify(s):
    """Collapse characters unsafe for filenames into underscores; never return empty."""
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", s.strip())
    return cleaned if cleaned else "DATASET"
58
+
59
+
60
def clean_label(raw):
    """
    Normalise an LLM-generated topic label.

    Keeps only the first line of the output, strips surrounding quotes and
    trailing punctuation, removes remaining non-word characters, then drops
    generic wrappers such as a leading "Experience of" or a trailing
    "... experience". Returns "Unlabelled" when nothing remains.
    """
    label = (raw or "").strip()
    first_lines = label.splitlines()
    label = first_lines[0].strip() if first_lines else ""
    label = label.strip(' "\'`')

    # Trailing punctuation first, then any other stray symbol characters.
    label = re.sub(r"[.:\-–—]+$", "", label).strip()
    label = re.sub(r"[^\w\s]", "", label).strip()

    # Strip generic leading/trailing wrapper phrases, in the original order.
    for pattern in (
        r"^(Experiential(?:\s+Phenomenon)?|Experience of|Subjective Experience of|Phenomenon of)\s+",
        r"\s+(experience|experiences|phenomenon|state|states)$",
    ):
        label = re.sub(pattern, "", label, flags=re.IGNORECASE)

    return label.strip() or "Unlabelled"
87
+
88
+
89
+ # --- Config and caching utilities ---
90
+
91
def get_config_hash(cfg):
    """Return a short, stable hash of a JSON-serialisable config dict."""
    canonical = json.dumps(cfg, sort_keys=True).encode()
    return hashlib.md5(canonical).hexdigest()[:12]


def make_run_id(cfg):
    """Return a unique run identifier of the form ``<timestamp>_<config-hash>``."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{stamp}_{get_config_hash(cfg)}"
101
+
102
+
103
def cleanup_old_cache(cache_dir, current_slug):
    """
    Remove stale precomputed-embedding files belonging to other datasets.

    Deletes every ``precomputed_*.npy`` in *cache_dir* whose filename does not
    contain *current_slug*. Returns the number of files actually removed;
    individual delete failures are logged and skipped.
    """
    directory = Path(cache_dir)
    if not directory.exists():
        return 0

    stale = [f for f in directory.glob("precomputed_*.npy")
             if current_slug not in f.name]

    removed = 0
    for f in stale:
        try:
            f.unlink()
        except Exception as e:
            logger.warning(f"Could not delete {f}: {e}")
        else:
            removed += 1

    if removed:
        logger.info(f"Cleaned up {removed} old cache files")
    return removed
121
+
122
+
123
+ # --- NLTK setup ---
124
+
125
def ensure_nltk_data(data_dir=None):
    """
    Make sure the NLTK sentence-tokenizer data is available.

    Checks for ``punkt_tab`` (required by NLTK >= 3.8.2) or the legacy
    ``punkt`` resource, and downloads whatever is missing. *data_dir* is an
    optional extra NLTK search/download directory. Download failures are
    logged, never raised.
    """
    if data_dir and data_dir not in nltk.data.path:
        nltk.data.path.append(data_dir)

    for resource in ("punkt_tab", "punkt"):
        try:
            nltk.data.find(f"tokenizers/{resource}")
            return
        except LookupError:
            pass

    # Bug fix: the original only downloaded "punkt", so on newer NLTK
    # versions (which need "punkt_tab") sent_tokenize still failed after
    # this function "ensured" the data. Fetch both variants.
    for resource in ("punkt_tab", "punkt"):
        try:
            nltk.download(resource, download_dir=data_dir, quiet=True)
        except Exception as e:
            logger.warning(f"Could not download NLTK {resource}: {e}")
141
+
142
+
143
+ # --- Embedding ---
144
+
145
def load_embedding_model(model_name):
    """Load and return the sentence-transformers encoder named *model_name*."""
    logger.info(f"Loading embedding model: {model_name}")
    model = SentenceTransformer(model_name)
    return model
149
+
150
+
151
def resolve_device(requested):
    """
    Map a requested device string to an actual device and batch size.

    Returns ``(device, batch_size)`` where device is 'cpu', 'cuda', or 'mps'.
    Falls back to CPU (with a warning) when no accelerator is available.
    """
    if requested.lower() == "cpu":
        return "cpu", 64

    # torch is only needed on the accelerator path.
    import torch

    for name, available in (("cuda", torch.cuda.is_available),
                            ("mps", torch.backends.mps.is_available)):
        if available():
            return name, 32

    logger.warning("GPU requested but unavailable, using CPU")
    return "cpu", 64
168
+
169
+
170
+ # --- Preprocessing ---
171
+
172
def preprocess_texts(texts, split_sentences=True, min_words=3):
    """
    Clean raw texts and optionally split them into sentences.

    Returns ``(docs, removed, stats)``: the kept units, the units dropped for
    being shorter than *min_words*, and a stats dict with keys
    ``total_before``, ``total_after`` and ``removed_count``.
    """
    ensure_nltk_data()

    if split_sentences:
        units = [sentence
                 for text in texts
                 for sentence in nltk.sent_tokenize(str(text))]
    else:
        units = [str(text) for text in texts]

    docs, removed = [], []
    for unit in units:
        # min_words <= 0 disables the length filter entirely.
        if min_words > 0 and len(unit.split()) < min_words:
            removed.append(unit)
        else:
            docs.append(unit)

    stats = {
        "total_before": len(units),
        "total_after": len(docs),
        "removed_count": len(removed),
    }
    return docs, removed, stats
203
+
204
+
205
def load_csv_texts(csv_path, text_col=None):
    """
    Read *csv_path* and return its non-empty text entries as strings.

    When *text_col* is None the column is auto-detected with
    ``pick_text_column``. Raises ValueError if no usable column exists.
    """
    frame = pd.read_csv(csv_path)

    column = text_col if text_col is not None else pick_text_column(frame)
    if column is None or column not in frame.columns:
        raise ValueError(f"No valid text column found in {csv_path}")

    frame = frame.dropna(subset=[column])
    values = frame[column].astype(str)
    return [value for value in values if value.strip()]
223
+
224
+
225
def count_clean_reports(csv_path, text_col=None):
    """Return the number of non-empty reports in *csv_path*, or 0 on any failure."""
    try:
        return len(load_csv_texts(csv_path, text_col))
    except Exception:
        # Best-effort counter for UI display; never propagate errors.
        return 0
232
+
233
+
234
def compute_embeddings(docs, model_name="BAAI/bge-small-en-v1.5", device="cpu"):
    """
    Encode *docs* with a sentence-transformers model.

    Returns a float32 numpy array of shape ``(n_docs, embedding_dim)``.
    """
    encoder = load_embedding_model(model_name)
    encode_device, batch = resolve_device(device)

    logger.info(f"Encoding {len(docs)} documents on {encode_device}")
    vectors = encoder.encode(
        docs,
        show_progress_bar=True,
        batch_size=batch,
        device=encode_device,
        convert_to_numpy=True,
    )
    return np.asarray(vectors, dtype=np.float32)
252
+
253
+
254
def preprocess_and_embed(csv_path, model_name="BAAI/bge-small-en-v1.5",
                         text_col=None, split_sentences=True, min_words=3,
                         device="cpu"):
    """
    End-to-end pipeline: load a CSV, preprocess its texts, embed them.

    Returns ``(docs, embeddings)``.
    """
    raw_texts = load_csv_texts(csv_path, text_col)
    docs, _removed, stats = preprocess_texts(raw_texts, split_sentences, min_words)

    logger.info(f"Preprocessed {stats['total_after']} units "
                f"(removed {stats['removed_count']} short)")

    return docs, compute_embeddings(docs, model_name, device)
270
+
271
+
272
+ # --- Topic modeling ---
273
+
274
def run_topic_model(docs, embeddings, config):
    """
    Fit BERTopic on precomputed embeddings and project them to 2D.

    Recognised config keys:
        umap_params:       dict for the clustering UMAP
                           (default: n_neighbors=15, n_components=5, min_dist=0.0)
        hdbscan_params:    dict (default: min_cluster_size=10, min_samples=5)
        vectorizer_params: dict forwarded to CountVectorizer
        use_vectorizer:    bool (default: True)
        bt_params:         dict with nr_topics ('auto' or int) and top_n_words

    Returns ``(topic_model, reduced_2d, topics)``.
    """
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    umap_cfg = config.get("umap_params",
                          {"n_neighbors": 15, "n_components": 5, "min_dist": 0.0})
    hdbscan_cfg = config.get("hdbscan_params",
                             {"min_cluster_size": 10, "min_samples": 5})
    vec_cfg = config.get("vectorizer_params", {}).copy()
    bt_cfg = config.get("bt_params", {"nr_topics": "auto", "top_n_words": 10})

    # JSON configs deliver ngram_range as a list; CountVectorizer needs a tuple.
    if isinstance(vec_cfg.get("ngram_range"), list):
        vec_cfg["ngram_range"] = tuple(vec_cfg["ngram_range"])

    reducer = UMAP(random_state=42, metric="cosine", **umap_cfg)
    clusterer = HDBSCAN(metric="euclidean", prediction_data=True, **hdbscan_cfg)
    vectorizer = (CountVectorizer(**vec_cfg)
                  if config.get("use_vectorizer", True) else None)

    requested_topics = bt_cfg.get("nr_topics", "auto")
    # BERTopic expects None (not the string "auto") for automatic reduction.
    nr_topics = None if requested_topics == "auto" else int(requested_topics)

    topic_model = BERTopic(
        umap_model=reducer,
        hdbscan_model=clusterer,
        vectorizer_model=vectorizer,
        top_n_words=bt_cfg.get("top_n_words", 10),
        nr_topics=nr_topics,
        verbose=False,
    )
    topics, _ = topic_model.fit_transform(docs, embeddings)

    # Separate 2-component projection, used purely for plotting.
    reduced_2d = UMAP(
        n_neighbors=15, n_components=2, min_dist=0.0,
        metric="cosine", random_state=42,
    ).fit_transform(embeddings)

    return topic_model, reduced_2d, topics
331
+
332
+
333
def get_topic_labels(topic_model, topics):
    """Map each document's topic id to its keyword-based topic name."""
    info = topic_model.get_topic_info()
    lookup = dict(zip(info["Topic"], info["Name"]))
    return [lookup.get(topic_id, "Unknown") for topic_id in topics]
338
+
339
+
340
def get_outlier_stats(topic_model):
    """Return ``(outlier_count, outlier_percentage)`` for the -1 outlier topic."""
    info = topic_model.get_topic_info()
    total = info["Count"].sum()

    outlier_rows = info.loc[info["Topic"] == -1, "Count"]
    outlier_count = int(outlier_rows.iloc[0]) if not outlier_rows.empty else 0

    pct = 100.0 * outlier_count / total if total > 0 else 0.0
    return outlier_count, pct
352
+
353
+
354
def get_num_topics(topic_model):
    """Count real topics, excluding the HDBSCAN outlier bucket (-1)."""
    topic_ids = topic_model.get_topic_info()["Topic"]
    return int((topic_ids != -1).sum())
358
+
359
+
360
+ # --- LLM labeling ---
361
+
362
# System message for LLM topic labelling; constrains the model to emit a bare
# 3–8-word phenomenological label with no explanation or punctuation.
SYSTEM_PROMPT = """You are an expert phenomenologist analysing first-person experiential reports or microphenomenological interviews.

Your task is to assign a concise label to a cluster of similar reports by identifying the
shared lived experiential structure or process they describe.

The label must:
1. Describe what changes in experience itself (e.g. boundaries, temporality, embodiment, agency, affect, meaning).
2. Capture the underlying experiential process or structural transformation, not surface narrative details.
3. Be specific and distinctive, but at the level of experiential structure rather than anecdotal content.
4. Use phenomenological language that describes how cognitive, affective, or perceptual processes are lived, rather than analytic or evaluative abstractions.
5. Be conceptually focused on a single dominant experiential pattern.
6. Be concise and noun-phrase-like.

Constraints:
- Output ONLY the label (no explanation).
- 3–8 words.
- Avoid surface-specific details unless they reflect a recurring experiential structure.
- Avoid meta-level analytic terms (e.g. epistemic, estimation, verification, evaluation) unless they directly describe how the process is experienced.
- Avoid generic wrappers such as "experience of", "state of", or "phenomenon of".
- No punctuation, no quotes, no extra text.
- Do not explain your reasoning.
"""
384
+
385
# User-turn template; {documents} and {keywords} are filled per topic.
# Bug fix: the word limit previously said "3–7 words", contradicting the
# "3–8 words" constraint in SYSTEM_PROMPT; aligned to 3–8.
USER_TEMPLATE = """Here is a cluster of participant reports describing a specific phenomenon:

{documents}

Top keywords associated with this cluster:
{keywords}

Task: Return a single scientifically precise label (3–8 words). Output ONLY the label.
"""
394
+
395
+
396
def get_hf_status_code(exc):
    """Best-effort extraction of an HTTP status code from a HuggingFace exception."""
    response = getattr(exc, "response", None)
    if response is None:
        return None
    return getattr(response, "status_code", None)
400
+
401
+
402
def generate_llm_labels(topic_model, hf_token, model_id="meta-llama/Meta-Llama-3-8B-Instruct",
                        max_topics=50, max_docs_per_topic=10, doc_char_limit=400,
                        temperature=0.2):
    """
    Generate topic labels via the HuggingFace Inference API.

    For each non-outlier topic (at most *max_topics*), builds a prompt from
    the topic's top keywords and up to *max_docs_per_topic* representative
    documents (each truncated to *doc_char_limit* characters) and asks the
    chat model for a single short label.

    Returns a dict mapping topic_id -> label string. Failed topics get the
    fallback label "Topic <id>". Raises RuntimeError if the API responds
    with HTTP 402 (credits exhausted), aborting the whole run.
    """
    client = InferenceClient(model=model_id, token=hf_token)

    # Drop the HDBSCAN outlier bucket (-1) and cap the number of topics sent.
    info = topic_model.get_topic_info()
    info = info[info["Topic"] != -1].head(max_topics)

    labels = {}
    logger.info(f"Generating LLM labels for {len(info)} topics")

    for tid in info["Topic"].tolist():
        words = topic_model.get_topic(tid) or []
        keywords = ", ".join([w for w, _ in words[:10]])

        # get_representative_docs can raise for unknown/merged topics;
        # fall back to an empty doc list rather than failing the topic.
        try:
            reps = (topic_model.get_representative_docs(tid) or [])[:max_docs_per_topic]
        except Exception:
            reps = []

        # Flatten newlines and truncate each doc to keep the prompt bounded.
        reps = [r.replace("\n", " ").strip()[:doc_char_limit] for r in reps if str(r).strip()]
        docs_block = "\n".join([f"- {r}" for r in reps]) if reps else "- (No docs)"

        prompt = USER_TEMPLATE.format(documents=docs_block, keywords=keywords)

        try:
            out = client.chat_completion(
                model=model_id,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                # Labels are 3–8 words; stop at the first newline so the
                # model cannot append an explanation.
                max_tokens=24,
                temperature=temperature,
                stop=["\n"],
            )
            raw = out.choices[0].message.content
            labels[int(tid)] = clean_label(raw)

        except Exception as e:
            # 402 means the account is out of credits: retrying other topics
            # is pointless, so abort the whole run with a clear message.
            code = get_hf_status_code(e)
            if code == 402:
                raise RuntimeError(
                    "HuggingFace returned 402 Payment Required. "
                    "Monthly credits exhausted—upgrade or skip LLM labeling."
                ) from e
            # Any other failure only affects this topic; use a fallback label.
            logger.warning(f"LLM labeling failed for topic {tid}: {e}")
            labels[int(tid)] = f"Topic {tid}"

    return labels
458
+
459
+
460
def labels_cache_path(cache_dir, config_hash, model_id):
    """Build the cache-file path for the LLM labels of one (model, config) pair."""
    safe_model = re.sub(r"[^a-zA-Z0-9_.-]", "_", model_id)
    filename = f"llm_labels_{safe_model}_{config_hash}.json"
    return Path(cache_dir) / filename
464
+
465
+
466
def load_cached_labels(cache_path):
    """Read a labels cache file; return ``{topic_id: label}`` or None on any error."""
    try:
        payload = json.loads(Path(cache_path).read_text(encoding="utf-8"))
        return {int(key): str(value) for key, value in payload.items()}
    except Exception:
        # Missing file, bad JSON, or non-int keys all mean "no usable cache".
        return None
473
+
474
+
475
def save_labels_cache(cache_path, labels):
    """Persist a labels dict as pretty-printed JSON; failures are logged, never raised."""
    try:
        payload = {str(topic_id): label for topic_id, label in labels.items()}
        Path(cache_path).write_text(json.dumps(payload, indent=2), encoding="utf-8")
    except Exception as e:
        logger.warning(f"Could not save labels cache: {e}")