| | """ |
| | Training pipeline for PubGuard classification heads. |
| | |
| | Trains lightweight linear classifiers on frozen model2vec embeddings. |
| | This follows the same paradigm as the openalex-topic-classifier: |
| | the expensive embedding is pre-computed once, and the classifier |
| | itself is a single matrix multiply β fast to train, fast to infer. |
| | |
| | Training strategy: |
| | 1. Load + cache model2vec embeddings for all training data |
| | 2. For each head, fit a logistic regression (sklearn) with |
| | class-balanced weights and L2 regularisation |
| | 3. Export weights as .npz for the numpy-only inference path |
| | 4. Report per-class precision / recall / F1 on held-out split |
| | |
| | The entire pipeline trains in <5 minutes on CPU for ~50K samples, |
| | consistent with your existing toolchain. |
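
Example (illustrative; assumes this module is importable as
pubguard.train, and that prepared NDJSON files live under ./data):

    from pathlib import Path
    from pubguard.train import train_all

    train_all(Path("./data"))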
| | """ |
| |
|
| | import json |
| | import logging |
| | import time |
| | from pathlib import Path |
| | from typing import Dict, List, Optional, Tuple |
| |
|
| | import numpy as np |
| | from sklearn.linear_model import LogisticRegression |
| | from sklearn.metrics import classification_report |
| | from sklearn.model_selection import train_test_split |
| |
|
| | from .config import PubGuardConfig, DOC_TYPE_LABELS, AI_DETECT_LABELS, TOXICITY_LABELS |
| | from .classifier import LinearHead |
| | from .text import clean_text, extract_structural_features, N_STRUCTURAL_FEATURES |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
| | def load_ndjson(path: Path) -> Tuple[List[str], List[str]]: |
| | """Load NDJSON file β (texts, labels).""" |
| | texts, labels = [], [] |
| | with open(path) as f: |
| | for line in f: |
| | if line.strip(): |
| | row = json.loads(line) |
| | texts.append(row["text"]) |
| | labels.append(row["label"]) |
| | return texts, labels |
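
# Illustrative input row (the label value here is hypothetical; load_ndjson
# only requires "text" and "label" keys on each NDJSON line):
#     {"text": "We present a method for ...", "label": "research_article"}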


def embed_texts(
    texts: List[str],
    config: PubGuardConfig,
    cache_path: Optional[Path] = None,
) -> np.ndarray:
    """
    Encode texts with model2vec, L2-normalise, return (N, D) float32.

    Optionally caches to disk to avoid re-embedding on repeat runs.
    """
    if cache_path and cache_path.exists():
        logger.info(f"Loading cached embeddings from {cache_path}")
        return np.load(cache_path)

    # Lazy import: the numpy-only inference path never needs model2vec.
    from model2vec import StaticModel

    # Prefer the locally distilled model; otherwise download the base
    # model and save a local copy for subsequent runs.
    model_path = config.distilled_model_path
    if model_path.exists():
        model = StaticModel.from_pretrained(str(model_path))
    else:
        model = StaticModel.from_pretrained(config.model_name)
        model_path.parent.mkdir(parents=True, exist_ok=True)
        model.save_pretrained(str(model_path))

    logger.info(f"Embedding {len(texts)} texts...")
    cleaned = [clean_text(t, config.max_text_chars) for t in texts]
    embeddings = model.encode(cleaned, show_progress_bar=True)

    # L2-normalise, guarding against zero-norm rows.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1, norms)
    embeddings = (embeddings / norms).astype("float32")

    if cache_path:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(cache_path, embeddings)
        logger.info(f"Cached embeddings to {cache_path}")

    return embeddings
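
# Note: because nonzero rows come back unit-normalised, cosine similarity
# between embeddings reduces to a plain dot product, e.g. (sketch):
#     emb = embed_texts(texts, config)
#     sims = emb @ emb.T   # (N, N) cosine similarity matrix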


def compute_structural_features(texts: List[str]) -> np.ndarray:
    """Compute structural features for all texts → (N, F) float32.

    Relies on extract_structural_features returning its features in a
    stable key order (guaranteed for dicts on Python 3.7+).
    """
    feats = []
    for t in texts:
        cleaned = clean_text(t)
        feat_dict = extract_structural_features(cleaned)
        feats.append(list(feat_dict.values()))
    return np.array(feats, dtype="float32")
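
# Shape sketch (assumes N_STRUCTURAL_FEATURES matches the number of keys
# that extract_structural_features returns):
#     feats = compute_structural_features(["Figure 1 shows ..."])
#     assert feats.shape == (1, N_STRUCTURAL_FEATURES)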


def train_head(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    labels: List[str],
    head_name: str,
    C: float = 1.0,
    max_iter: int = 1000,
) -> LinearHead:
    """
    Train a single linear classification head.

    Uses sklearn LogisticRegression with:
    - L2 regularisation (strength set by C; smaller C = stronger)
    - class_weight='balanced' for imbalanced data
    - lbfgs solver (good for moderate feature counts)
    - binary heads expanded to two weight columns, so every head
      exposes the same softmax API at inference time

    Extracts W and b into a LinearHead for numpy-only inference.
    """
    logger.info(f"\n{'='*60}")
    logger.info(f"Training {head_name} head")
    logger.info(f"{'='*60}")
    logger.info(f"  Train: {X_train.shape[0]:,} | Test: {X_test.shape[0]:,}")
    logger.info(f"  Features: {X_train.shape[1]} | Classes: {len(labels)}")

    # Log the training-set class distribution.
    unique, counts = np.unique(y_train, return_counts=True)
    for u, c in zip(unique, counts):
        logger.info(f"  {u}: {c:,}")

    start = time.time()

    clf = LogisticRegression(
        C=C,
        max_iter=max_iter,
        class_weight="balanced",
        solver="lbfgs",
        n_jobs=-1,
        random_state=42,
    )
    clf.fit(X_train, y_train)

    elapsed = time.time() - start
    logger.info(f"  Trained in {elapsed:.1f}s")

    # Evaluate on the held-out split.
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=labels, digits=4)
    logger.info(f"\n{report}")

    # Export weights into a LinearHead for numpy-only inference.
    head = LinearHead(labels)

    if clf.coef_.shape[0] == 1:
        # Binary case: sklearn stores a single weight vector w. Expand to
        # the two columns [-w, w] so inference is a uniform softmax. The
        # argmax matches sklearn exactly; the probabilities come out as
        # sigmoid(2z) rather than sklearn's sigmoid(z).
        head.W = np.vstack([-clf.coef_[0], clf.coef_[0]]).T.astype("float32")
        head.b = np.array([-clf.intercept_[0], clf.intercept_[0]], dtype="float32")
    else:
        head.W = clf.coef_.T.astype("float32")
        head.b = clf.intercept_.astype("float32")

    # Sanity check: the numpy softmax path must agree with sklearn's
    # predictions on a handful of test rows.
    logits = X_test[:5] @ head.W + head.b
    e = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs = e / e.sum(axis=-1, keepdims=True)
    np_pred_idx = np.argmax(probs, axis=1)
    sk_pred_idx = clf.predict(X_test[:5])
    assert list(np_pred_idx) == list(int(x) for x in sk_pred_idx), \
        f"Mismatch: {list(np_pred_idx)} vs {list(sk_pred_idx)}"
    logger.info("  ✓ Numpy inference matches sklearn predictions")

    return head
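
# Inference-side sketch (LinearHead's exact API lives in .classifier, so the
# load/softmax spelling below is an assumption, not the documented interface):
#     head = LinearHead.load(config.doc_type_head_path)
#     logits = x @ head.W + head.b      # x: (N, D) float32, unit-normalised
#     probs = np.exp(logits - logits.max(-1, keepdims=True))
#     probs /= probs.sum(-1, keepdims=True)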


def train_all(
    data_dir: Path,
    config: Optional[PubGuardConfig] = None,
    test_size: float = 0.15,
):
    """
    Train all three classification heads.

    Args:
        data_dir: Directory containing the prepared NDJSON files
        config: PubGuard configuration
        test_size: Fraction of data held out for evaluation
    """
    config = config or PubGuardConfig()
    data_dir = Path(data_dir)
    cache_dir = data_dir / "embeddings_cache"

    logger.info("=" * 60)
    logger.info("PubGuard Training Pipeline")
    logger.info("=" * 60)
    logger.info(f"Data dir: {data_dir}")
    logger.info(f"Models dir: {config.models_dir}")
    start_total = time.time()

    # --- Head 1: doc_type (embeddings + structural features) ---
    doc_type_path = data_dir / "doc_type_train.ndjson"
    if doc_type_path.exists():
        texts, labels = load_ndjson(doc_type_path)
        label_to_idx = {l: i for i, l in enumerate(DOC_TYPE_LABELS)}

        embeddings = embed_texts(
            texts, config,
            cache_path=cache_dir / "doc_type_emb.npy",
        )

        # doc_type is the only head that also uses structural features.
        logger.info("Computing structural features...")
        struct = compute_structural_features(texts)
        X = np.concatenate([embeddings, struct], axis=1)

        # Unknown labels fall back to class 0.
        y = np.array([label_to_idx.get(l, 0) for l in labels])

        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=42
        )

        head = train_head(X_tr, y_tr, X_te, y_te, DOC_TYPE_LABELS, "doc_type")
        head.save(config.doc_type_head_path)
        logger.info(f"Saved → {config.doc_type_head_path}")
    else:
        logger.warning(f"doc_type data not found: {doc_type_path}")

    # --- Head 2: ai_detect (embeddings only) ---
    ai_path = data_dir / "ai_detect_train.ndjson"
    if ai_path.exists():
        texts, labels = load_ndjson(ai_path)
        label_to_idx = {l: i for i, l in enumerate(AI_DETECT_LABELS)}

        embeddings = embed_texts(
            texts, config,
            cache_path=cache_dir / "ai_detect_emb.npy",
        )

        y = np.array([label_to_idx.get(l, 0) for l in labels])

        X_tr, X_te, y_tr, y_te = train_test_split(
            embeddings, y, test_size=test_size, stratify=y, random_state=42
        )

        head = train_head(X_tr, y_tr, X_te, y_te, AI_DETECT_LABELS, "ai_detect")
        head.save(config.ai_detect_head_path)
        logger.info(f"Saved → {config.ai_detect_head_path}")
    else:
        logger.warning(f"ai_detect data not found: {ai_path}")

    # --- Head 3: toxicity (embeddings only) ---
    tox_path = data_dir / "toxicity_train.ndjson"
    if tox_path.exists():
        texts, labels = load_ndjson(tox_path)
        label_to_idx = {l: i for i, l in enumerate(TOXICITY_LABELS)}

        embeddings = embed_texts(
            texts, config,
            cache_path=cache_dir / "toxicity_emb.npy",
        )

        y = np.array([label_to_idx.get(l, 0) for l in labels])

        X_tr, X_te, y_tr, y_te = train_test_split(
            embeddings, y, test_size=test_size, stratify=y, random_state=42
        )

        head = train_head(X_tr, y_tr, X_te, y_te, TOXICITY_LABELS, "toxicity")
        head.save(config.toxicity_head_path)
        logger.info(f"Saved → {config.toxicity_head_path}")
    else:
        logger.warning(f"toxicity data not found: {tox_path}")

    elapsed = time.time() - start_total
    logger.info(f"\nTotal training time: {elapsed / 60:.1f} minutes")
    logger.info(f"All heads saved to: {config.models_dir}")
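

if __name__ == "__main__":
    # Minimal CLI entry point; a convenience sketch, with flag names that are
    # assumptions rather than a documented interface. Because this module
    # uses relative imports, invoke it as a module (python -m ...), not as a
    # loose script.
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(message)s")
    parser = argparse.ArgumentParser(description="Train PubGuard classification heads")
    parser.add_argument("data_dir", type=Path, help="Directory with *_train.ndjson files")
    parser.add_argument("--test-size", type=float, default=0.15,
                        help="Fraction of data held out for evaluation")
    args = parser.parse_args()
    train_all(args.data_dir, test_size=args.test_size)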