jimnoneill
/

abstract-archon

+#!/usr/bin/env python3
+"""
+Abstract Archon — binary classifier: "Is this text a real research abstract?"
+Uses Potion-base-32M (512-dim) embeddings with a LogisticRegression head, trained directly or distilled from an SVM-RBF teacher (whichever reaches the higher ROC-AUC).
+Applied as a quality gate to every publication in the database.
+Usage:
+    python train_abstract_archon.py --export       # Export training data from PG
+    python train_abstract_archon.py --train         # Train and save model
+    python train_abstract_archon.py --validate      # Validate on held-out data
+"""
+import argparse
+import json
+import os
+import sys
+import time
+from pathlib import Path
+import numpy as np
+import psycopg2
+import psycopg2.extras
# Connection settings for the publications database. Every value can be
# overridden through an ARCHON_DB_* environment variable so that credentials
# need not live in the source; the defaults are the original local settings.
DB_PARAMS = dict(
    host=os.environ.get('ARCHON_DB_HOST', 'localhost'),
    port=int(os.environ.get('ARCHON_DB_PORT', '5434')),
    dbname=os.environ.get('ARCHON_DB_NAME', 'pubverse'),
    user=os.environ.get('ARCHON_DB_USER', 'pubverse'),
    password=os.environ.get('ARCHON_DB_PASSWORD', 'pubverse123'),
)

# All artifacts (exported training set, trained head) live next to this script.
DATA_DIR = Path(__file__).parent / 'abstract_archon_data'
EXPORT_PATH = DATA_DIR / 'training_export.ndjson'
MODEL_PATH = DATA_DIR / 'abstract_archon_head.npz'

# Number of label-1 rows (real abstracts) to export.
N_POSITIVES = 2000

# Number of label-0 rows to export per cleanup reason; the keys are matched
# against the `reason` column of `_quality_cleanup_ids`.
N_NEGATIVES_PER_REASON = {
    'html_heavy': 250,
    'html_heavy_text': 250,
    'supplementary_content': 250,
    'author_byline': 200,
    'figure_table_caption': 250,
    'journal_article_scrape': 250,
    'moesm_title': 200,
    'taxonomy_stub': 200,
}

# Number of extra label-0 rows whose abstract is only 20-100 characters long.
N_BORDERLINE_SHORT = 150
def get_conn():
    """Open and return a new psycopg2 connection configured by ``DB_PARAMS``."""
    connection = psycopg2.connect(**DB_PARAMS)
    return connection
# Upper bound used when a negatives query has no maximum abstract length.
_NO_MAX_LENGTH = 2**31 - 1


def _collect_negatives(conn, candidate_ids, limit, source, records, seen_ids,
                       min_len, max_len=_NO_MAX_LENGTH, batch_size=200):
    """Append up to *limit* label-0 records for *candidate_ids* to *records*.

    Abstract text is fetched from ``publications`` in batches of *batch_size*
    ids, keeping only rows whose abstract length is within
    ``[min_len, max_len]``. Ids already present in *seen_ids* are skipped so a
    publication is never exported twice; accepted ids are added to *seen_ids*.

    Returns the number of records appended.
    """
    collected = 0
    for start in range(0, len(candidate_ids), batch_size):
        if collected >= limit:
            break
        batch = candidate_ids[start:start + batch_size]
        with conn.cursor() as cur:
            cur.execute("""
                SELECT source_id, LEFT(abstract, 500) as text
                FROM publications
                WHERE source_id = ANY(%s)
                AND LENGTH(abstract) BETWEEN %s AND %s
            """, (batch, min_len, max_len))
            rows = cur.fetchall()
        for source_id, text in rows:
            if collected >= limit:
                break
            if source_id in seen_ids:
                continue
            cleaned = text.strip() if text else ''
            if len(cleaned) > 5:
                records.append({
                    'text': cleaned[:500],
                    'label': 0,
                    'source': source,
                    'source_id': source_id
                })
                seen_ids.add(source_id)
                collected += 1
    return collected


def export_data():
    """Export training data from PostgreSQL to ``EXPORT_PATH`` as NDJSON.

    Three groups of records are written, one JSON object per line with the
    keys ``text``, ``label``, ``source`` and ``source_id``:

    * positives (label 1): a Bernoulli sample of publications with an
      abstract of at least 200 characters whose id is not listed in
      ``_quality_cleanup_ids``;
    * negatives (label 0): for each reason in ``N_NEGATIVES_PER_REASON``,
      randomly chosen flagged publications;
    * borderline negatives (label 0): flagged publications whose abstract is
      20-100 characters long.

    Each ``source_id`` is exported at most once, so the same text cannot end
    up on both sides of a later train/test split. The database connection is
    always closed, even when a query fails.
    """
    DATA_DIR.mkdir(exist_ok=True)
    records = []
    seen_ids = set()
    conn = get_conn()
    try:
        # --- Load cleanup IDs into a set for fast lookup ---
        print("Loading cleanup source_ids for exclusion filter...")
        # Named (server-side) cursor: rows are streamed in itersize chunks.
        with conn.cursor(name='cleanup_scan') as cur:
            cur.itersize = 500000
            cur.execute("SELECT source_id FROM _quality_cleanup_ids")
            cleanup_ids = {row[0] for row in cur}
        print(f"  Loaded {len(cleanup_ids):,} cleanup IDs")

        # --- Positive examples: TABLESAMPLE then filter in Python ---
        print(f"Exporting {N_POSITIVES} positive examples (real abstracts)...")
        pos_count = 0
        with conn.cursor(name='pos_scan') as cur:
            cur.itersize = 10000
            cur.execute("""
                SELECT p.source_id, LEFT(p.abstract, 500) as text
                FROM publications p TABLESAMPLE BERNOULLI(0.005)
                WHERE LENGTH(p.abstract) >= 200
            """)
            for source_id, text in cur:
                if pos_count >= N_POSITIVES:
                    break
                if source_id in cleanup_ids or source_id in seen_ids:
                    continue
                cleaned = text.strip() if text else ''
                if len(cleaned) >= 50:
                    records.append({
                        'text': cleaned[:500],
                        'label': 1,
                        'source': 'positive_real_abstract',
                        'source_id': source_id
                    })
                    seen_ids.add(source_id)
                    pos_count += 1
                    if pos_count % 500 == 0:
                        print(f"    {pos_count} positives collected...")
        print(f"  Got {pos_count} positives")
        if pos_count < N_POSITIVES:
            # The Bernoulli sample size is random and may come up short.
            print(f"  WARNING: wanted {N_POSITIVES} positives, "
                  f"sample only yielded {pos_count}")

        # --- Negative examples: known garbage by reason ---
        # Sample ids from the smaller cleanup table, then look up their text.
        total_neg = 0
        for reason, n in N_NEGATIVES_PER_REASON.items():
            print(f"Exporting {n} negatives for reason={reason}...")
            with conn.cursor() as cur:
                # Oversample 3x because some candidates fail the text filters.
                cur.execute("""
                    SELECT q.source_id
                    FROM _quality_cleanup_ids q
                    WHERE q.reason = %s
                    ORDER BY RANDOM()
                    LIMIT %s
                """, (reason, n * 3))
                candidate_ids = [row[0] for row in cur.fetchall()]
            # min_len=11 is the original "LENGTH(abstract) > 10" condition.
            collected = _collect_negatives(
                conn, candidate_ids, n, f'negative_{reason}',
                records, seen_ids, min_len=11)
            total_neg += collected
            print(f"  Got {collected} for {reason}, running total: {total_neg}")

        # --- Borderline negatives: very short garbage texts ---
        print(f"Exporting {N_BORDERLINE_SHORT} borderline short negatives...")
        with conn.cursor() as cur:
            cur.execute("""
                SELECT q.source_id
                FROM _quality_cleanup_ids q
                WHERE q.reason NOT IN ('short_abstract', 'empty_abstract', 'non_english')
                ORDER BY RANDOM()
                LIMIT %s
            """, (N_BORDERLINE_SHORT * 5,))
            candidate_ids = [row[0] for row in cur.fetchall()]
        collected = _collect_negatives(
            conn, candidate_ids, N_BORDERLINE_SHORT,
            'negative_borderline_short', records, seen_ids,
            min_len=20, max_len=100)
        total_neg += collected
        print(f"  Got {collected} borderline, total negatives: {total_neg}")
    finally:
        conn.close()

    n_pos = sum(1 for r in records if r['label'] == 1)
    n_neg = sum(1 for r in records if r['label'] == 0)
    print(f"\nTotal: {n_pos} positives, "
          f"{n_neg} negatives")
    with open(EXPORT_PATH, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(r) + '\n' for r in records)
    print(f"Saved to {EXPORT_PATH}")
def _select_threshold(probas, y_true, recall_targets=(0.995, 0.99),
                      fallback=0.1):
    """Pick the highest decision threshold that keeps recall on class 1.

    Recall can only fall as the threshold rises, so the most useful threshold
    for a recall target is the LARGEST one that still meets it: it rejects as
    much garbage as possible without dropping below the target.

    Args:
        probas: 1-D array of predicted probabilities for class 1.
        y_true: 1-D array of 0/1 ground-truth labels, same length as probas.
        recall_targets: recall levels to try, in order of preference.
        fallback: threshold returned when no target can be met.

    Returns:
        ``(threshold, recall, target)`` where *target* is the recall target
        that was satisfied, or ``None`` when the fallback was used.
    """
    thresholds = np.arange(0.01, 0.99, 0.001)
    pos_probas = probas[y_true == 1]
    if pos_probas.size == 0:
        # No positives to measure recall on; nothing can be tuned.
        return fallback, float('nan'), None
    # recalls[i] = share of real abstracts scoring >= thresholds[i]
    recalls = (pos_probas[None, :] >= thresholds[:, None]).mean(axis=1)
    for target in recall_targets:
        passing = np.flatnonzero(recalls >= target)
        if passing.size:
            best = passing[-1]
            return float(thresholds[best]), float(recalls[best]), target
    return fallback, float((pos_probas >= fallback).mean()), None


def train_model():
    """Train the classifier head and save it to ``MODEL_PATH``.

    Steps:
      1. Load the NDJSON export and embed the texts with Potion-base-32M.
      2. Hold out a stratified 20% split and standardize the features.
      3. Train an SVM-RBF teacher and report its hold-out metrics.
      4. Train LogisticRegression directly over a small grid of C values and,
         separately, on the SVM's hard predictions (distillation); keep the
         variant with the higher hold-out ROC-AUC.
      5. Choose the decision threshold: the highest one that keeps recall on
         real abstracts at >= 99.5% (or >= 99%, otherwise a 0.1 fallback).
      6. Save coefficients, scaler statistics and threshold as an ``.npz``.

    NOTE(review): C, the LR variant and the threshold are all selected on the
    same hold-out split that the metrics are reported on, so the printed
    numbers are optimistic. A separate validation split would fix this.
    """
    from model2vec import StaticModel
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import (classification_report, confusion_matrix,
                                 roc_auc_score)
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    print("Loading training data...")
    with open(EXPORT_PATH, encoding='utf-8') as f:
        records = [json.loads(line) for line in f if line.strip()]
    texts = [r['text'] for r in records]
    labels = np.array([r['label'] for r in records])
    sources = [r['source'] for r in records]
    print(f"  {len(texts)} samples: {labels.sum()} positive, {(1-labels).sum()} negative")

    print("Embedding with Potion-base-32M...")
    model = StaticModel.from_pretrained('minishlab/potion-base-32M')
    embeddings = model.encode(texts, show_progress_bar=True)
    print(f"  Embeddings shape: {embeddings.shape}")

    X_train, X_test, y_train, y_test, src_train, src_test, txt_train, txt_test = \
        train_test_split(embeddings, labels, sources, texts,
                         test_size=0.2, random_state=42, stratify=labels)
    # Scaler statistics come from the training split only and are saved with
    # the head so inference can apply the same transform.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    is_abstract = (y_test == 1)

    # --- Train SVM-RBF teacher ---
    print("\nTraining SVM-RBF teacher...")
    svm = SVC(kernel='rbf', probability=True, C=10.0, gamma='scale',
              class_weight='balanced', random_state=42)
    svm.fit(X_train_s, y_train)
    svm_pred = svm.predict(X_test_s)
    svm_proba = svm.predict_proba(X_test_s)[:, 1]
    print("\n=== SVM-RBF Results ===")
    print(classification_report(y_test, svm_pred, target_names=['garbage', 'abstract']))
    print("Confusion matrix:\n", confusion_matrix(y_test, svm_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, svm_proba):.4f}")
    fn_mask = (svm_pred == 0) & is_abstract
    fp_mask = (svm_pred == 1) & (y_test == 0)
    fn_rate = fn_mask.sum() / is_abstract.sum()
    print(f"False negative rate on real abstracts: {fn_rate:.4f}")

    # Show misclassified examples
    print(f"\n--- False Negatives (real abstracts called garbage): {fn_mask.sum()} ---")
    for idx in np.where(fn_mask)[0][:10]:
        print(f"  [{svm_proba[idx]:.3f}] {txt_test[idx][:120]}")
    print(f"\n--- False Positives (garbage called abstract): {fp_mask.sum()} ---")
    for idx in np.where(fp_mask)[0][:10]:
        print(f"  [{svm_proba[idx]:.3f}] [{src_test[idx]}] {txt_test[idx][:120]}")

    # --- Train LR directly (often better than distillation for small datasets) ---
    print("\n\nTraining LogisticRegression directly...")
    best_lr = None
    best_auc = 0
    for C in [0.01, 0.1, 1.0, 10.0, 100.0]:
        lr = LogisticRegression(max_iter=5000, C=C, solver='lbfgs',
                                class_weight='balanced', random_state=42)
        lr.fit(X_train_s, y_train)
        lr_proba = lr.predict_proba(X_test_s)[:, 1]
        auc = roc_auc_score(y_test, lr_proba)
        lr_pred = lr.predict(X_test_s)
        fn = ((lr_pred == 0) & is_abstract).sum() / is_abstract.sum()
        print(f"  C={C:6.2f} → AUC={auc:.4f}, FNR={fn:.4f}")
        if auc > best_auc:
            best_auc = auc
            best_lr = lr
    lr = best_lr
    lr_pred = lr.predict(X_test_s)
    lr_proba = lr.predict_proba(X_test_s)[:, 1]
    print(f"\n=== Best Direct LR Results (C={lr.C}) ===")
    print(classification_report(y_test, lr_pred, target_names=['garbage', 'abstract']))
    print("Confusion matrix:\n", confusion_matrix(y_test, lr_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, lr_proba):.4f}")
    fn_rate_lr = ((lr_pred == 0) & is_abstract).sum() / is_abstract.sum()
    print(f"LR False negative rate: {fn_rate_lr:.4f}")

    # --- Also try SVM distillation for comparison ---
    print("\nDistilling SVM → LR...")
    svm_soft = svm.predict_proba(X_train_s)[:, 1]
    lr_distilled = LogisticRegression(max_iter=5000, C=1.0, random_state=42)
    # The student is fit on the teacher's hard (thresholded) predictions.
    lr_distilled.fit(X_train_s, (svm_soft > 0.5).astype(int))
    dist_proba = lr_distilled.predict_proba(X_test_s)[:, 1]
    dist_auc = roc_auc_score(y_test, dist_proba)
    print(f"  Distilled LR AUC: {dist_auc:.4f}")

    # Pick the best LR variant
    if dist_auc > best_auc:
        print("  → Distilled LR wins, using that")
        lr = lr_distilled
        lr_proba = dist_proba
    else:
        print(f"  → Direct LR wins (AUC {best_auc:.4f} vs {dist_auc:.4f})")

    # Find the highest threshold that still gives ~99.5% recall on real
    # abstracts, relaxing to 99% and finally to a fixed fallback.
    strict_target, relaxed_target = 0.995, 0.99
    best_t, recall, met = _select_threshold(
        lr_proba, y_test, recall_targets=(strict_target, relaxed_target),
        fallback=0.1)
    if met is None:
        print(f"\nFallback: threshold {best_t:.3f} gives recall={recall:.4f}")
    elif met == strict_target:
        pred_t = (lr_proba >= best_t).astype(int)
        precision_garbage = ((pred_t == 0) & (y_test == 0)).sum() / max((pred_t == 0).sum(), 1)
        print(f"\nAt threshold {best_t:.3f}: recall={recall:.4f}, garbage_precision={precision_garbage:.4f}")
    else:
        print(f"\nRelaxed: threshold {best_t:.3f} gives recall={recall:.4f}")

    # Save model
    np.savez(MODEL_PATH,
             coef=lr.coef_,
             intercept=lr.intercept_,
             classes=lr.classes_,
             labels=np.array(['garbage', 'abstract']),
             scaler_mean=scaler.mean_,
             scaler_scale=scaler.scale_,
             embed_model='minishlab/potion-base-32M',
             version='v1',
             threshold=np.array([best_t]))
    print(f"\nSaved model to {MODEL_PATH}")
    print(f"Model size: {MODEL_PATH.stat().st_size / 1024:.1f} KB")
def validate():
    """Score a random sample of publications with the saved head.

    Loads the ``.npz`` head from ``MODEL_PATH``, samples up to 500
    publications that were not part of the training export, prints the
    predicted class counts plus borderline and rejected examples, and finally
    scores one known publication (PMID 39869795) as a sanity check.

    No ground-truth labels are available for the sample, so this is a manual
    inspection aid rather than a metric computation.
    """
    from model2vec import StaticModel
    from scipy.special import expit

    print("Loading model...")
    # The head holds only numeric/string arrays, so pickle support is not
    # needed; keeping it off prevents code execution from a tampered file.
    with np.load(MODEL_PATH, allow_pickle=False) as data:
        coef = data['coef']
        intercept = data['intercept']
        scaler_mean = data['scaler_mean']
        scaler_scale = data['scaler_scale']
        threshold = float(data['threshold'][0])
        # Use the embedding model recorded at training time when available.
        if 'embed_model' in data.files:
            embed_model = str(data['embed_model'].item())
        else:
            embed_model = 'minishlab/potion-base-32M'
    print(f"  Threshold: {threshold:.3f}")
    model = StaticModel.from_pretrained(embed_model)

    def score(batch):
        """Return P(abstract) for each text in *batch*."""
        scaled = (model.encode(batch) - scaler_mean) / scaler_scale
        probs = expit(scaled @ coef.T + intercept)
        # A binary LR head has one coefficient row, i.e. a single column.
        return probs[:, 0] if coef.shape[0] == 1 else probs[:, 1]

    # Load training source_ids to exclude
    training_ids = set()
    with open(EXPORT_PATH, encoding='utf-8') as f:
        for line in f:
            if line.strip():
                training_ids.add(json.loads(line)['source_id'])

    print("Sampling 500 random publications for validation...")
    conn = get_conn()
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT source_id, LEFT(abstract, 500) as text
                FROM publications TABLESAMPLE BERNOULLI(0.001)
                WHERE LENGTH(abstract) > 10
                LIMIT 1000
            """)
            rows = cur.fetchall()
        with conn.cursor() as cur:
            # NOTE(review): substring match; may hit other ids containing
            # these digits and cannot use an ordinary index.
            cur.execute("SELECT LEFT(abstract, 500) FROM publications WHERE source_id LIKE '%39869795%' LIMIT 1")
            sanity_row = cur.fetchone()
    finally:
        conn.close()

    # Filter out training data
    texts = [t for sid, t in rows if sid not in training_ids][:500]
    if texts:
        probas = score(texts)
        preds = (probas >= threshold).astype(int)
        print(f"\nResults on {len(texts)} validation samples:")
        print(f"  Predicted abstract: {preds.sum()}")
        print(f"  Predicted garbage:  {(1-preds).sum()}")

        # Show borderline cases
        borderline = [(p, t[:120]) for p, t in zip(probas, texts)
                      if 0.3 <= p <= 0.7]
        if borderline:
            print(f"\n  Borderline cases ({len(borderline)}):")
            for p, t in borderline[:10]:
                print(f"    [{p:.3f}] {t}")

        # Show confident garbage
        garbage_idx = np.where(preds == 0)[0]
        if len(garbage_idx) > 0:
            print(f"\n  Sample 'garbage' predictions:")
            for idx in garbage_idx[:10]:
                print(f"    [{probas[idx]:.3f}] {texts[idx][:150]}")
    else:
        # The Bernoulli sample can legitimately come back empty.
        print("\nResults on 0 validation samples: nothing to score")

    # Sanity check PMID 39869795
    print("\n  Sanity check: PMID 39869795...")
    if sanity_row is None:
        print("    PMID not found in database")
    elif not sanity_row[0]:
        print("    PMID has no abstract text")
    else:
        p = score([sanity_row[0]])[0]
        print(f"    Probability(abstract): {p:.4f} → {'PASS' if p >= threshold else 'FAIL'}")
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Abstract Archon trainer')
    parser.add_argument('--export', action='store_true', help='Export training data from PG')
    parser.add_argument('--train', action='store_true', help='Train model')
    parser.add_argument('--validate', action='store_true', help='Validate on held-out data')
    args = parser.parse_args()
    # Run every requested stage, always in pipeline order, so that e.g.
    # `--export --train` does both instead of silently dropping `--train`.
    stages = [
        (args.export, export_data),
        (args.train, train_model),
        (args.validate, validate),
    ]
    requested = [run for flag, run in stages if flag]
    if not requested:
        parser.print_help()
    for run in requested:
        run()