#!/usr/bin/env python3 """ Full training pipeline: download data → train heads → evaluate. Usage: cd /home/joneill/pubverse_brett/pub_check source ~/myenv/bin/activate pip install -e ".[train]" python scripts/train_pubguard.py [--data-dir ./pubguard_data] [--n-per-class 15000] """ import argparse import logging import sys import os logging.basicConfig( level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) from pathlib import Path from pubguard.config import PubGuardConfig from pubguard.data import prepare_all from pubguard.train import train_all def main(): parser = argparse.ArgumentParser(description="Train PubGuard") parser.add_argument("--data-dir", default="./pubguard_data", help="Directory for training data") parser.add_argument("--models-dir", default=None, help="Override models output directory") parser.add_argument("--n-per-class", type=int, default=15000, help="Samples per class per head") parser.add_argument("--test-size", type=float, default=0.15, help="Held-out test fraction") parser.add_argument("--skip-download", action="store_true", help="Skip dataset download (use existing data)") args = parser.parse_args() data_dir = Path(args.data_dir) config = PubGuardConfig() if args.models_dir: config.models_dir = Path(args.models_dir) # Step 1: Download and prepare datasets if not args.skip_download: prepare_all(data_dir, n_per_class=args.n_per_class) # Step 2: Train all heads train_all(data_dir, config=config, test_size=args.test_size) # Step 3: Quick smoke test print("\n" + "=" * 60) print("SMOKE TEST") print("=" * 60) from pubguard import PubGuard guard = PubGuard(config=config) guard.initialize() test_cases = [ ( "Introduction: We present a novel deep learning approach for protein " "structure prediction. Methods: We trained a transformer model on 50,000 " "protein sequences from the PDB database. Results: Our model achieves " "state-of-the-art accuracy with an RMSD of 1.2 ƅ on the CASP14 benchmark. " "Discussion: These results demonstrate the potential of attention mechanisms " "for structural biology. References: [1] AlphaFold (2021) [2] ESMFold (2022)", "scientific_paper", ), ( "šŸŽ‰ POOL PARTY THIS SATURDAY! šŸŠ Come join us at the community center " "pool. Bring snacks and sunscreen. RSVP to poolparty@gmail.com by Thursday!", "junk", ), ( "TITLE: Deep Learning for Medical Imaging\nAUTHORS: J. Smith, A. Lee\n" "AFFILIATION: MIT\n\nKey Findings:\n• 95% accuracy on chest X-rays\n" "• Novel attention mechanism\n\nContact: jsmith@mit.edu", "poster", ), ( "We investigate the role of microRNAs in hepatocellular carcinoma " "progression. Using RNA-seq data from 200 patient samples collected at " "three clinical sites, we identified 15 differentially expressed miRNAs " "associated with tumor stage (FDR < 0.01).", "abstract_only", ), ] for text, expected_type in test_cases: verdict = guard.screen(text) status = "āœ…" if verdict["doc_type"]["label"] == expected_type else "āš ļø" print(f" {status} Expected: {expected_type:20s} Got: {verdict['doc_type']['label']:20s} " f"(score={verdict['doc_type']['score']:.3f})") print(f" AI: {verdict['ai_generated']['label']} ({verdict['ai_generated']['score']:.3f}) " f"Toxic: {verdict['toxicity']['label']} ({verdict['toxicity']['score']:.3f}) " f"Pass: {verdict['pass']}") print(f"\nāœ… Training complete! Heads saved to: {config.models_dir}") if __name__ == "__main__": main()