|
|
|
|
|
""" |
|
|
Full training pipeline: download data → train heads → evaluate.
|
|
|
|
|
Usage:
    cd /home/joneill/pubverse_brett/pub_check
    source ~/myenv/bin/activate
    pip install -e ".[train]"
    python scripts/train_pubguard.py [--data-dir ./pubguard_data] [--n-per-class 15000]
|
|
""" |
|
|
|
|
|
import argparse |
|
|
import logging |
|
|
import sys |
|
|
import os |
|
|
|
|
|
# Root logger setup: INFO threshold, "timestamp | LEVEL | message" records.
# NOTE(review): this call sits ahead of the pubguard imports in this file,
# presumably so their import-time log output is formatted too — confirm
# before reordering.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
|
|
|
|
|
from pathlib import Path |
|
|
from pubguard.config import PubGuardConfig |
|
|
from pubguard.data import prepare_all |
|
|
from pubguard.train import train_all |
|
|
|
|
|
|
|
|
# (text, expected doc_type label) pairs screened by the post-training smoke test.
_SMOKE_TEST_CASES = (
    (
        "Introduction: We present a novel deep learning approach for protein "
        "structure prediction. Methods: We trained a transformer model on 50,000 "
        "protein sequences from the PDB database. Results: Our model achieves "
        "state-of-the-art accuracy with an RMSD of 1.2 Å on the CASP14 benchmark. "
        "Discussion: These results demonstrate the potential of attention mechanisms "
        "for structural biology. References: [1] AlphaFold (2021) [2] ESMFold (2022)",
        "scientific_paper",
    ),
    (
        # NOTE(review): the emoji in this string was mis-decoded in the source
        # and the exact code point could not be recovered; confirm the choice.
        "🎉 POOL PARTY THIS SATURDAY! 🎉 Come join us at the community center "
        "pool. Bring snacks and sunscreen. RSVP to poolparty@gmail.com by Thursday!",
        "junk",
    ),
    (
        "TITLE: Deep Learning for Medical Imaging\nAUTHORS: J. Smith, A. Lee\n"
        "AFFILIATION: MIT\n\nKey Findings:\n• 95% accuracy on chest X-rays\n"
        "• Novel attention mechanism\n\nContact: jsmith@mit.edu",
        "poster",
    ),
    (
        "We investigate the role of microRNAs in hepatocellular carcinoma "
        "progression. Using RNA-seq data from 200 patient samples collected at "
        "three clinical sites, we identified 15 differentially expressed miRNAs "
        "associated with tumor stage (FDR < 0.01).",
        "abstract_only",
    ),
)


def _build_parser():
    """Return the argument parser for the training command line."""
    parser = argparse.ArgumentParser(description="Train PubGuard")
    parser.add_argument("--data-dir", default="./pubguard_data",
                        help="Directory for training data")
    parser.add_argument("--models-dir", default=None,
                        help="Override models output directory")
    parser.add_argument("--n-per-class", type=int, default=15000,
                        help="Samples per class per head")
    parser.add_argument("--test-size", type=float, default=0.15,
                        help="Held-out test fraction")
    parser.add_argument("--skip-download", action="store_true",
                        help="Skip dataset download (use existing data)")
    return parser


def _run_smoke_test(config):
    """Screen a few hand-written texts with a PubGuard built from *config*.

    Prints one report per case: the expected and predicted ``doc_type`` label
    with its score, then the ``ai_generated`` and ``toxicity`` labels/scores
    and the overall ``pass`` flag. A mismatch is only marked with a warning
    symbol; nothing is raised and no exit status is affected.
    """
    print("\n" + "=" * 60)
    print("SMOKE TEST")
    print("=" * 60)

    # Imported here rather than at module top, as in the original script.
    from pubguard import PubGuard

    guard = PubGuard(config=config)
    guard.initialize()

    for text, expected_type in _SMOKE_TEST_CASES:
        verdict = guard.screen(text)
        doc_type = verdict["doc_type"]
        ai_generated = verdict["ai_generated"]
        toxicity = verdict["toxicity"]

        status = "✅" if doc_type["label"] == expected_type else "⚠️"
        print(f" {status} Expected: {expected_type:20s} Got: {doc_type['label']:20s} "
              f"(score={doc_type['score']:.3f})")
        print(f" AI: {ai_generated['label']} ({ai_generated['score']:.3f}) "
              f"Toxic: {toxicity['label']} ({toxicity['score']:.3f}) "
              f"Pass: {verdict['pass']}")


def main(argv=None):
    """Run the training pipeline: prepare data, train heads, smoke-test.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which matches calling
            ``main()`` with no arguments.

    Steps:
        1. Unless ``--skip-download`` is given, call ``prepare_all`` on the
           data directory with ``--n-per-class``.
        2. Call ``train_all`` with the config and ``--test-size``.
        3. Screen the built-in smoke-test texts and print the verdicts.
    """
    args = _build_parser().parse_args(argv)

    data_dir = Path(args.data_dir)
    config = PubGuardConfig()
    if args.models_dir:
        # Only override the config's models directory when one was supplied.
        config.models_dir = Path(args.models_dir)

    if not args.skip_download:
        prepare_all(data_dir, n_per_class=args.n_per_class)

    train_all(data_dir, config=config, test_size=args.test_size)

    _run_smoke_test(config)

    print(f"\n✅ Training complete! Heads saved to: {config.models_dir}")
|
|
|
|
|
|
|
|
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|