{
  "model_name": "SV-SPR (caller-agnostic, reference-only)",
  "model_file": "model/svspr_v14_seq.pkl",
  "model_version": "v14_seq_only_unified",
  "model_sha256": "36b02f6249ec1858b21b0a9836590a833e7badb1d08e4b09b6e8a1908527dc46",
  "architecture": "sklearn.ensemble.RandomForestClassifier",
  "n_estimators": 200,
  "class_weight": "balanced",
  "trained_with_sklearn": "1.4.0",
  "verified_with_sklearn": "1.6.1",
  "random_seed": 42,
  "training": {
    "csv_source": "sv_matched_features_v7_depth20.csv",
    "cohort": "143 Korean parents (probands held out), paired Illumina DRAGEN + PacBio HiFi",
    "label": "confirmed = SRS Manta call matched by LRS Sawfish within +-500bp, same svtype",
    "common_sv_scope": "KPPD_AF >= 0.01 (diploid, n_alt/(2*226))",
    "train_cap_per_fold": 300000,
    "cross_validation": "143-fold sample-LOSO (LeaveOneGroupOut by sample_id)",
    "cv_f1_avg": 0.9593,
    "cv_f1_ci95": [0.9564, 0.9616],
    "cv_auroc": 0.9739,
    "comparison_v13_parents_FULL_f1": 0.9308
  },
  "n_features": 11,
  "feature_order": [
    "svlen_abs_manta",
    "log10_svlen",
    "svtype_DEL_manta",
    "svtype_INS_manta",
    "svtype_DUP_manta",
    "svtype_BND_manta",
    "gc_flank_w100",
    "at_flank_w100",
    "gc_inner_w100",
    "n_motif_2_w100",
    "n_motif_3_w100"
  ],
  "features": {
    "svlen_abs_manta":  {"dtype": "float", "desc": "abs(SVLEN) in bp", "source": "VCF INFO SVLEN or END-POS", "default": 0},
    "log10_svlen":      {"dtype": "float", "desc": "log10(svlen_abs + 1)", "source": "derived", "default": 0},
    "svtype_DEL_manta": {"dtype": "int{0,1}", "desc": "one-hot DEL", "source": "VCF INFO SVTYPE", "default": 0},
    "svtype_INS_manta": {"dtype": "int{0,1}", "desc": "one-hot INS", "source": "VCF INFO SVTYPE", "default": 0},
    "svtype_DUP_manta": {"dtype": "int{0,1}", "desc": "one-hot DUP", "source": "VCF INFO SVTYPE", "default": 0},
    "svtype_BND_manta": {"dtype": "int{0,1}", "desc": "one-hot BND (INV grouped here)", "source": "VCF INFO SVTYPE", "default": 0},
    "gc_flank_w100":    {"dtype": "float[0,1]", "desc": "GC fraction in +-100bp flanks (5'+3' averaged)", "source": "reference FASTA", "default": 0},
    "at_flank_w100":    {"dtype": "float[0,1]", "desc": "AT fraction in +-100bp flanks", "source": "reference FASTA", "default": 0},
    "gc_inner_w100":    {"dtype": "float[0,1]", "desc": "GC fraction inside DEL/DUP span (INS: insseq; fallback=gc_flank)", "source": "reference FASTA", "default": 0},
    "n_motif_2_w100":   {"dtype": "int", "desc": "count of dinucleotide tandem motifs in flank", "source": "reference FASTA", "default": 0},
    "n_motif_3_w100":   {"dtype": "int", "desc": "count of trinucleotide tandem motifs in flank", "source": "reference FASTA", "default": 0}
  },
  "window_bp": 100,
  "required_inputs": ["VCF: chrom,pos,end/SVLEN,SVTYPE", "reference FASTA (GRCh38)"],
  "not_required": ["BAM/CRAM", "caller-specific INFO/FORMAT fields (QUAL, GQ, PR, SR, IMPRECISE)"],
  "output": {"CS": "P(confirmed by LRS), float[0,1]", "tier": "High >= 0.9 | Moderate >= 0.7 and < 0.9 | Warning >= 0.5 and < 0.7 | Low < 0.5 (matches Methods 2.7.2)"},
  "calibration_note": "CS is the RandomForest positive-class probability and is NOT calibrated out-of-the-box (held-out ECE ~= 0.07; under-confident in mid-range). Apply isotonic/Platt calibration before treating CS as a literal probability."
}