SVSTR-Score / str_model_meta.json
khyeom's picture
Release v1.0: HPRC-trained 35/21-feature calibrated SV+STR models (#1)
3c7d0d1
Raw
History Blame Contribute Delete
11.6 kB
{
"variant": "str",
"created_unix": 1782043477,
"feature_cols": [
"is_pass",
"motif_len",
"ref_copynum",
"gt_repcn_max",
"gt_repcn_min",
"expansion_over_ref",
"repci_width_max",
"spanning_reads",
"flanking_reads",
"inrepeat_reads",
"locus_depth",
"gt_hom",
"ref_tract_bp",
"spanning_frac",
"allele_vs_readlen",
"motif_is_homopolymer",
"gc_flank",
"entropy_flank",
"in_segdup",
"in_difficult",
"flank_lowmap"
],
"n_features": 21,
"tier_edges": [
0.3,
0.5,
0.7
],
"tier_names": [
"LOW",
"Warning",
"Moderate",
"High"
],
"missing_sentinel": -99999.0,
"rf_params": {
"bootstrap": true,
"ccp_alpha": 0.0,
"class_weight": "balanced_subsample",
"criterion": "gini",
"max_depth": null,
"max_features": "sqrt",
"max_leaf_nodes": null,
"max_samples": 2000000,
"min_impurity_decrease": 0.0,
"min_samples_leaf": 50,
"min_samples_split": 2,
"min_weight_fraction_leaf": 0.0,
"monotonic_cst": null,
"n_estimators": 300,
"n_jobs": -1,
"oob_score": false,
"random_state": 42,
"verbose": 0,
"warm_start": false
},
"n_train_rows": 22651133,
"n_samples": 208,
"qc": {
"label_rows_raw": 36254400,
"label_dist_raw": {
"concordant": 21350382,
"discordant": 13838163,
"unlabeled": 1065855
},
"label_rows_usable": 35188545,
"ambiguous_keys_dropped": 0,
"ambiguous_feat_rows": 0,
"ambiguous_label_rows": 0,
"dup_keys_feature": 0,
"dup_keys_label": 0,
"merged_rows": 22651133,
"match_rate_vs_labels": 0.6437075758602693,
"match_rate_vs_features": 0.9832673629385175,
"class_balance": {
"concordant": 13960015,
"discordant": 8691118
},
"concordant_rate": 0.6163053742168217
},
"cv_folds": 5,
"cv_fold_metrics": [
{
"n": 4469639,
"pos_rate": 0.6172603648751052,
"auroc": 0.8345731588413778,
"auprc": 0.8868311937682424,
"brier": 0.16715199887480572,
"logloss": 0.505031384190826,
"fold": 0,
"seconds": 404.5
},
{
"n": 4469658,
"pos_rate": 0.6172628867801518,
"auroc": 0.8348793797657998,
"auprc": 0.8871277104956028,
"brier": 0.16710046702995693,
"logloss": 0.5048207582711781,
"fold": 1,
"seconds": 457.3
},
{
"n": 4569998,
"pos_rate": 0.6173429397562099,
"auroc": 0.8345632397054213,
"auprc": 0.8867279699640327,
"brier": 0.16717765756008388,
"logloss": 0.5050623605875793,
"fold": 2,
"seconds": 480.6
},
{
"n": 4570859,
"pos_rate": 0.6168989242503433,
"auroc": 0.8350534258010407,
"auprc": 0.8870572426757822,
"brier": 0.1669604630273807,
"logloss": 0.5044600822147348,
"fold": 3,
"seconds": 546.9
},
{
"n": 4570979,
"pos_rate": 0.6128043904817765,
"auroc": 0.8317845587452297,
"auprc": 0.8823297885222531,
"brier": 0.16790578845588436,
"logloss": 0.5066430427730261,
"fold": 4,
"seconds": 558.6
}
],
"cv_report": {
"overall": {
"n": 22651133,
"pos_rate": 0.6163053742168217,
"auroc": 0.8341539493365068,
"auprc": 0.885996637709877,
"brier": 0.16726047042063633,
"logloss": 0.5052060179718258
},
"calibration": [
{
"bin": "[0.0,0.1)",
"n": 759079,
"mean_pred": 0.06623314081824806,
"obs_rate": 0.027333123429840636
},
{
"bin": "[0.1,0.2)",
"n": 1807689,
"mean_pred": 0.15353118408631086,
"obs_rate": 0.1398288090484591
},
{
"bin": "[0.2,0.3)",
"n": 2278662,
"mean_pred": 0.250703986073481,
"obs_rate": 0.2854271497922904
},
{
"bin": "[0.3,0.4)",
"n": 2401825,
"mean_pred": 0.35114505321433914,
"obs_rate": 0.4219845325950059
},
{
"bin": "[0.4,0.5)",
"n": 2503890,
"mean_pred": 0.4496778698066448,
"obs_rate": 0.5559477453083003
},
{
"bin": "[0.5,0.6)",
"n": 2743182,
"mean_pred": 0.5514420283736253,
"obs_rate": 0.6633803371413198
},
{
"bin": "[0.6,0.7)",
"n": 3201411,
"mean_pred": 0.6513120336728542,
"obs_rate": 0.7673941271520589
},
{
"bin": "[0.7,0.8)",
"n": 2972899,
"mean_pred": 0.7478180823491758,
"obs_rate": 0.8596629081579966
},
{
"bin": "[0.8,0.9)",
"n": 2979925,
"mean_pred": 0.8513437073854806,
"obs_rate": 0.9412015403072225
},
{
"bin": "[0.9,1.0)",
"n": 1002571,
"mean_pred": 0.9221679799864609,
"obs_rate": 0.9910769411842154
}
],
"per_sample_auroc": {
"n_samples": 208,
"median": 0.8353140721290141,
"p25": 0.8326614184016954,
"p75": 0.8373927525350378,
"min": 0.740174387702103,
"max": 0.8401855333526593
},
"by_homopolymer": {
"homopolymer": {
"n": 176,
"pos_rate": 0.0,
"auroc": null,
"auprc": null,
"brier": 0.12461994174893026
},
"other": {
"n": 22650957,
"pos_rate": 0.6163101629657414,
"auroc": 0.8341526308855854,
"auprc": 0.8859973231761953,
"brier": 0.16726080174142982,
"logloss": 0.5052065639352175
}
},
"by_is_pass": {
"PASS": {
"n": 22645309,
"pos_rate": 0.6163365225000904,
"auroc": 0.8341536917536043,
"auprc": 0.8860084593752011,
"brier": 0.1672574382686718,
"logloss": 0.505198302627369
},
"nonPASS": {
"n": 5824,
"pos_rate": 0.4951923076923077,
"auroc": 0.821139738835895,
"auprc": 0.8249088115206255,
"brier": 0.17905030870563365,
"logloss": 0.5352053928461165
}
}
},
"importances": {
"impurity": [
{
"feature": "entropy_flank",
"impurity_importance": 0.28992320685730033
},
{
"feature": "motif_len",
"impurity_importance": 0.15078304844246473
},
{
"feature": "gc_flank",
"impurity_importance": 0.11765967510912077
},
{
"feature": "ref_tract_bp",
"impurity_importance": 0.09594543197447271
},
{
"feature": "allele_vs_readlen",
"impurity_importance": 0.06304989891121958
},
{
"feature": "ref_copynum",
"impurity_importance": 0.06281644250839796
},
{
"feature": "gt_repcn_max",
"impurity_importance": 0.045375808024477604
},
{
"feature": "gt_repcn_min",
"impurity_importance": 0.04503548319154128
},
{
"feature": "flanking_reads",
"impurity_importance": 0.04081082547154657
},
{
"feature": "spanning_frac",
"impurity_importance": 0.02788421749138721
},
{
"feature": "expansion_over_ref",
"impurity_importance": 0.017739812221077934
},
{
"feature": "locus_depth",
"impurity_importance": 0.014556405292958223
},
{
"feature": "spanning_reads",
"impurity_importance": 0.011672664495590936
},
{
"feature": "in_difficult",
"impurity_importance": 0.009656418449637608
},
{
"feature": "gt_hom",
"impurity_importance": 0.0024291103645865167
},
{
"feature": "in_segdup",
"impurity_importance": 0.001648983588740384
},
{
"feature": "flank_lowmap",
"impurity_importance": 0.001477948437034436
},
{
"feature": "repci_width_max",
"impurity_importance": 0.0012018133362063474
},
{
"feature": "inrepeat_reads",
"impurity_importance": 0.00033183288445321743
},
{
"feature": "is_pass",
"impurity_importance": 9.729477856164029e-07
},
{
"feature": "motif_is_homopolymer",
"impurity_importance": 0.0
}
],
"permutation": [
{
"feature": "entropy_flank",
"perm_importance_mean": 0.13934060781658777,
"perm_importance_std": 0.0006361765279266924
},
{
"feature": "motif_len",
"perm_importance_mean": 0.1232472127797279,
"perm_importance_std": 0.0005893220011599711
},
{
"feature": "gc_flank",
"perm_importance_mean": 0.06320217026546789,
"perm_importance_std": 0.00039522027338993824
},
{
"feature": "ref_tract_bp",
"perm_importance_mean": 0.056776687651067095,
"perm_importance_std": 0.00015236878123781785
},
{
"feature": "ref_copynum",
"perm_importance_mean": 0.02267318905161917,
"perm_importance_std": 0.00014989102435837524
},
{
"feature": "allele_vs_readlen",
"perm_importance_mean": 0.020529595235711205,
"perm_importance_std": 0.00017190103816491447
},
{
"feature": "gt_repcn_min",
"perm_importance_mean": 0.01731383830567197,
"perm_importance_std": 0.000195043199990813
},
{
"feature": "gt_repcn_max",
"perm_importance_mean": 0.014405902490600276,
"perm_importance_std": 0.00013955774976049523
},
{
"feature": "expansion_over_ref",
"perm_importance_mean": 0.008579439049389648,
"perm_importance_std": 8.141211169349268e-05
},
{
"feature": "flanking_reads",
"perm_importance_mean": 0.005908979701386818,
"perm_importance_std": 8.933000723756271e-05
},
{
"feature": "spanning_frac",
"perm_importance_mean": 0.005236130437139996,
"perm_importance_std": 4.831785228506296e-05
},
{
"feature": "in_difficult",
"perm_importance_mean": 0.003852866555695589,
"perm_importance_std": 2.129084797378384e-05
},
{
"feature": "spanning_reads",
"perm_importance_mean": 0.0029217009056680563,
"perm_importance_std": 4.176582259464099e-05
},
{
"feature": "gt_hom",
"perm_importance_mean": 0.002172501389781667,
"perm_importance_std": 8.3379119655914e-06
},
{
"feature": "locus_depth",
"perm_importance_mean": 0.0020709165127682284,
"perm_importance_std": 2.549011860464095e-05
},
{
"feature": "in_segdup",
"perm_importance_mean": 0.0009386532858458585,
"perm_importance_std": 1.750671402431846e-05
},
{
"feature": "flank_lowmap",
"perm_importance_mean": 0.0005812032061902617,
"perm_importance_std": 1.3028115550094254e-05
},
{
"feature": "repci_width_max",
"perm_importance_mean": 0.00026026760399893155,
"perm_importance_std": 1.492427417015547e-05
},
{
"feature": "inrepeat_reads",
"perm_importance_mean": 5.1632608300478114e-05,
"perm_importance_std": 5.166444569830962e-06
},
{
"feature": "is_pass",
"perm_importance_mean": 5.758337677796987e-08,
"perm_importance_std": 3.3427425855204445e-08
},
{
"feature": "motif_is_homopolymer",
"perm_importance_mean": 0.0,
"perm_importance_std": 0.0
}
]
}
}