File size: 11,626 Bytes
3c7d0d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
{
  "variant": "str",
  "created_unix": 1782043477,
  "feature_cols": [
    "is_pass",
    "motif_len",
    "ref_copynum",
    "gt_repcn_max",
    "gt_repcn_min",
    "expansion_over_ref",
    "repci_width_max",
    "spanning_reads",
    "flanking_reads",
    "inrepeat_reads",
    "locus_depth",
    "gt_hom",
    "ref_tract_bp",
    "spanning_frac",
    "allele_vs_readlen",
    "motif_is_homopolymer",
    "gc_flank",
    "entropy_flank",
    "in_segdup",
    "in_difficult",
    "flank_lowmap"
  ],
  "n_features": 21,
  "tier_edges": [
    0.3,
    0.5,
    0.7
  ],
  "tier_names": [
    "LOW",
    "Warning",
    "Moderate",
    "High"
  ],
  "missing_sentinel": -99999.0,
  "rf_params": {
    "bootstrap": true,
    "ccp_alpha": 0.0,
    "class_weight": "balanced_subsample",
    "criterion": "gini",
    "max_depth": null,
    "max_features": "sqrt",
    "max_leaf_nodes": null,
    "max_samples": 2000000,
    "min_impurity_decrease": 0.0,
    "min_samples_leaf": 50,
    "min_samples_split": 2,
    "min_weight_fraction_leaf": 0.0,
    "monotonic_cst": null,
    "n_estimators": 300,
    "n_jobs": -1,
    "oob_score": false,
    "random_state": 42,
    "verbose": 0,
    "warm_start": false
  },
  "n_train_rows": 22651133,
  "n_samples": 208,
  "qc": {
    "label_rows_raw": 36254400,
    "label_dist_raw": {
      "concordant": 21350382,
      "discordant": 13838163,
      "unlabeled": 1065855
    },
    "label_rows_usable": 35188545,
    "ambiguous_keys_dropped": 0,
    "ambiguous_feat_rows": 0,
    "ambiguous_label_rows": 0,
    "dup_keys_feature": 0,
    "dup_keys_label": 0,
    "merged_rows": 22651133,
    "match_rate_vs_labels": 0.6437075758602693,
    "match_rate_vs_features": 0.9832673629385175,
    "class_balance": {
      "concordant": 13960015,
      "discordant": 8691118
    },
    "concordant_rate": 0.6163053742168217
  },
  "cv_folds": 5,
  "cv_fold_metrics": [
    {
      "n": 4469639,
      "pos_rate": 0.6172603648751052,
      "auroc": 0.8345731588413778,
      "auprc": 0.8868311937682424,
      "brier": 0.16715199887480572,
      "logloss": 0.505031384190826,
      "fold": 0,
      "seconds": 404.5
    },
    {
      "n": 4469658,
      "pos_rate": 0.6172628867801518,
      "auroc": 0.8348793797657998,
      "auprc": 0.8871277104956028,
      "brier": 0.16710046702995693,
      "logloss": 0.5048207582711781,
      "fold": 1,
      "seconds": 457.3
    },
    {
      "n": 4569998,
      "pos_rate": 0.6173429397562099,
      "auroc": 0.8345632397054213,
      "auprc": 0.8867279699640327,
      "brier": 0.16717765756008388,
      "logloss": 0.5050623605875793,
      "fold": 2,
      "seconds": 480.6
    },
    {
      "n": 4570859,
      "pos_rate": 0.6168989242503433,
      "auroc": 0.8350534258010407,
      "auprc": 0.8870572426757822,
      "brier": 0.1669604630273807,
      "logloss": 0.5044600822147348,
      "fold": 3,
      "seconds": 546.9
    },
    {
      "n": 4570979,
      "pos_rate": 0.6128043904817765,
      "auroc": 0.8317845587452297,
      "auprc": 0.8823297885222531,
      "brier": 0.16790578845588436,
      "logloss": 0.5066430427730261,
      "fold": 4,
      "seconds": 558.6
    }
  ],
  "cv_report": {
    "overall": {
      "n": 22651133,
      "pos_rate": 0.6163053742168217,
      "auroc": 0.8341539493365068,
      "auprc": 0.885996637709877,
      "brier": 0.16726047042063633,
      "logloss": 0.5052060179718258
    },
    "calibration": [
      {
        "bin": "[0.0,0.1)",
        "n": 759079,
        "mean_pred": 0.06623314081824806,
        "obs_rate": 0.027333123429840636
      },
      {
        "bin": "[0.1,0.2)",
        "n": 1807689,
        "mean_pred": 0.15353118408631086,
        "obs_rate": 0.1398288090484591
      },
      {
        "bin": "[0.2,0.3)",
        "n": 2278662,
        "mean_pred": 0.250703986073481,
        "obs_rate": 0.2854271497922904
      },
      {
        "bin": "[0.3,0.4)",
        "n": 2401825,
        "mean_pred": 0.35114505321433914,
        "obs_rate": 0.4219845325950059
      },
      {
        "bin": "[0.4,0.5)",
        "n": 2503890,
        "mean_pred": 0.4496778698066448,
        "obs_rate": 0.5559477453083003
      },
      {
        "bin": "[0.5,0.6)",
        "n": 2743182,
        "mean_pred": 0.5514420283736253,
        "obs_rate": 0.6633803371413198
      },
      {
        "bin": "[0.6,0.7)",
        "n": 3201411,
        "mean_pred": 0.6513120336728542,
        "obs_rate": 0.7673941271520589
      },
      {
        "bin": "[0.7,0.8)",
        "n": 2972899,
        "mean_pred": 0.7478180823491758,
        "obs_rate": 0.8596629081579966
      },
      {
        "bin": "[0.8,0.9)",
        "n": 2979925,
        "mean_pred": 0.8513437073854806,
        "obs_rate": 0.9412015403072225
      },
      {
        "bin": "[0.9,1.0)",
        "n": 1002571,
        "mean_pred": 0.9221679799864609,
        "obs_rate": 0.9910769411842154
      }
    ],
    "per_sample_auroc": {
      "n_samples": 208,
      "median": 0.8353140721290141,
      "p25": 0.8326614184016954,
      "p75": 0.8373927525350378,
      "min": 0.740174387702103,
      "max": 0.8401855333526593
    },
    "by_homopolymer": {
      "homopolymer": {
        "n": 176,
        "pos_rate": 0.0,
        "auroc": null,
        "auprc": null,
        "brier": 0.12461994174893026
      },
      "other": {
        "n": 22650957,
        "pos_rate": 0.6163101629657414,
        "auroc": 0.8341526308855854,
        "auprc": 0.8859973231761953,
        "brier": 0.16726080174142982,
        "logloss": 0.5052065639352175
      }
    },
    "by_is_pass": {
      "PASS": {
        "n": 22645309,
        "pos_rate": 0.6163365225000904,
        "auroc": 0.8341536917536043,
        "auprc": 0.8860084593752011,
        "brier": 0.1672574382686718,
        "logloss": 0.505198302627369
      },
      "nonPASS": {
        "n": 5824,
        "pos_rate": 0.4951923076923077,
        "auroc": 0.821139738835895,
        "auprc": 0.8249088115206255,
        "brier": 0.17905030870563365,
        "logloss": 0.5352053928461165
      }
    }
  },
  "importances": {
    "impurity": [
      {
        "feature": "entropy_flank",
        "impurity_importance": 0.28992320685730033
      },
      {
        "feature": "motif_len",
        "impurity_importance": 0.15078304844246473
      },
      {
        "feature": "gc_flank",
        "impurity_importance": 0.11765967510912077
      },
      {
        "feature": "ref_tract_bp",
        "impurity_importance": 0.09594543197447271
      },
      {
        "feature": "allele_vs_readlen",
        "impurity_importance": 0.06304989891121958
      },
      {
        "feature": "ref_copynum",
        "impurity_importance": 0.06281644250839796
      },
      {
        "feature": "gt_repcn_max",
        "impurity_importance": 0.045375808024477604
      },
      {
        "feature": "gt_repcn_min",
        "impurity_importance": 0.04503548319154128
      },
      {
        "feature": "flanking_reads",
        "impurity_importance": 0.04081082547154657
      },
      {
        "feature": "spanning_frac",
        "impurity_importance": 0.02788421749138721
      },
      {
        "feature": "expansion_over_ref",
        "impurity_importance": 0.017739812221077934
      },
      {
        "feature": "locus_depth",
        "impurity_importance": 0.014556405292958223
      },
      {
        "feature": "spanning_reads",
        "impurity_importance": 0.011672664495590936
      },
      {
        "feature": "in_difficult",
        "impurity_importance": 0.009656418449637608
      },
      {
        "feature": "gt_hom",
        "impurity_importance": 0.0024291103645865167
      },
      {
        "feature": "in_segdup",
        "impurity_importance": 0.001648983588740384
      },
      {
        "feature": "flank_lowmap",
        "impurity_importance": 0.001477948437034436
      },
      {
        "feature": "repci_width_max",
        "impurity_importance": 0.0012018133362063474
      },
      {
        "feature": "inrepeat_reads",
        "impurity_importance": 0.00033183288445321743
      },
      {
        "feature": "is_pass",
        "impurity_importance": 9.729477856164029e-07
      },
      {
        "feature": "motif_is_homopolymer",
        "impurity_importance": 0.0
      }
    ],
    "permutation": [
      {
        "feature": "entropy_flank",
        "perm_importance_mean": 0.13934060781658777,
        "perm_importance_std": 0.0006361765279266924
      },
      {
        "feature": "motif_len",
        "perm_importance_mean": 0.1232472127797279,
        "perm_importance_std": 0.0005893220011599711
      },
      {
        "feature": "gc_flank",
        "perm_importance_mean": 0.06320217026546789,
        "perm_importance_std": 0.00039522027338993824
      },
      {
        "feature": "ref_tract_bp",
        "perm_importance_mean": 0.056776687651067095,
        "perm_importance_std": 0.00015236878123781785
      },
      {
        "feature": "ref_copynum",
        "perm_importance_mean": 0.02267318905161917,
        "perm_importance_std": 0.00014989102435837524
      },
      {
        "feature": "allele_vs_readlen",
        "perm_importance_mean": 0.020529595235711205,
        "perm_importance_std": 0.00017190103816491447
      },
      {
        "feature": "gt_repcn_min",
        "perm_importance_mean": 0.01731383830567197,
        "perm_importance_std": 0.000195043199990813
      },
      {
        "feature": "gt_repcn_max",
        "perm_importance_mean": 0.014405902490600276,
        "perm_importance_std": 0.00013955774976049523
      },
      {
        "feature": "expansion_over_ref",
        "perm_importance_mean": 0.008579439049389648,
        "perm_importance_std": 8.141211169349268e-05
      },
      {
        "feature": "flanking_reads",
        "perm_importance_mean": 0.005908979701386818,
        "perm_importance_std": 8.933000723756271e-05
      },
      {
        "feature": "spanning_frac",
        "perm_importance_mean": 0.005236130437139996,
        "perm_importance_std": 4.831785228506296e-05
      },
      {
        "feature": "in_difficult",
        "perm_importance_mean": 0.003852866555695589,
        "perm_importance_std": 2.129084797378384e-05
      },
      {
        "feature": "spanning_reads",
        "perm_importance_mean": 0.0029217009056680563,
        "perm_importance_std": 4.176582259464099e-05
      },
      {
        "feature": "gt_hom",
        "perm_importance_mean": 0.002172501389781667,
        "perm_importance_std": 8.3379119655914e-06
      },
      {
        "feature": "locus_depth",
        "perm_importance_mean": 0.0020709165127682284,
        "perm_importance_std": 2.549011860464095e-05
      },
      {
        "feature": "in_segdup",
        "perm_importance_mean": 0.0009386532858458585,
        "perm_importance_std": 1.750671402431846e-05
      },
      {
        "feature": "flank_lowmap",
        "perm_importance_mean": 0.0005812032061902617,
        "perm_importance_std": 1.3028115550094254e-05
      },
      {
        "feature": "repci_width_max",
        "perm_importance_mean": 0.00026026760399893155,
        "perm_importance_std": 1.492427417015547e-05
      },
      {
        "feature": "inrepeat_reads",
        "perm_importance_mean": 5.1632608300478114e-05,
        "perm_importance_std": 5.166444569830962e-06
      },
      {
        "feature": "is_pass",
        "perm_importance_mean": 5.758337677796987e-08,
        "perm_importance_std": 3.3427425855204445e-08
      },
      {
        "feature": "motif_is_homopolymer",
        "perm_importance_mean": 0.0,
        "perm_importance_std": 0.0
      }
    ]
  }
}