Tabular Classification
Scikit-learn
Joblib
genomics
structural-variants
short-tandem-repeats
variant-calling
confidence-calibration
random-forest
Instructions to use khyeom/SVSTR-Score with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use khyeom/SVSTR-Score with Scikit-learn:
```python
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("khyeom/SVSTR-Score", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "variant": "str", | |
| "created_unix": 1782043477, | |
| "feature_cols": [ | |
| "is_pass", | |
| "motif_len", | |
| "ref_copynum", | |
| "gt_repcn_max", | |
| "gt_repcn_min", | |
| "expansion_over_ref", | |
| "repci_width_max", | |
| "spanning_reads", | |
| "flanking_reads", | |
| "inrepeat_reads", | |
| "locus_depth", | |
| "gt_hom", | |
| "ref_tract_bp", | |
| "spanning_frac", | |
| "allele_vs_readlen", | |
| "motif_is_homopolymer", | |
| "gc_flank", | |
| "entropy_flank", | |
| "in_segdup", | |
| "in_difficult", | |
| "flank_lowmap" | |
| ], | |
| "n_features": 21, | |
| "tier_edges": [ | |
| 0.3, | |
| 0.5, | |
| 0.7 | |
| ], | |
| "tier_names": [ | |
| "LOW", | |
| "Warning", | |
| "Moderate", | |
| "High" | |
| ], | |
| "missing_sentinel": -99999.0, | |
| "rf_params": { | |
| "bootstrap": true, | |
| "ccp_alpha": 0.0, | |
| "class_weight": "balanced_subsample", | |
| "criterion": "gini", | |
| "max_depth": null, | |
| "max_features": "sqrt", | |
| "max_leaf_nodes": null, | |
| "max_samples": 2000000, | |
| "min_impurity_decrease": 0.0, | |
| "min_samples_leaf": 50, | |
| "min_samples_split": 2, | |
| "min_weight_fraction_leaf": 0.0, | |
| "monotonic_cst": null, | |
| "n_estimators": 300, | |
| "n_jobs": -1, | |
| "oob_score": false, | |
| "random_state": 42, | |
| "verbose": 0, | |
| "warm_start": false | |
| }, | |
| "n_train_rows": 22651133, | |
| "n_samples": 208, | |
| "qc": { | |
| "label_rows_raw": 36254400, | |
| "label_dist_raw": { | |
| "concordant": 21350382, | |
| "discordant": 13838163, | |
| "unlabeled": 1065855 | |
| }, | |
| "label_rows_usable": 35188545, | |
| "ambiguous_keys_dropped": 0, | |
| "ambiguous_feat_rows": 0, | |
| "ambiguous_label_rows": 0, | |
| "dup_keys_feature": 0, | |
| "dup_keys_label": 0, | |
| "merged_rows": 22651133, | |
| "match_rate_vs_labels": 0.6437075758602693, | |
| "match_rate_vs_features": 0.9832673629385175, | |
| "class_balance": { | |
| "concordant": 13960015, | |
| "discordant": 8691118 | |
| }, | |
| "concordant_rate": 0.6163053742168217 | |
| }, | |
| "cv_folds": 5, | |
| "cv_fold_metrics": [ | |
| { | |
| "n": 4469639, | |
| "pos_rate": 0.6172603648751052, | |
| "auroc": 0.8345731588413778, | |
| "auprc": 0.8868311937682424, | |
| "brier": 0.16715199887480572, | |
| "logloss": 0.505031384190826, | |
| "fold": 0, | |
| "seconds": 404.5 | |
| }, | |
| { | |
| "n": 4469658, | |
| "pos_rate": 0.6172628867801518, | |
| "auroc": 0.8348793797657998, | |
| "auprc": 0.8871277104956028, | |
| "brier": 0.16710046702995693, | |
| "logloss": 0.5048207582711781, | |
| "fold": 1, | |
| "seconds": 457.3 | |
| }, | |
| { | |
| "n": 4569998, | |
| "pos_rate": 0.6173429397562099, | |
| "auroc": 0.8345632397054213, | |
| "auprc": 0.8867279699640327, | |
| "brier": 0.16717765756008388, | |
| "logloss": 0.5050623605875793, | |
| "fold": 2, | |
| "seconds": 480.6 | |
| }, | |
| { | |
| "n": 4570859, | |
| "pos_rate": 0.6168989242503433, | |
| "auroc": 0.8350534258010407, | |
| "auprc": 0.8870572426757822, | |
| "brier": 0.1669604630273807, | |
| "logloss": 0.5044600822147348, | |
| "fold": 3, | |
| "seconds": 546.9 | |
| }, | |
| { | |
| "n": 4570979, | |
| "pos_rate": 0.6128043904817765, | |
| "auroc": 0.8317845587452297, | |
| "auprc": 0.8823297885222531, | |
| "brier": 0.16790578845588436, | |
| "logloss": 0.5066430427730261, | |
| "fold": 4, | |
| "seconds": 558.6 | |
| } | |
| ], | |
| "cv_report": { | |
| "overall": { | |
| "n": 22651133, | |
| "pos_rate": 0.6163053742168217, | |
| "auroc": 0.8341539493365068, | |
| "auprc": 0.885996637709877, | |
| "brier": 0.16726047042063633, | |
| "logloss": 0.5052060179718258 | |
| }, | |
| "calibration": [ | |
| { | |
| "bin": "[0.0,0.1)", | |
| "n": 759079, | |
| "mean_pred": 0.06623314081824806, | |
| "obs_rate": 0.027333123429840636 | |
| }, | |
| { | |
| "bin": "[0.1,0.2)", | |
| "n": 1807689, | |
| "mean_pred": 0.15353118408631086, | |
| "obs_rate": 0.1398288090484591 | |
| }, | |
| { | |
| "bin": "[0.2,0.3)", | |
| "n": 2278662, | |
| "mean_pred": 0.250703986073481, | |
| "obs_rate": 0.2854271497922904 | |
| }, | |
| { | |
| "bin": "[0.3,0.4)", | |
| "n": 2401825, | |
| "mean_pred": 0.35114505321433914, | |
| "obs_rate": 0.4219845325950059 | |
| }, | |
| { | |
| "bin": "[0.4,0.5)", | |
| "n": 2503890, | |
| "mean_pred": 0.4496778698066448, | |
| "obs_rate": 0.5559477453083003 | |
| }, | |
| { | |
| "bin": "[0.5,0.6)", | |
| "n": 2743182, | |
| "mean_pred": 0.5514420283736253, | |
| "obs_rate": 0.6633803371413198 | |
| }, | |
| { | |
| "bin": "[0.6,0.7)", | |
| "n": 3201411, | |
| "mean_pred": 0.6513120336728542, | |
| "obs_rate": 0.7673941271520589 | |
| }, | |
| { | |
| "bin": "[0.7,0.8)", | |
| "n": 2972899, | |
| "mean_pred": 0.7478180823491758, | |
| "obs_rate": 0.8596629081579966 | |
| }, | |
| { | |
| "bin": "[0.8,0.9)", | |
| "n": 2979925, | |
| "mean_pred": 0.8513437073854806, | |
| "obs_rate": 0.9412015403072225 | |
| }, | |
| { | |
| "bin": "[0.9,1.0)", | |
| "n": 1002571, | |
| "mean_pred": 0.9221679799864609, | |
| "obs_rate": 0.9910769411842154 | |
| } | |
| ], | |
| "per_sample_auroc": { | |
| "n_samples": 208, | |
| "median": 0.8353140721290141, | |
| "p25": 0.8326614184016954, | |
| "p75": 0.8373927525350378, | |
| "min": 0.740174387702103, | |
| "max": 0.8401855333526593 | |
| }, | |
| "by_homopolymer": { | |
| "homopolymer": { | |
| "n": 176, | |
| "pos_rate": 0.0, | |
| "auroc": null, | |
| "auprc": null, | |
| "brier": 0.12461994174893026 | |
| }, | |
| "other": { | |
| "n": 22650957, | |
| "pos_rate": 0.6163101629657414, | |
| "auroc": 0.8341526308855854, | |
| "auprc": 0.8859973231761953, | |
| "brier": 0.16726080174142982, | |
| "logloss": 0.5052065639352175 | |
| } | |
| }, | |
| "by_is_pass": { | |
| "PASS": { | |
| "n": 22645309, | |
| "pos_rate": 0.6163365225000904, | |
| "auroc": 0.8341536917536043, | |
| "auprc": 0.8860084593752011, | |
| "brier": 0.1672574382686718, | |
| "logloss": 0.505198302627369 | |
| }, | |
| "nonPASS": { | |
| "n": 5824, | |
| "pos_rate": 0.4951923076923077, | |
| "auroc": 0.821139738835895, | |
| "auprc": 0.8249088115206255, | |
| "brier": 0.17905030870563365, | |
| "logloss": 0.5352053928461165 | |
| } | |
| } | |
| }, | |
| "importances": { | |
| "impurity": [ | |
| { | |
| "feature": "entropy_flank", | |
| "impurity_importance": 0.28992320685730033 | |
| }, | |
| { | |
| "feature": "motif_len", | |
| "impurity_importance": 0.15078304844246473 | |
| }, | |
| { | |
| "feature": "gc_flank", | |
| "impurity_importance": 0.11765967510912077 | |
| }, | |
| { | |
| "feature": "ref_tract_bp", | |
| "impurity_importance": 0.09594543197447271 | |
| }, | |
| { | |
| "feature": "allele_vs_readlen", | |
| "impurity_importance": 0.06304989891121958 | |
| }, | |
| { | |
| "feature": "ref_copynum", | |
| "impurity_importance": 0.06281644250839796 | |
| }, | |
| { | |
| "feature": "gt_repcn_max", | |
| "impurity_importance": 0.045375808024477604 | |
| }, | |
| { | |
| "feature": "gt_repcn_min", | |
| "impurity_importance": 0.04503548319154128 | |
| }, | |
| { | |
| "feature": "flanking_reads", | |
| "impurity_importance": 0.04081082547154657 | |
| }, | |
| { | |
| "feature": "spanning_frac", | |
| "impurity_importance": 0.02788421749138721 | |
| }, | |
| { | |
| "feature": "expansion_over_ref", | |
| "impurity_importance": 0.017739812221077934 | |
| }, | |
| { | |
| "feature": "locus_depth", | |
| "impurity_importance": 0.014556405292958223 | |
| }, | |
| { | |
| "feature": "spanning_reads", | |
| "impurity_importance": 0.011672664495590936 | |
| }, | |
| { | |
| "feature": "in_difficult", | |
| "impurity_importance": 0.009656418449637608 | |
| }, | |
| { | |
| "feature": "gt_hom", | |
| "impurity_importance": 0.0024291103645865167 | |
| }, | |
| { | |
| "feature": "in_segdup", | |
| "impurity_importance": 0.001648983588740384 | |
| }, | |
| { | |
| "feature": "flank_lowmap", | |
| "impurity_importance": 0.001477948437034436 | |
| }, | |
| { | |
| "feature": "repci_width_max", | |
| "impurity_importance": 0.0012018133362063474 | |
| }, | |
| { | |
| "feature": "inrepeat_reads", | |
| "impurity_importance": 0.00033183288445321743 | |
| }, | |
| { | |
| "feature": "is_pass", | |
| "impurity_importance": 9.729477856164029e-07 | |
| }, | |
| { | |
| "feature": "motif_is_homopolymer", | |
| "impurity_importance": 0.0 | |
| } | |
| ], | |
| "permutation": [ | |
| { | |
| "feature": "entropy_flank", | |
| "perm_importance_mean": 0.13934060781658777, | |
| "perm_importance_std": 0.0006361765279266924 | |
| }, | |
| { | |
| "feature": "motif_len", | |
| "perm_importance_mean": 0.1232472127797279, | |
| "perm_importance_std": 0.0005893220011599711 | |
| }, | |
| { | |
| "feature": "gc_flank", | |
| "perm_importance_mean": 0.06320217026546789, | |
| "perm_importance_std": 0.00039522027338993824 | |
| }, | |
| { | |
| "feature": "ref_tract_bp", | |
| "perm_importance_mean": 0.056776687651067095, | |
| "perm_importance_std": 0.00015236878123781785 | |
| }, | |
| { | |
| "feature": "ref_copynum", | |
| "perm_importance_mean": 0.02267318905161917, | |
| "perm_importance_std": 0.00014989102435837524 | |
| }, | |
| { | |
| "feature": "allele_vs_readlen", | |
| "perm_importance_mean": 0.020529595235711205, | |
| "perm_importance_std": 0.00017190103816491447 | |
| }, | |
| { | |
| "feature": "gt_repcn_min", | |
| "perm_importance_mean": 0.01731383830567197, | |
| "perm_importance_std": 0.000195043199990813 | |
| }, | |
| { | |
| "feature": "gt_repcn_max", | |
| "perm_importance_mean": 0.014405902490600276, | |
| "perm_importance_std": 0.00013955774976049523 | |
| }, | |
| { | |
| "feature": "expansion_over_ref", | |
| "perm_importance_mean": 0.008579439049389648, | |
| "perm_importance_std": 8.141211169349268e-05 | |
| }, | |
| { | |
| "feature": "flanking_reads", | |
| "perm_importance_mean": 0.005908979701386818, | |
| "perm_importance_std": 8.933000723756271e-05 | |
| }, | |
| { | |
| "feature": "spanning_frac", | |
| "perm_importance_mean": 0.005236130437139996, | |
| "perm_importance_std": 4.831785228506296e-05 | |
| }, | |
| { | |
| "feature": "in_difficult", | |
| "perm_importance_mean": 0.003852866555695589, | |
| "perm_importance_std": 2.129084797378384e-05 | |
| }, | |
| { | |
| "feature": "spanning_reads", | |
| "perm_importance_mean": 0.0029217009056680563, | |
| "perm_importance_std": 4.176582259464099e-05 | |
| }, | |
| { | |
| "feature": "gt_hom", | |
| "perm_importance_mean": 0.002172501389781667, | |
| "perm_importance_std": 8.3379119655914e-06 | |
| }, | |
| { | |
| "feature": "locus_depth", | |
| "perm_importance_mean": 0.0020709165127682284, | |
| "perm_importance_std": 2.549011860464095e-05 | |
| }, | |
| { | |
| "feature": "in_segdup", | |
| "perm_importance_mean": 0.0009386532858458585, | |
| "perm_importance_std": 1.750671402431846e-05 | |
| }, | |
| { | |
| "feature": "flank_lowmap", | |
| "perm_importance_mean": 0.0005812032061902617, | |
| "perm_importance_std": 1.3028115550094254e-05 | |
| }, | |
| { | |
| "feature": "repci_width_max", | |
| "perm_importance_mean": 0.00026026760399893155, | |
| "perm_importance_std": 1.492427417015547e-05 | |
| }, | |
| { | |
| "feature": "inrepeat_reads", | |
| "perm_importance_mean": 5.1632608300478114e-05, | |
| "perm_importance_std": 5.166444569830962e-06 | |
| }, | |
| { | |
| "feature": "is_pass", | |
| "perm_importance_mean": 5.758337677796987e-08, | |
| "perm_importance_std": 3.3427425855204445e-08 | |
| }, | |
| { | |
| "feature": "motif_is_homopolymer", | |
| "perm_importance_mean": 0.0, | |
| "perm_importance_std": 0.0 | |
| } | |
| ] | |
| } | |
| } |