Upload folder using huggingface_hub
Browse files
- config.json +24 -0
- edge_case_results.csv +39 -0
- metadata.json +18 -0
- model.safetensors +3 -0
- predictions.csv +0 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +58 -0
- vocab.txt +0 -0
config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"dtype": "float32",
|
| 8 |
+
"hidden_act": "gelu",
|
| 9 |
+
"hidden_dropout_prob": 0.1,
|
| 10 |
+
"hidden_size": 768,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": 3072,
|
| 13 |
+
"layer_norm_eps": 1e-12,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"model_type": "bert",
|
| 16 |
+
"num_attention_heads": 12,
|
| 17 |
+
"num_hidden_layers": 12,
|
| 18 |
+
"pad_token_id": 0,
|
| 19 |
+
"position_embedding_type": "absolute",
|
| 20 |
+
"transformers_version": "4.57.1",
|
| 21 |
+
"type_vocab_size": 2,
|
| 22 |
+
"use_cache": true,
|
| 23 |
+
"vocab_size": 31090
|
| 24 |
+
}
|
edge_case_results.csv
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
id,doi,title,category,expected,predicted,confidence,correct_class_conf,passed,change
|
| 2 |
+
fair_principles,,The FAIR Guiding Principles for scientific data management and stewardship,papers_about_data,False,False,0.955,0.955,True,unchanged
|
| 3 |
+
uniprot_database,,UniProt: the universal protein knowledgebase in 2021,database_paper,True,True,0.9437,0.9437,True,unchanged
|
| 4 |
+
bwa_tool,,Fast and accurate short read alignment with Burrows-Wheeler transform,methods_paper,False,False,0.9572,0.9572,True,unchanged
|
| 5 |
+
metabolome_atlas,,A metabolome atlas of the aging mouse brain,atlas_paper,True,True,0.9358,0.9358,True,unchanged
|
| 6 |
+
data_management_review,,A systematic review of data management practices in clinical research,review_paper,False,False,0.9505,0.9505,True,unchanged
|
| 7 |
+
gtex_consortium,,The GTEx Consortium atlas of genetic regulatory effects across human tissues,consortium_data,True,True,0.9559,0.9559,True,unchanged
|
| 8 |
+
alphafold_method,,Highly accurate protein structure prediction with AlphaFold,methods_paper,False,False,0.9487,0.9487,True,unchanged
|
| 9 |
+
imagenet_dataset,,ImageNet: A large-scale hierarchical image database,benchmark_dataset,True,True,0.9364,0.9364,True,unchanged
|
| 10 |
+
deseq2_method,,Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2,methods_paper,False,False,0.9561,0.9561,True,unchanged
|
| 11 |
+
tcga_pancancer,,The Cancer Genome Atlas Pan-Cancer analysis project,consortium_data,True,True,0.9595,0.9595,True,unchanged
|
| 12 |
+
data_sharing_perspective,,Data sharing in the age of deep learning: A perspective on challenges and opportunities,perspective_paper,False,False,0.9398,0.9398,True,improved
|
| 13 |
+
zenodo_deposited,,A curated dataset of protein-ligand binding affinities,curated_dataset,True,True,0.9464,0.9464,True,unchanged
|
| 14 |
+
nar_methods_spotlight,,SPOTlight: seeded NMF regression to deconvolute spatial transcriptomics spots,methods_paper,False,False,0.9559,0.9559,True,unchanged
|
| 15 |
+
scientific_data_descriptor,,A comprehensive single-cell transcriptome atlas of human pancreatic islets,data_descriptor,True,True,0.9476,0.9476,True,unchanged
|
| 16 |
+
benchmark_with_method,,A benchmark dataset for evaluating drug response prediction methods,benchmark_dataset,True,True,0.9251,0.9251,True,degraded
|
| 17 |
+
database_update,,The 2024 update of the RCSB Protein Data Bank,database_update,True,True,0.951,0.951,True,unchanged
|
| 18 |
+
survey_ml_datasets,,A survey of datasets for machine learning in computational biology,survey_paper,False,False,0.8789,0.8789,True,FIXED
|
| 19 |
+
protocol_with_data,,Protocol for generating CRISPR knockout libraries with integrated sequencing data,protocol_paper,False,False,0.8418,0.8418,True,improved
|
| 20 |
+
encode_encyclopedia,10.1038/nature11247,An integrated encyclopedia of DNA elements in the human genome,consortium_data,True,True,0.9559,0.9559,True,unchanged
|
| 21 |
+
hgp_initial_draft,10.1038/35057062,Initial sequencing and analysis of the human genome,genome_resource,True,True,0.9597,0.9597,True,unchanged
|
| 22 |
+
hgp_finished_sequence,10.1038/nature03001,Finishing the euchromatic sequence of the human genome,genome_resource,True,True,0.9551,0.9551,True,unchanged
|
| 23 |
+
celera_human_genome,10.1126/science.1058040,The Sequence of the Human Genome,genome_resource,True,True,0.9484,0.9484,True,unchanged
|
| 24 |
+
1000g_phase1,10.1038/nature09534,A map of human genome variation from population-scale sequencing,consortium_data,True,True,0.951,0.951,True,unchanged
|
| 25 |
+
1000g_phase2,10.1038/nature11632,"An integrated map of genetic variation from 1,092 human genomes",consortium_data,True,True,0.9489,0.9489,True,unchanged
|
| 26 |
+
1000g_phase3,10.1038/nature15393,A global reference for human genetic variation,consortium_data,True,True,0.9571,0.9571,True,unchanged
|
| 27 |
+
uk_biobank,10.1371/journal.pmed.1001779,UK Biobank: An Open Access Resource for Identifying the Causes of a Wide Range of Complex Diseases of Middle and Old Age,cohort_resource,True,True,0.9471,0.9471,True,unchanged
|
| 28 |
+
tcia_archive,10.1007/s10278-013-9622-7,The Cancer Imaging Archive (TCIA): Maintaining and Operating a Public Information Repository,imaging_resource,True,True,0.9544,0.9544,True,unchanged
|
| 29 |
+
icgc_data_portal,10.1038/s41587-019-0055-9,The International Cancer Genome Consortium Data Portal,database_paper,True,True,0.9622,0.9622,True,unchanged
|
| 30 |
+
nih_chest_xray,10.1109/CVPR.2017.369,ChestX-Ray8: Hospital-Scale Chest X-Ray Database and Benchmarks on Weakly-Supervised Classification and Localization of Common Thorax Diseases,benchmark_dataset,True,True,0.9238,0.9238,True,unchanged
|
| 31 |
+
mimic_iii,10.1038/sdata.2016.35,"MIMIC-III, a freely accessible critical care database",clinical_database,True,True,0.9519,0.9519,True,unchanged
|
| 32 |
+
mimic_iv,10.1038/s41597-022-01899-x,"MIMIC-IV, a freely accessible electronic health record dataset",clinical_database,True,True,0.9541,0.9541,True,unchanged
|
| 33 |
+
facebase3,10.1242/dev.191213,FaceBase 3: analytical tools and FAIR resources for craniofacial and dental research,database_paper,True,True,0.95,0.95,True,unchanged
|
| 34 |
+
topmed_sequencing,10.1038/s41586-021-03205-y,"Sequencing of 53,831 diverse genomes from the NHLBI TOPMed Program",consortium_data,True,True,0.9556,0.9556,True,unchanged
|
| 35 |
+
fastmri_dataset,10.48550/arXiv.1811.08839,fastMRI: An Open Dataset and Benchmarks for Accelerated MRI,benchmark_dataset,True,True,0.9503,0.9503,True,unchanged
|
| 36 |
+
brain_initiative_cell_census,10.1038/s41586-021-03950-0,A multimodal cell census and atlas of the mammalian primary motor cortex,consortium_data,True,True,0.9446,0.9446,True,unchanged
|
| 37 |
+
brain_initiative_merfish,10.1038/s41586-021-03705-x,Spatially resolved cell atlas of the mouse primary motor cortex by MERFISH,consortium_data,True,True,0.8465,0.8465,True,improved
|
| 38 |
+
framingham_original,10.2105/ajph.41.3.279,Epidemiological Approaches to Heart Disease: The Framingham Study,cohort_study_design,False,False,0.8019,0.8019,True,improved
|
| 39 |
+
framingham_cohort_profile,10.1093/ije/dyv337,Cohort Profile: The Framingham Heart Study (FHS): overview of milestones in cardiovascular epidemiology,cohort_resource,True,True,0.9385,0.9385,True,unchanged
|
metadata.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "scibert",
|
| 3 |
+
"base_model": "allenai/scibert_scivocab_uncased",
|
| 4 |
+
"continued_training": true,
|
| 5 |
+
"initial_epochs": 5,
|
| 6 |
+
"continue_epochs": 3,
|
| 7 |
+
"initial_lr": 2e-05,
|
| 8 |
+
"continue_lr": 5e-06,
|
| 9 |
+
"edge_weight": 5,
|
| 10 |
+
"label_smoothing": 0.1,
|
| 11 |
+
"sample_ratio": 0.3,
|
| 12 |
+
"fp16": true,
|
| 13 |
+
"batch_size": 24,
|
| 14 |
+
"edge_cases_before": 37,
|
| 15 |
+
"edge_cases_after": 38,
|
| 16 |
+
"edge_cases_total": 38,
|
| 17 |
+
"timestamp": "2026-02-10T00:20:39.488330"
|
| 18 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3a8e2b9a93fe67bc1f9db19eb2215eb1c87e1dc18c3d0cf310cb02af44f9872
|
| 3 |
+
size 439703544
|
predictions.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"101": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"102": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"103": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"104": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": true,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_basic_tokenize": true,
|
| 47 |
+
"do_lower_case": true,
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "[MASK]",
|
| 50 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 51 |
+
"never_split": null,
|
| 52 |
+
"pad_token": "[PAD]",
|
| 53 |
+
"sep_token": "[SEP]",
|
| 54 |
+
"strip_accents": null,
|
| 55 |
+
"tokenize_chinese_chars": true,
|
| 56 |
+
"tokenizer_class": "BertTokenizer",
|
| 57 |
+
"unk_token": "[UNK]"
|
| 58 |
+
}
|
vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|