zehralx committed on
Commit
0909d27
·
verified ·
1 Parent(s): e387e3d

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "transformers_version": "4.57.1",
21
+ "type_vocab_size": 2,
22
+ "use_cache": true,
23
+ "vocab_size": 31090
24
+ }
edge_case_results.csv ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id,doi,title,category,expected,predicted,confidence,correct_class_conf,passed,change
2
+ fair_principles,,The FAIR Guiding Principles for scientific data management and stewardship,papers_about_data,False,False,0.955,0.955,True,unchanged
3
+ uniprot_database,,UniProt: the universal protein knowledgebase in 2021,database_paper,True,True,0.9437,0.9437,True,unchanged
4
+ bwa_tool,,Fast and accurate short read alignment with Burrows-Wheeler transform,methods_paper,False,False,0.9572,0.9572,True,unchanged
5
+ metabolome_atlas,,A metabolome atlas of the aging mouse brain,atlas_paper,True,True,0.9358,0.9358,True,unchanged
6
+ data_management_review,,A systematic review of data management practices in clinical research,review_paper,False,False,0.9505,0.9505,True,unchanged
7
+ gtex_consortium,,The GTEx Consortium atlas of genetic regulatory effects across human tissues,consortium_data,True,True,0.9559,0.9559,True,unchanged
8
+ alphafold_method,,Highly accurate protein structure prediction with AlphaFold,methods_paper,False,False,0.9487,0.9487,True,unchanged
9
+ imagenet_dataset,,ImageNet: A large-scale hierarchical image database,benchmark_dataset,True,True,0.9364,0.9364,True,unchanged
10
+ deseq2_method,,Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2,methods_paper,False,False,0.9561,0.9561,True,unchanged
11
+ tcga_pancancer,,The Cancer Genome Atlas Pan-Cancer analysis project,consortium_data,True,True,0.9595,0.9595,True,unchanged
12
+ data_sharing_perspective,,Data sharing in the age of deep learning: A perspective on challenges and opportunities,perspective_paper,False,False,0.9398,0.9398,True,improved
13
+ zenodo_deposited,,A curated dataset of protein-ligand binding affinities,curated_dataset,True,True,0.9464,0.9464,True,unchanged
14
+ nar_methods_spotlight,,SPOTlight: seeded NMF regression to deconvolute spatial transcriptomics spots,methods_paper,False,False,0.9559,0.9559,True,unchanged
15
+ scientific_data_descriptor,,A comprehensive single-cell transcriptome atlas of human pancreatic islets,data_descriptor,True,True,0.9476,0.9476,True,unchanged
16
+ benchmark_with_method,,A benchmark dataset for evaluating drug response prediction methods,benchmark_dataset,True,True,0.9251,0.9251,True,degraded
17
+ database_update,,The 2024 update of the RCSB Protein Data Bank,database_update,True,True,0.951,0.951,True,unchanged
18
+ survey_ml_datasets,,A survey of datasets for machine learning in computational biology,survey_paper,False,False,0.8789,0.8789,True,FIXED
19
+ protocol_with_data,,Protocol for generating CRISPR knockout libraries with integrated sequencing data,protocol_paper,False,False,0.8418,0.8418,True,improved
20
+ encode_encyclopedia,10.1038/nature11247,An integrated encyclopedia of DNA elements in the human genome,consortium_data,True,True,0.9559,0.9559,True,unchanged
21
+ hgp_initial_draft,10.1038/35057062,Initial sequencing and analysis of the human genome,genome_resource,True,True,0.9597,0.9597,True,unchanged
22
+ hgp_finished_sequence,10.1038/nature03001,Finishing the euchromatic sequence of the human genome,genome_resource,True,True,0.9551,0.9551,True,unchanged
23
+ celera_human_genome,10.1126/science.1058040,The Sequence of the Human Genome,genome_resource,True,True,0.9484,0.9484,True,unchanged
24
+ 1000g_phase1,10.1038/nature09534,A map of human genome variation from population-scale sequencing,consortium_data,True,True,0.951,0.951,True,unchanged
25
+ 1000g_phase2,10.1038/nature11632,"An integrated map of genetic variation from 1,092 human genomes",consortium_data,True,True,0.9489,0.9489,True,unchanged
26
+ 1000g_phase3,10.1038/nature15393,A global reference for human genetic variation,consortium_data,True,True,0.9571,0.9571,True,unchanged
27
+ uk_biobank,10.1371/journal.pmed.1001779,UK Biobank: An Open Access Resource for Identifying the Causes of a Wide Range of Complex Diseases of Middle and Old Age,cohort_resource,True,True,0.9471,0.9471,True,unchanged
28
+ tcia_archive,10.1007/s10278-013-9622-7,The Cancer Imaging Archive (TCIA): Maintaining and Operating a Public Information Repository,imaging_resource,True,True,0.9544,0.9544,True,unchanged
29
+ icgc_data_portal,10.1038/s41587-019-0055-9,The International Cancer Genome Consortium Data Portal,database_paper,True,True,0.9622,0.9622,True,unchanged
30
+ nih_chest_xray,10.1109/CVPR.2017.369,ChestX-Ray8: Hospital-Scale Chest X-Ray Database and Benchmarks on Weakly-Supervised Classification and Localization of Common Thorax Diseases,benchmark_dataset,True,True,0.9238,0.9238,True,unchanged
31
+ mimic_iii,10.1038/sdata.2016.35,"MIMIC-III, a freely accessible critical care database",clinical_database,True,True,0.9519,0.9519,True,unchanged
32
+ mimic_iv,10.1038/s41597-022-01899-x,"MIMIC-IV, a freely accessible electronic health record dataset",clinical_database,True,True,0.9541,0.9541,True,unchanged
33
+ facebase3,10.1242/dev.191213,FaceBase 3: analytical tools and FAIR resources for craniofacial and dental research,database_paper,True,True,0.95,0.95,True,unchanged
34
+ topmed_sequencing,10.1038/s41586-021-03205-y,"Sequencing of 53,831 diverse genomes from the NHLBI TOPMed Program",consortium_data,True,True,0.9556,0.9556,True,unchanged
35
+ fastmri_dataset,10.48550/arXiv.1811.08839,fastMRI: An Open Dataset and Benchmarks for Accelerated MRI,benchmark_dataset,True,True,0.9503,0.9503,True,unchanged
36
+ brain_initiative_cell_census,10.1038/s41586-021-03950-0,A multimodal cell census and atlas of the mammalian primary motor cortex,consortium_data,True,True,0.9446,0.9446,True,unchanged
37
+ brain_initiative_merfish,10.1038/s41586-021-03705-x,Spatially resolved cell atlas of the mouse primary motor cortex by MERFISH,consortium_data,True,True,0.8465,0.8465,True,improved
38
+ framingham_original,10.2105/ajph.41.3.279,Epidemiological Approaches to Heart Disease: The Framingham Study,cohort_study_design,False,False,0.8019,0.8019,True,improved
39
+ framingham_cohort_profile,10.1093/ije/dyv337,Cohort Profile: The Framingham Heart Study (FHS): overview of milestones in cardiovascular epidemiology,cohort_resource,True,True,0.9385,0.9385,True,unchanged
metadata.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "scibert",
3
+ "base_model": "allenai/scibert_scivocab_uncased",
4
+ "continued_training": true,
5
+ "initial_epochs": 5,
6
+ "continue_epochs": 3,
7
+ "initial_lr": 2e-05,
8
+ "continue_lr": 5e-06,
9
+ "edge_weight": 5,
10
+ "label_smoothing": 0.1,
11
+ "sample_ratio": 0.3,
12
+ "fp16": true,
13
+ "batch_size": 24,
14
+ "edge_cases_before": 37,
15
+ "edge_cases_after": 38,
16
+ "edge_cases_total": 38,
17
+ "timestamp": "2026-02-10T00:20:39.488330"
18
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3a8e2b9a93fe67bc1f9db19eb2215eb1c87e1dc18c3d0cf310cb02af44f9872
3
+ size 439703544
predictions.csv ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "101": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "102": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "103": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "104": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff