Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

config.json +24 -0
edge_case_results.csv +39 -0
metadata.json +18 -0
model.safetensors +3 -0
predictions.csv +0 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +58 -0
vocab.txt +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.57.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 31090
+}

edge_case_results.csv ADDED Viewed

	@@ -0,0 +1,39 @@

+id,doi,title,category,expected,predicted,confidence,correct_class_conf,passed,change
+fair_principles,,The FAIR Guiding Principles for scientific data management and stewardship,papers_about_data,False,False,0.955,0.955,True,unchanged
+uniprot_database,,UniProt: the universal protein knowledgebase in 2021,database_paper,True,True,0.9437,0.9437,True,unchanged
+bwa_tool,,Fast and accurate short read alignment with Burrows-Wheeler transform,methods_paper,False,False,0.9572,0.9572,True,unchanged
+metabolome_atlas,,A metabolome atlas of the aging mouse brain,atlas_paper,True,True,0.9358,0.9358,True,unchanged
+data_management_review,,A systematic review of data management practices in clinical research,review_paper,False,False,0.9505,0.9505,True,unchanged
+gtex_consortium,,The GTEx Consortium atlas of genetic regulatory effects across human tissues,consortium_data,True,True,0.9559,0.9559,True,unchanged
+alphafold_method,,Highly accurate protein structure prediction with AlphaFold,methods_paper,False,False,0.9487,0.9487,True,unchanged
+imagenet_dataset,,ImageNet: A large-scale hierarchical image database,benchmark_dataset,True,True,0.9364,0.9364,True,unchanged
+deseq2_method,,Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2,methods_paper,False,False,0.9561,0.9561,True,unchanged
+tcga_pancancer,,The Cancer Genome Atlas Pan-Cancer analysis project,consortium_data,True,True,0.9595,0.9595,True,unchanged
+data_sharing_perspective,,Data sharing in the age of deep learning: A perspective on challenges and opportunities,perspective_paper,False,False,0.9398,0.9398,True,improved
+zenodo_deposited,,A curated dataset of protein-ligand binding affinities,curated_dataset,True,True,0.9464,0.9464,True,unchanged
+nar_methods_spotlight,,SPOTlight: seeded NMF regression to deconvolute spatial transcriptomics spots,methods_paper,False,False,0.9559,0.9559,True,unchanged
+scientific_data_descriptor,,A comprehensive single-cell transcriptome atlas of human pancreatic islets,data_descriptor,True,True,0.9476,0.9476,True,unchanged
+benchmark_with_method,,A benchmark dataset for evaluating drug response prediction methods,benchmark_dataset,True,True,0.9251,0.9251,True,degraded
+database_update,,The 2024 update of the RCSB Protein Data Bank,database_update,True,True,0.951,0.951,True,unchanged
+survey_ml_datasets,,A survey of datasets for machine learning in computational biology,survey_paper,False,False,0.8789,0.8789,True,FIXED
+protocol_with_data,,Protocol for generating CRISPR knockout libraries with integrated sequencing data,protocol_paper,False,False,0.8418,0.8418,True,improved
+encode_encyclopedia,10.1038/nature11247,An integrated encyclopedia of DNA elements in the human genome,consortium_data,True,True,0.9559,0.9559,True,unchanged
+hgp_initial_draft,10.1038/35057062,Initial sequencing and analysis of the human genome,genome_resource,True,True,0.9597,0.9597,True,unchanged
+hgp_finished_sequence,10.1038/nature03001,Finishing the euchromatic sequence of the human genome,genome_resource,True,True,0.9551,0.9551,True,unchanged
+celera_human_genome,10.1126/science.1058040,The Sequence of the Human Genome,genome_resource,True,True,0.9484,0.9484,True,unchanged
+1000g_phase1,10.1038/nature09534,A map of human genome variation from population-scale sequencing,consortium_data,True,True,0.951,0.951,True,unchanged
+1000g_phase2,10.1038/nature11632,"An integrated map of genetic variation from 1,092 human genomes",consortium_data,True,True,0.9489,0.9489,True,unchanged
+1000g_phase3,10.1038/nature15393,A global reference for human genetic variation,consortium_data,True,True,0.9571,0.9571,True,unchanged
+uk_biobank,10.1371/journal.pmed.1001779,UK Biobank: An Open Access Resource for Identifying the Causes of a Wide Range of Complex Diseases of Middle and Old Age,cohort_resource,True,True,0.9471,0.9471,True,unchanged
+tcia_archive,10.1007/s10278-013-9622-7,The Cancer Imaging Archive (TCIA): Maintaining and Operating a Public Information Repository,imaging_resource,True,True,0.9544,0.9544,True,unchanged
+icgc_data_portal,10.1038/s41587-019-0055-9,The International Cancer Genome Consortium Data Portal,database_paper,True,True,0.9622,0.9622,True,unchanged
+nih_chest_xray,10.1109/CVPR.2017.369,ChestX-Ray8: Hospital-Scale Chest X-Ray Database and Benchmarks on Weakly-Supervised Classification and Localization of Common Thorax Diseases,benchmark_dataset,True,True,0.9238,0.9238,True,unchanged
+mimic_iii,10.1038/sdata.2016.35,"MIMIC-III, a freely accessible critical care database",clinical_database,True,True,0.9519,0.9519,True,unchanged
+mimic_iv,10.1038/s41597-022-01899-x,"MIMIC-IV, a freely accessible electronic health record dataset",clinical_database,True,True,0.9541,0.9541,True,unchanged
+facebase3,10.1242/dev.191213,FaceBase 3: analytical tools and FAIR resources for craniofacial and dental research,database_paper,True,True,0.95,0.95,True,unchanged
+topmed_sequencing,10.1038/s41586-021-03205-y,"Sequencing of 53,831 diverse genomes from the NHLBI TOPMed Program",consortium_data,True,True,0.9556,0.9556,True,unchanged
+fastmri_dataset,10.48550/arXiv.1811.08839,fastMRI: An Open Dataset and Benchmarks for Accelerated MRI,benchmark_dataset,True,True,0.9503,0.9503,True,unchanged
+brain_initiative_cell_census,10.1038/s41586-021-03950-0,A multimodal cell census and atlas of the mammalian primary motor cortex,consortium_data,True,True,0.9446,0.9446,True,unchanged
+brain_initiative_merfish,10.1038/s41586-021-03705-x,Spatially resolved cell atlas of the mouse primary motor cortex by MERFISH,consortium_data,True,True,0.8465,0.8465,True,improved
+framingham_original,10.2105/ajph.41.3.279,Epidemiological Approaches to Heart Disease: The Framingham Study,cohort_study_design,False,False,0.8019,0.8019,True,improved
+framingham_cohort_profile,10.1093/ije/dyv337,Cohort Profile: The Framingham Heart Study (FHS): overview of milestones in cardiovascular epidemiology,cohort_resource,True,True,0.9385,0.9385,True,unchanged

metadata.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "model": "scibert",
+  "base_model": "allenai/scibert_scivocab_uncased",
+  "continued_training": true,
+  "initial_epochs": 5,
+  "continue_epochs": 3,
+  "initial_lr": 2e-05,
+  "continue_lr": 5e-06,
+  "edge_weight": 5,
+  "label_smoothing": 0.1,
+  "sample_ratio": 0.3,
+  "fp16": true,
+  "batch_size": 24,
+  "edge_cases_before": 37,
+  "edge_cases_after": 38,
+  "edge_cases_total": 38,
+  "timestamp": "2026-02-10T00:20:39.488330"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3a8e2b9a93fe67bc1f9db19eb2215eb1c87e1dc18c3d0cf310cb02af44f9872
+size 439703544

predictions.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "104": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff