Nex-T1 committed on
Commit
a363a69
·
verified ·
1 Parent(s): d0e8f97

Forked from talaugust/bart-sci-definition

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -9
  2. README.md +57 -0
  3. config.json +73 -0
  4. pytorch_model.bin +3 -0
.gitattributes CHANGED
@@ -2,34 +2,26 @@
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
 
5
  *.ftz filter=lfs diff=lfs merge=lfs -text
6
  *.gz filter=lfs diff=lfs merge=lfs -text
7
  *.h5 filter=lfs diff=lfs merge=lfs -text
8
  *.joblib filter=lfs diff=lfs merge=lfs -text
9
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
 
10
  *.model filter=lfs diff=lfs merge=lfs -text
11
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
12
  *.onnx filter=lfs diff=lfs merge=lfs -text
13
  *.ot filter=lfs diff=lfs merge=lfs -text
14
  *.parquet filter=lfs diff=lfs merge=lfs -text
15
  *.pb filter=lfs diff=lfs merge=lfs -text
 
 
16
  *.pt filter=lfs diff=lfs merge=lfs -text
17
  *.pth filter=lfs diff=lfs merge=lfs -text
18
  *.rar filter=lfs diff=lfs merge=lfs -text
 
19
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
21
  *.tflite filter=lfs diff=lfs merge=lfs -text
22
  *.tgz filter=lfs diff=lfs merge=lfs -text
23
  *.wasm filter=lfs diff=lfs merge=lfs -text
24
  *.xz filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## BART Scientific Definition Generation
2
+ This is a finetuned BART Large model from the paper:
3
+
4
+ "Generating Scientific Definitions with Controllable Complexity"
5
+
6
+ By Tal August, Katharina Reinecke, and Noah A. Smith
7
+
8
+ Abstract: Unfamiliar terminology and complex language can present barriers to understanding science. Natural language processing stands to help address these issues by automatically defining unfamiliar terms. We introduce a new task and dataset for defining scientific terms and controlling the complexity of generated definitions as a way of adapting to a specific reader’s background knowledge. We test four definition generation methods for this new task, finding that a sequence-to-sequence approach is most successful. We then explore the version of the task in which definitions are generated at a target complexity level. We introduce a novel reranking approach and find in human evaluations that it offers superior fluency while also controlling complexity, compared to several controllable generation baselines.
9
+
10
+ ## Description
11
+
12
+ The model is finetuned on the task of generating definitions of scientific terms. We frame our task as generating an answer to the question “What is (are) X?” Along with the question, the model takes a support document of scientific abstracts related to the term being defined.
13
+
14
+
15
+ ## Intended use
16
+
17
+ The intended use of this model is to generate definitions of scientific terms. It is NOT intended for public deployment due to the risk of hallucinated information in model output. Strong supervision of definition factuality is important for any future deployment of such a system. While hallucinated information can be damaging in any generation context, incorrect scientific definitions could mislead readers and potentially contribute to broader scientific misinformation. The model is trained on data we believe is trustworthy (e.g., questions and answers from NIH websites); however, this is no guarantee that model output will be.
18
+
19
+ ## Training data
20
+
21
+ The model is trained on data from two sources: Wikipedia science glossaries and a portion of the [MedQuAD dataset](https://github.com/abachaa/MedQuAD), which contains healthcare consumer questions and answers from NIH websites. For more information on these data sources, see the [github repository](https://github.com/talaugust/definition-complexity) for the paper.
22
+
23
+ ## How to use
24
+ Note that this model was trained and evaluated using transformers version 4.2.2
25
+
26
+
27
+ from transformers import (
28
+ AutoTokenizer,
29
+ AutoModelForSeq2SeqLM,
30
+ AutoConfig,
31
+ )
32
+
33
+ bart_sci_def_tokenizer = AutoTokenizer.from_pretrained("talaugust/bart-sci-definition")
34
+ bart_sci_def_model = AutoModelForSeq2SeqLM.from_pretrained("talaugust/bart-sci-definition")
35
+
36
+ inputs = bart_sci_def_tokenizer("question: What is (are) surfactants? context: <P> .... <P> ...." , return_tensors='pt')
37
+
38
+ outputs = bart_sci_def_model.generate(**inputs,
39
+ decoder_start_token_id=bart_sci_def_tokenizer.bos_token_id,
40
+ num_return_sequences=1,
41
+ num_beams=5,
42
+ max_length=64,
43
+ min_length=8,
44
+ early_stopping=True,
45
+ temperature=None,
46
+ do_sample=True,
47
+ top_k=50,
48
+ top_p=0.9,
49
+ max_input_length=1024,
50
+ no_repeat_ngram_size=3,
51
+ device=None)
52
+ answers = [bart_sci_def_tokenizer.decode(ans_ids, skip_special_tokens=True).strip() for ans_ids in outputs]
53
+
54
+
55
+
56
+ ## Biases & Limitations
57
+ The goal of this model is to enable a wider audience of readers to understand and engage with scientific writing. A risk, though, is that such attempts might instead widen the gap to accessing scientific information. The texts in the datasets we train our models on are in General or Academic American English. Many people, especially those who have been historically underrepresented in STEM disciplines and medicine, may not be comfortable with this dialect of English. This risks further alienating the readers we hope to serve. An important and exciting direction in NLP is making models more flexible to dialects and low-resource languages.
config.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bart_medq_wiki_gen",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 12,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 16,
23
+ "encoder_ffn_dim": 4096,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 12,
26
+ "eos_token_id": 2,
27
+ "forced_eos_token_id": 2,
28
+ "gradient_checkpointing": false,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_position_embeddings": 1024,
42
+ "model_type": "bart",
43
+ "no_repeat_ngram_size": 3,
44
+ "normalize_before": false,
45
+ "num_beams": 4,
46
+ "num_hidden_layers": 12,
47
+ "pad_token_id": 1,
48
+ "scale_embedding": false,
49
+ "task_specific_params": {
50
+ "summarization": {
51
+ "length_penalty": 1.0,
52
+ "max_length": 128,
53
+ "min_length": 12,
54
+ "num_beams": 4
55
+ },
56
+ "summarization_cnn": {
57
+ "length_penalty": 2.0,
58
+ "max_length": 142,
59
+ "min_length": 56,
60
+ "num_beams": 4
61
+ },
62
+ "summarization_xsum": {
63
+ "length_penalty": 1.0,
64
+ "max_length": 62,
65
+ "min_length": 11,
66
+ "num_beams": 6
67
+ }
68
+ },
69
+ "torch_dtype": "float32",
70
+ "transformers_version": "4.18.0",
71
+ "use_cache": true,
72
+ "vocab_size": 50265
73
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dc7d89110191442a2e31f0e85d7611f364dd00b74dd6da30c3edec19506e220
3
+ size 1625543727