dyxohjl666 committed on
Commit
0aa44d0
·
unverified ·
1 Parent(s): 58c425b

Add Single Document Summarization and examples

Browse files
app.py CHANGED
@@ -1,20 +1,21 @@
1
  import gradio as gr
2
  from description import *
3
 
4
-
5
-
6
  from reference_string_parsing import *
 
7
 
8
  with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
9
  gr.Markdown("# Gradio Demo for SciAssist")
10
  with gr.Tabs():
 
11
  with gr.TabItem("Reference String Parsing"):
12
  with gr.Box():
13
  gr.Markdown(rsp_str_md)
14
  with gr.Row():
15
  with gr.Column():
16
  rsp_str = gr.Textbox(label="Input String")
17
- rsp_str_dehyphen = gr.Checkbox(label="dehyphen")
 
18
  with gr.Row():
19
  rsp_str_btn = gr.Button("Parse")
20
  rsp_str_output = gr.HighlightedText(
@@ -27,13 +28,13 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
27
  "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
28
  True],
29
  [
30
- "Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).",
31
  False]], inputs=[rsp_str, rsp_str_dehyphen])
32
  with gr.Box():
33
  gr.Markdown(rsp_file_md)
34
  with gr.Row():
35
  with gr.Column():
36
- rsp_file = gr.File()
37
  rsp_file_dehyphen = gr.Checkbox(label="dehyphen")
38
  with gr.Row():
39
  rsp_file_btn = gr.Button("Parse")
@@ -44,19 +45,69 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
44
  combine_adjacent=True,
45
  adjacent_separator=" ",
46
  )
 
 
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  with gr.TabItem("Source Code"):
49
  gr.Markdown(value=gradio_code)
50
 
51
- rsp_file_btn.click(
52
- fn=rsp_for_file,
53
- inputs=[rsp_file, rsp_file_dehyphen],
54
- outputs=rsp_file_output
55
  )
56
- rsp_str_btn.click(
57
- fn=rsp_for_str,
58
- inputs=[rsp_str, rsp_str_dehyphen],
59
- outputs=rsp_str_output
60
  )
61
 
62
- demo.launch()
 
 
 
 
1
  import gradio as gr
2
  from description import *
3
 
 
 
4
  from reference_string_parsing import *
5
+ from single_summarization import *
6
 
7
  with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
8
  gr.Markdown("# Gradio Demo for SciAssist")
9
  with gr.Tabs():
10
+ # Reference String Parsing
11
  with gr.TabItem("Reference String Parsing"):
12
  with gr.Box():
13
  gr.Markdown(rsp_str_md)
14
  with gr.Row():
15
  with gr.Column():
16
  rsp_str = gr.Textbox(label="Input String")
17
+ with gr.Column():
18
+ rsp_str_dehyphen = gr.Checkbox(label="dehyphen")
19
  with gr.Row():
20
  rsp_str_btn = gr.Button("Parse")
21
  rsp_str_output = gr.HighlightedText(
 
28
  "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
29
  True],
30
  [
31
+ "Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval-2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).",
32
  False]], inputs=[rsp_str, rsp_str_dehyphen])
33
  with gr.Box():
34
  gr.Markdown(rsp_file_md)
35
  with gr.Row():
36
  with gr.Column():
37
+ rsp_file = gr.File(label="Input File")
38
  rsp_file_dehyphen = gr.Checkbox(label="dehyphen")
39
  with gr.Row():
40
  rsp_file_btn = gr.Button("Parse")
 
45
  combine_adjacent=True,
46
  adjacent_separator=" ",
47
  )
48
+ rsp_file_examples = gr.Examples(examples=[["examples/N18-3011_ref.txt", False],], inputs=[rsp_file, rsp_file_dehyphen])
49
+
50
 
51
+ rsp_file_btn.click(
52
+ fn=rsp_for_file,
53
+ inputs=[rsp_file, rsp_file_dehyphen],
54
+ outputs=rsp_file_output
55
+ )
56
+ rsp_str_btn.click(
57
+ fn=rsp_for_str,
58
+ inputs=[rsp_str, rsp_str_dehyphen],
59
+ outputs=rsp_str_output
60
+ )
61
+
62
+ # Single Document Summarization
63
+ with gr.TabItem("Single Document Summarization"):
64
+ with gr.Box():
65
+ gr.Markdown(ssum_str_md)
66
+ with gr.Row():
67
+ with gr.Column():
68
+ ssum_str = gr.Textbox(label="Input String")
69
+ with gr.Column():
70
+ ssum_str_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
71
+ ssum_str_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
72
+ with gr.Row():
73
+ ssum_str_btn = gr.Button("Generate")
74
+ ssum_str_output = gr.Textbox(
75
+ elem_id="htext",
76
+ label="Summary",
77
+ )
78
+ ssum_str_examples = gr.Examples(examples=[[ssum_str_example, 1, 1], ],
79
+ inputs=[ssum_str, ssum_str_beams, ssum_str_sequences])
80
+ with gr.Box():
81
+ gr.Markdown(ssum_file_md)
82
+ with gr.Row():
83
+ with gr.Column():
84
+ ssum_file = gr.File(label="Input File")
85
+ with gr.Column():
86
+ ssum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
87
+ ssum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
88
+ with gr.Row():
89
+ ssum_file_btn = gr.Button("Generate")
90
+ ssum_file_output = gr.Textbox(
91
+ elem_id="htext",
92
+ label="Summary",
93
+ )
94
+ ssum_file_examples = gr.Examples(examples=[["examples/N18-3011_body.txt", 10, 2],],
95
+ inputs=[ssum_file, ssum_file_beams, ssum_file_sequences])
96
  with gr.TabItem("Source Code"):
97
  gr.Markdown(value=gradio_code)
98
 
99
+ ssum_file_btn.click(
100
+ fn=ssum_for_file,
101
+ inputs=[ssum_file, ssum_file_beams, ssum_file_sequences],
102
+ outputs=ssum_file_output
103
  )
104
+ ssum_str_btn.click(
105
+ fn=ssum_for_str,
106
+ inputs=[ssum_str, ssum_str_beams, ssum_str_sequences],
107
+ outputs=ssum_str_output
108
  )
109
 
110
+
111
+
112
+
113
+ demo.launch(share=True)
description.py CHANGED
@@ -43,16 +43,33 @@ def rsp_for_file(input, dehyphen=False) -> List[Tuple[str, str]]:
43
  ```
44
  '''
45
 
 
46
  rsp_str_md = '''
47
- To **test on strings**, simply input one or more strings.
48
  '''
49
 
50
  rsp_file_md = '''
51
- To **test on a file**, the input can be either:
52
 
53
  - A txt file which contains a reference string in each line.
54
 
55
 
56
  '''
 
57
 
58
- # - A pdf file which contains a whole scientific document without any processing (including title, author...).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  ```
44
  '''
45
 
46
+ # Reference string parsing Markdown
47
  rsp_str_md = '''
48
+ To **test on strings**, simply input one or more strings.
49
  '''
50
 
51
  rsp_file_md = '''
52
+ To **test on a file**, the input can be:
53
 
54
  - A txt file which contains a reference string in each line.
55
 
56
 
57
  '''
58
+ # - A pdf file which contains a whole scientific document without any processing (including title, author...).
59
 
60
+ ssum_str_md = '''
61
+ To **test on strings**, simply input a string.
62
+
63
+ **Note**: The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
64
+
65
+ '''
66
+
67
+ ssum_file_md = '''
68
+ To **test on a file**, the input can be:
69
+
70
+ - A txt file which contains the content to be summarized.
71
+
72
+ **Note**: The **number of beams** should be **divisible** by the **number of generated summaries** for group beam search.
73
+
74
+ '''
75
+ # - A pdf file which contains a whole scientific document without any processing (including title, author...).
examples/N18-3011_body.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ The goal of this work is to facilitate algorithmic discovery in the scientific literature . Despite notable advances in scientific search engines , data mining and digital libraries ( e.g. , Wu et al. , 2014 ) , researchers remain unable to answer simple questions such as : What is the percentage of female subjects in depression clinical trials ? Which of my co-authors published one or more papers on coreference resolution ? Which papers discuss the effects of Ranibizumab on the Retina ? In this paper , we focus on the problem of extracting structured data from scientific documents , which can later be used in natural language interfaces ( e.g. , Iyer et al. , 2017 ) or to improve ranking of results in academic search ( e.g. , Xiong et al. , 2017 ) . Figure 1 : Part of the literature graph . We describe methods used in a scalable deployed production system for extracting structured information from scientific documents into the literature graph ( see Fig . 1 ) . The literature graph is a directed property graph which summarizes key information in the literature and can be used to answer the queries mentioned earlier as well as more complex queries . For example , in order to compute the Erdős number of an author X , the graph can be queried to find the number of nodes on the shortest undirected path between author X and Paul Erdős such that all edges on the path are labeled `` authored '' . We reduce literature graph construction into familiar NLP tasks such as sequence labeling , entity linking and relation extraction , and address some of the impractical assumptions commonly made in the standard formulations of these tasks . For example , most research on named entity recognition tasks report results on large labeled datasets such as CoNLL-2003 and ACE-2005 ( e.g. , Lample et al. , 2016 ) , and assume that entity types in the test set match those labeled in the training set ( including work on domain adaptation , e.g. , Daumé , 2007 ) . 
These assumptions , while useful for developing and benchmarking new methods , are unrealistic for many domains and applications . The paper also serves as an overview of the approach we adopt at www.semanticscholar.org in a step towards more intelligent academic search engines ( Etzioni , 2011 ) . In the next section , we start by describing our symbolic representation of the literature . Then , we discuss how we extract metadata associated with a paper such as authors and references , then how we extract the entities mentioned in paper text . Before we conclude , we briefly describe other research challenges we are actively working on in order to improve the quality of the literature graph . The literature graph is a property graph with directed edges . Unlike Resource Description Framework ( RDF ) graphs , nodes and edges in property graphs have an internal structure which is more suitable for representing complex data types such as papers and entities . In this section , we describe the attributes associated with nodes and edges of different types in the literature graph . Papers . We obtain metadata and PDF files of papers via partnerships with publishers ( e.g. , Springer , Nature ) , catalogs ( e.g. , DBLP , MED-LINE ) , pre-publishing services ( e.g. , arXiv , bioRxive ) , as well as web-crawling . Paper nodes are associated with a set of attributes such as 'title ' , 'abstract ' , 'full text ' , 'venues ' and 'publication year ' . While some of the paper sources provide these attributes as metadata , it is often necessary to extract them from the paper PDF ( details in §3 ) . We deterministically remove duplicate papers based on string similarity of their metadata , resulting in 37M unique paper nodes . Papers in the literature graph cover a variety of scientific disciplines , including computer science , molecular biology , microbiology and neuroscience . Authors . 
Each node of this type represents a unique author , with attributes such as 'first name ' and 'last name ' . The literature graph has 12M nodes of this type . Entities . Each node of this type represents a unique scientific concept discussed in the literature , with attributes such as 'canonical name ' , 'aliases ' and 'description ' . Our literature graph has 0.4M nodes of this type . We describe how we populate entity nodes in §4.3 . Entity mentions . Each node of this type represents a textual reference of an entity in one of the papers , with attributes such as 'mention text ' , 'context ' , and 'confidence ' . We describe how we populate the 237M mentions in the literature graph in §4.1 . Citations . We instantiate a directed citation edge from paper nodes $p_1 \rightarrow p_2$ for each $p_2$ referenced in $p_1$ . Citation edges have attributes such as 'from paper id ' , 'to paper id ' and 'contexts ' ( the textual contexts where $p_2$ is referenced in $p_1$ ) . While some of the paper sources provide these attributes as metadata , it is often necessary to extract them from the paper PDF as detailed in §3 . Authorship . We instantiate a directed authorship edge between an author node and a paper node $a \rightarrow p$ for each author of that paper . Entity linking edges . We instantiate a directed edge from an extracted entity mention node to the entity it refers to . Mention-mention relations . We instantiate a directed edge between a pair of mentions in the same sentential context if the textual relation extraction model predicts one of a predefined list of relation types between them in a sentential context . 1 We encode a symmetric relation between $m_1$ and $m_2$ as two directed edges $m_1 \rightarrow m_2$ and $m_2 \rightarrow m_1$ . Entity-entity relations . While mention-mention edges represent relations between mentions in a particular context , entity-entity edges represent relations between abstract entities . 
These relations may be imported from an existing knowledge base ( KB ) or inferred from other edges in the graph . In the previous section , we described the overall structure of the literature graph . Next , we discuss how we populate paper nodes , author nodes , authorship edges , and citation edges . Although some publishers provide sufficient metadata about their papers , many papers are provided with incomplete metadata . Also , papers obtained via web-crawling are not associated with any metadata . To fill in this gap , we built the ScienceParse system to predict structured data from the raw PDFs using recurrent neural networks ( RNNs ) . 2 For each paper , the system extracts the paper title , list of authors , and list of references ; each reference consists of a title , a list of authors , a venue , and a year . Preparing the input layer . We split each PDF into individual pages , and feed each page to Apache 's PDFBox library 3 to convert it into a sequence of tokens , where each token has features , e.g. , 'text ' , 'font size ' , 'space width ' , 'position on the page ' . We normalize the token-level features before feeding them as inputs to the model . For each of the 'font size ' and 'space width ' features , we compute three normalized values ( with respect to current page , current document , and the whole training corpus ) , each value ranging between -0.5 to +0.5 . The token 's 'position on the page ' is given in XY coordinate points . We scale the values linearly to range from $(-0.5, -0.5)$ at the top-left corner of the page to $(0.5, 0.5)$ at the bottom-right corner . In order to capture case information , we add seven numeric features to the input representation of each token : whether the first/second letter is uppercase/lowercase , the fraction of uppercase/lowercase letters and the fraction of digits . To help the model make correct predictions for metadata which tend to appear at the beginning ( e.g. 
, titles and authors ) or at the end of papers ( e.g. , references ) , we provide the current page number as two discrete variables ( relative to the beginning and end of the PDF file ) with values 0 , 1 and 2+ . These features are repeated for each token on the same page . For the k-th token in the sequence , we compute the input representation $i_k$ by concatenating the numeric features , an embedding of the 'font size ' , and the word embedding of the lowercased token . Word embeddings are initialized with GloVe ( Pennington et al. , 2014 ) . Model . The input token representations are passed through one fully-connected layer and then fed into a two-layer bidirectional LSTM , i.e. , $\overrightarrow{g}_k = \mathrm{LSTM}(W i_k, \overrightarrow{g}_{k-1})$ , $g_k = [\overrightarrow{g}_k ; \overleftarrow{g}_k]$ , $\overrightarrow{h}_k = \mathrm{LSTM}(g_k, \overrightarrow{h}_{k-1})$ , $h_k = [\overrightarrow{h}_k ; \overleftarrow{h}_k]$ , where $W$ is a weight matrix , $\overleftarrow{g}_k$ and $\overleftarrow{h}_k$ are defined similarly to $\overrightarrow{g}_k$ and $\overrightarrow{h}_k$ but process token sequences in the opposite direction . Following Collobert et al. ( 2011 ) , we feed the output of the second layer $h_k$ into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights ( often described as a conditional random field layer when used in neural architectures ) to account for dependencies between labels . Training . The ScienceParse system is trained on a snapshot of the data at PubMed Central . It consists of 1.4M PDFs and their associated metadata , which specify the correct titles , authors , and bibliographies . We use a heuristic labeling process that finds the strings from the metadata in the tokenized PDFs to produce labeled tokens . This labeling process succeeds for 76 % of the documents . The remaining documents are not used in the training process . During training , we only use pages which have at least one token with a label that is not `` none '' . Decoding . At test time , we use Viterbi decoding to find the most likely global sequence , with no further constraints . To get the title , we use the longest continuous sequence of tokens with the `` title '' label . 
Since there can be multiple authors , we use all continuous sequences of tokens with the `` author '' label as authors , but require that all authors of a paper are mentioned on the same page . If the author labels are predicted in multiple pages , we use the one with the largest number of authors . Results . We run our final tests on a held-out set from PubMed Central , consisting of about 54K documents . The results are detailed in Table 1 . We use a conservative evaluation where an instance is correct if it exactly matches the gold annotation , with no credit for partial matching . To give an example for the type of errors our model makes , consider the paper ( Wang et al. , 2013 ) titled `` Clinical review : Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and metaanalysis . '' The title we extract for this paper omits the first part `` Clinical review : '' . This is likely to be a result of the pattern `` Foo : Bar Baz '' appearing in many training examples with only `` Bar Baz '' labeled as the title . In the previous section , we described how we populate the backbone of the literature graph , i.e. , paper nodes , author nodes and citation edges . Next , we discuss how we populate mentions and entities in the literature graph using entity extraction and linking on the paper text . In order to focus on more salient entities in a given paper , we only use the title and abstract . We experiment with three approaches for entity extraction and linking : I . Statistical : uses one or more statistical models for predicting mention spans , then uses another statistical model to link mentions to candidate entities in a KB . II . Hybrid : defines a small number of handengineered , deterministic rules for string-based matching of the input text to candidate entities in the KB , then uses a statistical model to disambiguate the mentions . 4 III . 
Off-the-shelf : uses existing libraries , namely ( Ferragina and Scaiella , 2010 , TagMe ) 5 and ( Demner-Fushman et al. , 2017 , MetaMap Lite ) 6 , with minimal post-processing to extract and link entities to the KB . Table 2 : Document-level evaluation of three approaches in two scientific areas : computer science ( CS ) and biomedical ( Bio ) . We evaluate the performance of each approach in two broad scientific areas : computer science ( CS ) and biomedical research ( Bio ) . For each unique ( paper ID , entity ID ) pair predicted by one of the approaches , we ask human annotators to label each mention extracted for this entity in the paper . We use CrowdFlower to manage human annotations and only include instances where three or more annotators agree on the label . If one or more of the entity mentions in that paper is judged to be correct , the pair ( paper ID , entity ID ) counts as one correct instance . Otherwise , it counts as an incorrect instance . We report 'yield ' in lieu of 'recall ' due to the difficulty of doing a scalable comprehensive annotation . Table 2 shows the results based on 500 papers using v1.1.2 of our entity extraction and linking components . In both domains , the statistical approach gives the highest precision and the lowest yield . The hybrid approach consistently gives the highest yield , but sacrifices precision . The TagMe off-the-shelf library used for the CS domain gives surprisingly good results , with precision within 1 point from the statistical models . However , the MetaMap Lite off-the-shelf library we used for the biomedical domain suffered a huge loss in precision . Our error analysis showed that each of the approaches is able to predict entities not predicted by the other approaches so we decided to pool their outputs in our deployed system , which gives significantly higher yield than any individual approach while maintaining reasonably high precision . 
Given the token sequence $t_1, \ldots, t_N$ in a sentence , we need to identify spans which correspond to entity mentions . We use the BILOU scheme to encode labels at the token level . Unlike most formulations of named entity recognition problems ( NER ) , we do not identify the entity type ( e.g. , protein , drug , chemical , disease ) for each mention since the output mentions are further grounded in a KB with further information about the entity ( including its type ) , using an entity linking module . Model . First , we construct the token embedding $x_k = [c_k ; w_k]$ for each token $t_k$ in the input sequence , where $c_k$ is a character-based representation computed using a convolutional neural network ( CNN ) with filter of size 3 characters , and $w_k$ are learned word embeddings initialized with the GloVe embeddings ( Pennington et al. , 2014 ) . We also compute context-sensitive word embeddings , denoted as $lm_k = [\overrightarrow{lm}_k ; \overleftarrow{lm}_k]$ , by concatenating the projected outputs of forward and backward recurrent neural network language models ( RNN-LM ) at position $k$ . The language model ( LM ) for each direction is trained independently and consists of a single layer long short-term memory ( LSTM ) network followed by a linear project layer . While training the LM parameters , $\overrightarrow{lm}_k$ is used to predict $t_{k+1}$ and $\overleftarrow{lm}_k$ is used to predict $t_{k-1}$ . We fix the LM parameters during training of the entity extraction model . See and for more details . Given the $x_k$ and $lm_k$ embeddings for each token $k \in \{1, \ldots, N\}$ , we use a two-layer bidirectional LSTM to encode the sequence with $x_k$ and $lm_k$ feeding into the first and second layer , respectively . That is , $\overrightarrow{g}_k = \mathrm{LSTM}(x_k, \overrightarrow{g}_{k-1})$ , $g_k = [\overrightarrow{g}_k ; \overleftarrow{g}_k]$ , $\overrightarrow{h}_k = \mathrm{LSTM}([g_k ; lm_k], \overrightarrow{h}_{k-1})$ , $h_k = [\overrightarrow{h}_k ; \overleftarrow{h}_k]$ , where $\overleftarrow{g}_k$ and $\overleftarrow{h}_k$ are defined similarly to $\overrightarrow{g}_k$ and $\overrightarrow{h}_k$ but process token sequences in the opposite direction . 
Similar to the model described in §3 , we feed the output of the second LSTM into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights to account for dependencies between labels . Results . We use the standard data splits of the SemEval-2017 Task 10 on entity ( and relation ) extraction from scientific papers ( Augenstein et al. , 2017 ) . Table 3 compares three variants of our entity extraction model . The first line omits the LM embeddings lm k , while the second line is the full model ( including LM embeddings ) showing a large improvement of 4.2 F1 points . The third line shows that creating an ensemble of 15 models further improves the results by 1.1 F1 points . Model instances . In the deployed system , we use three instances of the entity extraction model Description F1 Without LM 49.9 With LM 54.1 Avg . of 15 models with LM 55.2 Table 3 : Results of the entity extraction model on the development set of SemEval-2017 task 10. with a similar architecture , but trained on different datasets . Two instances are trained on the BC5CDR ( Li et al. , 2016 ) and the CHEMDNER datasets ( Krallinger et al. , 2015 ) to extract key entity mentions in the biomedical domain such as diseases , drugs and chemical compounds . The third instance is trained on mention labels induced from Wikipedia articles in the computer science domain . The output of all model instances are pooled together and combined with the rule-based entity extraction module , then fed into the entity linking model ( described below ) . In this section , we describe the construction of entity nodes and entity-entity edges . Unlike other knowledge extraction systems such as the Never-Ending Language Learner ( NELL ) 7 and OpenIE 4 , 8 we use existing knowledge bases ( KBs ) of entities to reduce the burden of identifying coherent concepts . Grounding the entity mentions in a manually-curated KB also increases user confidence in automated predictions . 
We use two KBs : UMLS : The UMLS metathesaurus integrates information about concepts in specialized ontologies in several biomedical domains , and is funded by the U.S. National Library of Medicine . DBpedia : DBpedia provides access to structured information in Wikipedia . Rather than including all Wikipedia pages , we used a short list of Wikipedia categories about CS and included all pages up to depth four in their trees in order to exclude irrelevant entities , e.g. , `` Lord of the Rings '' in DBpedia . Given a text span s identified by the entity extraction model in §4.2 ( or with heuristics ) and a reference KB , the goal of the entity linking model is to associate the span with the entity it refers to . A span and its surrounding words are collectively referred to as a mention . We first identify a set of candidate entities that a given mention may refer to . Then , we rank the candidate entities based on a score computed using a neural model trained on labeled data . For example , given the string `` . . . database of facts , an ILP system will . . . `` , the entity extraction model identifies the span `` ILP '' as a possible entity and the entity linking model associates it with `` Inductive_Logic_Programming '' as the referent entity ( from among other candidates like `` Integer_Linear_Programming '' or `` Instruction-level_Parallelism '' ) . Datasets . We used two datasets : i ) a biomedical dataset formed by combining MSH ( Jimeno-Yepes et al. , 2011 ) and BC5CDR ( Li et al. , 2016 ) with UMLS as the reference KB , and ii ) a CS dataset we curated using Wikipedia articles about CS concepts with DBpedia as the reference KB . Candidate selection . In a preprocessing step , we build an index which maps any token used in a labeled mention or an entity name in the KB to associated entity IDs , along with the frequency this token is associated with that entity . This is similar to the index used in previous entity linking systems ( e.g. , Bhagavatula et al. 
, 2015 ) to estimate the probability that a given mention refers to an entity . At train and test time , we use this index to find candidate entities for a given mention by looking up the tokens in the mention . This method also serves as our baseline in Table 4 by selecting the entity with the highest frequency for a given mention . Scoring candidates . Given a mention ( m ) and a candidate entity ( e ) , the neural model constructs a vector encoding of the mention and the entity . We encode the mention and entity using the functions f and g , respectively , as follows : $f(m) = [v_{m.name} ; \mathrm{avg}(v_{m.lc}, v_{m.rc})]$ , $g(e) = [v_{e.name} ; v_{e.def}]$ , where m.surface , m.lc and m.rc are the mention 's surface form , left and right contexts , and e.name and e.def are the candidate entity 's name and definition , respectively . $v_{text}$ is a bag-of-words sum encoder for text . We use the same encoder for the mention surface form and the candidate name , and another encoder for the mention contexts and entity definition . Additionally , we include numerical features to estimate the confidence of a candidate entity based on the statistics collected in the index described earlier . Table 4 : The Bag of Concepts F1 score of the baseline and neural model on the two curated datasets . We compute two scores based on the word overlap of ( i ) mention 's context and candidate 's definition and ( ii ) mention 's surface span and the candidate entity 's name . Finally , we feed the concatenation of the cosine similarity between $f(m)$ and $g(e)$ and the intersection-based scores into an affine transformation followed by a sigmoid nonlinearity to compute the final score for the pair ( m , e ) . Results . We use the Bag of Concepts F1 metric ( Ling et al. , 2015 ) for comparison . Table 4 compares the performance of the most-frequent-entity baseline and our neural model described above . In the previous sections , we discussed how we construct the main components of the literature graph . 
In this section , we briefly describe several other related challenges we are actively working on . Author disambiguation . Despite initiatives to have global author IDs ORCID and ResearcherID , most publishers provide author information as names ( e.g. , arXiv ) . However , author names can not be used as a unique identifier since several people often share the same name . Moreover , different venues and sources use different conventions in reporting the author names , e.g. , `` first initial , last name '' vs. `` last name , first name '' . Inspired by Culotta et al . ( 2007 ) , we train a supervised binary classifier for merging pairs of author instances and use it to incrementally create author clusters . We only consider merging two author instances if they have the same last name and share the first initial . If the first name is spelled out ( rather than abbreviated ) in both author instances , we also require that the first name matches . Ontology matching . Popular concepts are often represented in multiple KBs . For example , the concept of `` artificial neural networks '' is represented as entity ID D016571 in the MESH ontology , and represented as page ID '21523 ' in DBpedia . Ontology matching is the problem of identifying semantically-equivalent entities across KBs or ontologies . 9 Limited KB coverage . The convenience of grounding entities in a hand-curated KB comes at the cost of limited coverage . Introduction of new concepts and relations in the scientific literature occurs at a faster pace than KB curation , resulting in a large gap in KB coverage of scientific concepts . In order to close this gap , we need to develop models which can predict textual relations as well as detailed concept descriptions in scientific papers . For the same reasons , we also need to augment the relations imported from the KB with relations extracted from text . Our approach to address both entity and relation coverage is based on distant supervision ( Mintz et al. 
, 2009 ) . In short , we train two models for identifying entity definitions and relations expressed in natural language in scientific documents , and automatically generate labeled data for training these models using known definitions and relations in the KB . We note that the literature graph currently lacks coverage for important entity types ( e.g. , affiliations ) and domains ( e.g. , physics ) . Covering affiliations requires small modifications to the metadata extraction model followed by an algorithm for matching author names with their affiliations . In order to cover additional scientific domains , more agreements need to be signed with publishers . Figure and table extraction . Non-textual components such as charts , diagrams and tables provide key information in many scientific documents , but the lack of large labeled datasets has impeded the development of data-driven methods for scientific figure extraction . In Siegel et al . ( 2018 ) , we induced high-quality training labels for the task of figure extraction in a large number of scientific documents , with no human intervention . To accomplish this we leveraged the auxiliary data provided in two large web collections of scientific documents ( arXiv and PubMed ) to locate figures and their associated captions in the rasterized PDF . We use the resulting dataset to train a deep neural network for end-to-end figure detection , yielding a model that can be more easily extended to new domains compared to previous work . Understanding and predicting citations . The citation edges in the literature graph provide a wealth of information ( e.g. , at what rate a paper is being cited and whether it is accelerating ) , and opens the door for further research to better understand and predict citations . 
For example , in order to allow users to better understand what impact a paper had and effectively navigate its citations , we experimented with methods for classifying a citation as important or incidental , as well as more finegrained classes ( Valenzuela et al. , 2015 ) . The citation information also enables us to develop models for estimating the potential of a paper or an author . In Weihs and Etzioni ( 2017 ) , we predict citationbased metrics such as an author 's h-index and the citation rate of a paper in the future . Also related is the problem of predicting which papers should be cited in a given draft ( Bhagavatula et al. , 2018 ) , which can help improve the quality of a paper draft before it is submitted for peer review , or used to supplement the list of references after a paper is published . In this paper , we discuss the construction of a graph , providing a symbolic representation of the scientific literature . We describe deployed models for identifying authors , references and entities in the paper text , and provide experimental results to evaluate the performance of each model . Three research directions follow from this work and other similar projects , e.g. , Hahn-Powell et al . ( 2017 ) ; Wu et al . ( 2014 ) : i ) improving quality and enriching content of the literature graph ( e.g. , ontology matching and knowledge base population ) . ii ) aggregating domain-specific extractions across many papers to enable a better understanding of the literature as a whole ( e.g. , identifying demographic biases in clinical trial participants and summarizing empirical results on important tasks ) . iii ) exploring the literature via natural language interfaces . 
In order to help future research efforts , we make the following resources publicly available : metadata for over 20 million papers , 10 meaningful citations dataset , 11 models for figure and table extraction , 12 models for predicting citations in a paper draft 13 and models for extracting paper metadata , 14 among other resources . 15 Due to space constraints , we opted not to discuss our relation extraction models in this draft . The ScienceParse libraries can be found at http : // allenai.org/software/.3 https : //pdfbox.apache.org We also experimented with a `` pure '' rules-based approach which disambiguates deterministically but the hybrid approach consistently gave better results.5 The TagMe APIs are described at https : //sobigdata . d4science.org/web/tagme/tagme-help6 We use v3.4 ( L0 ) of MetaMap Lite , available at https : //metamap.nlm.nih.gov/MetaMapLite.shtml http : //rtw.ml.cmu.edu/rtw/ 8 https : //github.com/allenai/ openie-standalone Variants of this problem are also known as deduplication or record linkage .
examples/N18-3011_ref.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).
2
+ Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).
3
+ Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-based citation recommendation. In NAACL.
4
+ Chandra Bhagavatula, Thanapon Noraset, and Doug Downey. 2015. TabEL: entity linking in web tables. In ISWC.
5
+ Ronan Collobert, Jason Weston, Léon Bottou, Michael Karlen, Koray Kavukcuoglu, and Pavel P. Kuksa. 2011. Natural language processing (almost) from scratch. In JMLR.
6
+ Aron Culotta, Pallika Kanani, Robert Hall, Michael Wick, and Andrew D. McCallum. 2007. Author disambiguation using error-driven machine learning with a ranking loss function. In IIWeb Workshop.
7
+ Hal Daumé. 2007. Frustratingly easy domain adapta- tion. In ACL.
8
+ Dina Demner-Fushman, Willie J. Rogers, and Alan R. Aronson. 2017. MetaMap Lite: an evaluation of a new Java implementation of MetaMap. In JAMIA.
9
+ Oren Etzioni. 2011. Search needs a shake-up. Nature 476 7358:25-6.
10
+ Paolo Ferragina and Ugo Scaiella. 2010. TAGME: on-the-fly annotation of short text fragments (by wikipedia entities). In CIKM.
11
+ Gus Hahn-Powell, Marco Antonio Valenzuela- Escarcega, and Mihai Surdeanu. 2017. Swanson linking revisited: Accelerating literature-based dis- covery across domains using a conceptual influence graph. In ACL.
12
+ Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long short-term memory. Neural computation .
13
+ Srinivasan Iyer, Ioannis Konstas, Alvin Cheung, Jayant Krishnamurthy, and Luke S. Zettlemoyer. 2017. Learning a neural semantic parser from user feed- back. In ACL.
14
+ Antonio J. Jimeno-Yepes, Bridget T. McInnes, and Alan R. Aronson. 2011. Exploiting mesh indexing in medline to generate a data set for word sense dis- ambiguation. BMC bioinformatics 12(1):223.
15
+ Martin Krallinger, Florian Leitner, Obdulia Rabal, Miguel Vazquez, Julen Oyarzabal, and Alfonso Va- lencia. 2015. CHEMDNER: The drugs and chemi- cal names extraction challenge. In J. Cheminformat- ics.
16
+ Guillaume Lample, Miguel Ballesteros, Sandeep K Subramanian, Kazuya Kawakami, and Chris Dyer. 2016. Neural architectures for named entity recog- nition. In HLT-NAACL.
17
+ Jiao Li, Yueping Sun, Robin J. Johnson, Daniela Sci- aky, Chih-Hsuan Wei, Robert Leaman, Allan Peter Davis, Carolyn J. Mattingly, Thomas C. Wiegers, and Zhiyong Lu. 2016. Biocreative v cdr task cor- pus: a resource for chemical disease relation extrac- tion. Database : the journal of biological databases and curation 2016.
18
+ Xiao Ling, Sameer Singh, and Daniel S. Weld. 2015. Design challenges for entity linking. Transactions of the Association for Computational Linguistics 3:315-328.
19
+ Mike Mintz, Steven Bills, Rion Snow, and Daniel Ju- rafsky. 2009. Distant supervision for relation extrac- tion without labeled data. In ACL.
20
+ Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global vectors for word rep- resentation. In EMNLP.
21
+ Matthew E. Peters, Waleed Ammar, Chandra Bhagavat- ula, and Russell Power. 2017. Semi-supervised se- quence tagging with bidirectional language models. In ACL.
22
+ Noah Siegel, Nicholas Lourie, Russell Power, and Waleed Ammar. 2018. Extracting scientific figures with distantly supervised neural networks. In JCDL.
23
+ Marco Valenzuela, Vu Ha, and Oren Etzioni. 2015. Identifying meaningful citations. In AAAI Workshop (Scholarly Big Data).
24
+ Xiang Wang, Yan Dong, Xiang qian Qi, Yi-Ming Li, Cheng-Guang Huang, and Lijun Hou. 2013. Clin- ical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a system- atic review and meta-analysis. In Critical care.
25
+ Luca Weihs and Oren Etzioni. 2017. Learning to pre- dict citation-based impact measures. In JCDL.
26
+ Jian Wu, Kyle Williams, Hung-Hsuan Chen, Madian Khabsa, Cornelia Caragea, Alexander Ororbia, Dou- glas Jordan, and C. Lee Giles. 2014. CiteSeerX: AI in a digital library search engine. In AAAI.
27
+ Chenyan Xiong, Russell Power, and Jamie Callan. 2017. Explicit semantic ranking for academic search via knowledge graph embedding. In WWW.
reference_string_parsing.py CHANGED
@@ -23,8 +23,8 @@ def rsp_for_file(input, dehyphen=False) -> List[Tuple[str, str]]:
23
  # Identify the format of input and parse reference strings
24
  if filename[-4:] == ".txt":
25
  results = rsp_pipeline.predict(filename, type="txt", dehyphen=dehyphen)
26
- elif filename[-4:] == ".pdf":
27
- results = rsp_pipeline.predict(filename, dehyphen=dehyphen)
28
  else:
29
  return [("File Format Error !", None)]
30
  # Prepare for the input gradio.HighlightedText accepts.
 
23
  # Identify the format of input and parse reference strings
24
  if filename[-4:] == ".txt":
25
  results = rsp_pipeline.predict(filename, type="txt", dehyphen=dehyphen)
26
+ # elif filename[-4:] == ".pdf":
27
+ # results = rsp_pipeline.predict(filename, dehyphen=dehyphen)
28
  else:
29
  return [("File Format Error !", None)]
30
  # Prepare for the input gradio.HighlightedText accepts.
single_summarization.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ import torch
3
+ from SciAssist import SingleSummarization
4
+
5
+ device = "gpu" if torch.cuda.is_available() else "cpu"
6
+ ssum_pipeline = SingleSummarization()
7
+
8
+
9
def ssum_for_str(input, num_beams=1, num_return_sequences=1) -> str:
    """Summarize a piece of raw text with the module-level ``ssum_pipeline``.

    Args:
        input: The text to summarize.
        num_beams: Forwarded unchanged to ``ssum_pipeline.predict``.
        num_return_sequences: Forwarded unchanged to ``ssum_pipeline.predict``.

    Returns:
        A single string in which every entry of ``results["summary"]`` is
        followed by a blank line. An empty string is returned when ``input``
        is ``None`` or contains only whitespace.
    """
    # Nothing to summarize (e.g. the textbox was left empty): skip the model
    # call entirely instead of feeding it an empty prompt.
    if input is None or (isinstance(input, str) and not input.strip()):
        return ""

    results = ssum_pipeline.predict(
        input,
        type="str",
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    # Each candidate summary is terminated by "\n\n" so that several returned
    # sequences stay visually separated in the output.
    return "".join(f"{summary}\n\n" for summary in results["summary"])
16
+
17
+
18
def ssum_for_file(input, num_beams=1, num_return_sequences=1):
    """Summarize an uploaded text file with the module-level ``ssum_pipeline``.

    Args:
        input: An uploaded-file object exposing the file path as ``.name``,
            or ``None`` when nothing was uploaded.
        num_beams: Forwarded unchanged to ``ssum_pipeline.predict``.
        num_return_sequences: Forwarded unchanged to ``ssum_pipeline.predict``.

    Returns:
        * ``None`` when ``input`` is ``None``.
        * ``[("File Format Error !", None)]`` when the file name does not end
          in ``.txt`` (the check is case-insensitive).
        * Otherwise a single string in which every entry of
          ``results["summary"]`` is followed by a blank line.
    """
    if input is None:
        return None

    filename = input.name
    # Only plain-text files are summarized; ".pdf" handling is disabled in
    # this revision, so every other extension is rejected up front.
    if not filename.lower().endswith(".txt"):
        # NOTE(review): this error value has the list-of-tuples shape while the
        # success path returns a plain string -- confirm which shape the
        # consuming output component expects before changing it.
        return [("File Format Error !", None)]

    results = ssum_pipeline.predict(
        filename,
        type="txt",
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        save_results=False,
    )
    # Each candidate summary is terminated by "\n\n" so that several returned
    # sequences stay visually separated in the output.
    return "".join(f"{summary}\n\n" for summary in results["summary"])
35
+
36
+
37
+ ssum_str_example = """
38
+ 1 INTRODUCTION . Statistical learning theory studies the learning
39
+ properties of machine learning algorithms , and more fundamentally , the conditions
40
+ under which learning from finite data is possible . In this context , classical
41
+ learning theory focuses on the size of the hypothesis space in terms of different
42
+ complexity measures , such as combinatorial dimensions , covering numbers and
43
+ Rademacher/Gaussian complexities ( Shalev-Shwartz & Ben-David , 2014 ; Boucheron
44
+ et al. , 2005 ) . Another more recent approach is based on defining suitable notions
45
+ of stability with respect to perturbation of the data ( Bousquet & Elisseeff , 2001 ;
46
+ Kutin & Niyogi , 2002 ) . In this view , the continuity of the process that maps
47
+ data to estimators is crucial , rather than the complexity of the hypothesis space .
48
+ Different notions of stability can be considered , depending on the data perturbation
49
+ and metric considered ( Kutin & Niyogi , 2002 ) . Interestingly , the stability and
50
+ complexity approaches to characterizing the learnability of problems are not at odds
51
+ with each other , and can be shown to be equivalent as shown in Poggio et al .
52
+ ( 2004 ) and Shalev-Shwartz et al . ( 2010 ) . In modern machine learning
53
+ overparameterized models , with a larger number of parameters than the size of the
54
+ training data , have become common . The ability of these models to generalize is well
55
+ explained by classical statistical learning theory as long as some form of
56
+ regularization is used in the training process ( Bühlmann & Van De Geer , 2011 ;
57
+ Steinwart & Christmann , 2008 ) . However , it was recently shown - first for deep
58
+ networks ( Zhang et al. , 2017 ) , and more recently for kernel methods ( Belkin et
59
+ al. , 2019 ) - that learning is possible in the absence of regularization , i.e. ,
60
+ when perfectly fitting/interpolating the data . Much recent work in statistical
61
+ learning theory has tried to find theoretical ground for this empirical finding .
62
+ Since learning using models that interpolate is not exclusive to deep neural networks
63
+ , we study generalization in the presence of interpolation in the case of kernel
64
+ methods . We study both linear and kernel least squares problems in this paper . Our
65
+ Contributions : • We characterize the generalization properties of interpolating
66
+ solutions for linear and kernel least squares problems using a stability approach .
67
+ While the ( uniform ) stability properties of regularized kernel methods are well
68
+ known ( Bousquet & Elisseeff , 2001 ) , we study interpolating solutions of the
69
+ unregularized ( `` ridgeless '' ) regression problems . • We obtain an upper bound
70
+ on the stability of interpolating solutions , and show that this upper bound is
71
+ minimized by the minimum norm interpolating solution . This also means that among
72
+ all interpolating solutions , the minimum norm solution has the best test error .
73
+ In particular , the same conclusion is also true for gradient descent , since it
74
+ converges to the minimum norm solution in the setting we consider , see e.g . Rosasco
75
+ & Villa ( 2015 ) . • Our stability bounds show that the average stability of the
76
+ minimum norm solution is controlled by the condition number of the empirical kernel
77
+ matrix . It is well known that the numerical stability of the least squares solution
78
+ is governed by the condition number of the associated kernel matrix ( see the
79
+ discussion of why overparametrization is “ good ” in Poggio et al . ( 2019 ) ) . Our
80
+ results show that the condition number also controls stability ( and hence , test
81
+ error ) in a statistical sense . Organization : In section 2 , we introduce basic
82
+ ideas in statistical learning and empirical risk minimization , as well as the
83
+ notation used in the rest of the paper . In section 3 , we briefly recall some
84
+ definitions of stability . In section 4 , we study the stability of interpolating
85
+ solutions to kernel least squares and show that the minimum norm solutions minimize
86
+ an upper bound on the stability . In section 5 we discuss our results in the context
87
+ of recent work on high dimensional regression .
88
+ """