lfoppiano committed on
Commit
169ce83
·
verified ·
1 Parent(s): 5655972

Create grobid.yaml

Browse files
Files changed (1) hide show
  1. grobid.yaml +387 -0
grobid.yaml ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # this is the configuration file for the GROBID instance
2
+
3
+ grobid:
4
+ # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
5
+ grobidHome: "grobid-home"
6
+
7
+ # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
8
+ temp: "tmp"
9
+
10
+ # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
11
+ nativelibrary: "lib"
12
+
13
+ pdf:
14
+ pdfalto:
15
+ # path relative to the grobid-home path (e.g. grobid-home/pdfalto), you don't want to change this normally
16
+ path: "pdfalto"
17
+ # security for PDF parsing
18
+ memoryLimitMb: 6096
19
+ timeoutSec: 120
20
+
21
+ # security relative to the PDF parsing result
22
+ blocksMax: 200000
23
+ tokensMax: 1000000
24
+
25
+ consolidation:
26
+ # define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or
27
+ # "glutton" for https://github.com/kermitt2/biblio-glutton
28
+ service: "crossref"
29
+ #service: "glutton"
30
+ glutton:
31
+ #url: "https://cloud.science-miner.com/glutton"
32
+ url: "http://localhost:8080"
33
+ crossref:
34
+ mailto: luca@sciencialab.com
35
+ # to use the Crossref web API politely, you should normally indicate an email address here, e.g.
36
+ #mailto: "toto@titi.tutu"
37
+ token:
38
+ # to use Crossref metadata plus service (available by subscription)
39
+ #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
40
+
41
+ proxy:
42
+ # proxy to be used when doing external call to the consolidation service
43
+ host:
44
+ port:
45
+
46
+ # CORS configuration for the GROBID web API service
47
+ corsAllowedOrigins: "*"
48
+ corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
49
+ corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
50
+
51
+ # the actual implementation for language recognition to be used
52
+ languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
53
+
54
+ # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
55
+ #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
56
+ sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
57
+
58
+ # maximum concurrency allowed to GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
59
+ # for a production server running only GROBID, set the value slightly above the available number of threads of the server
60
+ # to get best performance and security
61
+ concurrency: 4
62
+ # when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time to wait while trying
63
+ # to get an engine (in seconds) - normally never change it
64
+ poolMaxWait: 1
65
+
66
+ delft:
67
+ # DeLFT global parameters
68
+ # delft installation path if Deep Learning architectures are used to implement one of the sequence labeling models,
69
+ # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF are used)
70
+ install: "../delft"
71
+ pythonVirtualEnv:
72
+
73
+ wapiti:
74
+ # Wapiti global parameters
75
+ # number of threads for training the wapiti models (0 to use all available processors)
76
+ nbThreads: 0
77
+
78
+ models:
79
+ # we configure here how each sequence labeling model should be implemented
80
+ # for feature-engineered CRF, use "wapiti" and possible training parameters are window, epsilon and nbMaxIterations
81
+ # for Deep Learning, use "delft" and select the target DL architecture (see DeLFT library), the training
82
+ # parameters then depend on the selected DL architecture
83
+
84
+ - name: "segmentation"
85
+ # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
86
+ engine: "wapiti"
87
+ #engine: "delft"
88
+ wapiti:
89
+ # wapiti training parameters, they will be used at training time only
90
+ epsilon: 0.0000001
91
+ window: 50
92
+ nbMaxIterations: 2000
93
+ delft:
94
+ # deep learning parameters
95
+ architecture: "BidLSTM_CRF_FEATURES"
96
+ useELMo: false
97
+ runtime:
98
+ # parameters used at runtime/prediction
99
+ max_sequence_length: 3000
100
+ batch_size: 1
101
+ training:
102
+ # parameters used for training
103
+ max_sequence_length: 3000
104
+ batch_size: 10
105
+
106
+ - name: "segmentation-article-light"
107
+ engine: "wapiti"
108
+ wapiti:
109
+ # wapiti training parameters, they will be used at training time only
110
+ epsilon: 0.0000001
111
+ window: 50
112
+ nbMaxIterations: 2000
113
+
114
+ - name: "segmentation-article-light-ref"
115
+ engine: "wapiti"
116
+ wapiti:
117
+ # wapiti training parameters, they will be used at training time only
118
+ epsilon: 0.0000001
119
+ window: 50
120
+ nbMaxIterations: 2000
121
+
122
+ - name: "segmentation-sdo-ietf"
123
+ engine: "wapiti"
124
+ wapiti:
125
+ # wapiti training parameters, they will be used at training time only
126
+ epsilon: 0.0000001
127
+ window: 50
128
+ nbMaxIterations: 2000
129
+
130
+ - name: "fulltext"
131
+ # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
132
+ engine: "wapiti"
133
+ wapiti:
134
+ # wapiti training parameters, they will be used at training time only
135
+ epsilon: 0.0001
136
+ window: 20
137
+ nbMaxIterations: 1500
138
+
139
+ - name: "header"
140
+ engine: "wapiti"
141
+ #engine: "delft"
142
+ wapiti:
143
+ # wapiti training parameters, they will be used at training time only
144
+ epsilon: 0.000001
145
+ window: 30
146
+ nbMaxIterations: 1500
147
+ delft:
148
+ # deep learning parameters
149
+ architecture: "BidLSTM_ChainCRF_FEATURES"
150
+ #transformer: "allenai/scibert_scivocab_cased"
151
+ useELMo: false
152
+ runtime:
153
+ # parameters used at runtime/prediction
154
+ #max_sequence_length: 510
155
+ max_sequence_length: 3000
156
+ batch_size: 1
157
+ training:
158
+ # parameters used for training
159
+ #max_sequence_length: 510
160
+ #batch_size: 6
161
+ max_sequence_length: 3000
162
+ batch_size: 9
163
+
164
+ - name: "header-article-light"
165
+ engine: "wapiti"
166
+ # engine: "delft"
167
+ wapiti:
168
+ # wapiti training parameters, they will be used at training time only
169
+ epsilon: 0.000001
170
+ window: 30
171
+ nbMaxIterations: 1500
172
+ delft:
173
+ architecture: "BidLSTM_ChainCRF_FEATURES"
174
+ useELMo: false
175
+
176
+ - name: "header-article-light-ref"
177
+ engine: "wapiti"
178
+ # engine: "delft"
179
+ wapiti:
180
+ # wapiti training parameters, they will be used at training time only
181
+ epsilon: 0.000001
182
+ window: 30
183
+ nbMaxIterations: 1500
184
+ delft:
185
+ architecture: "BidLSTM_ChainCRF_FEATURES"
186
+ useELMo: false
187
+
188
+ - name: "header-sdo-ietf"
189
+ engine: "wapiti"
190
+ wapiti:
191
+ # wapiti training parameters, they will be used at training time only
192
+ epsilon: 0.000001
193
+ window: 30
194
+ nbMaxIterations: 1500
195
+
196
+ - name: "reference-segmenter"
197
+ engine: "wapiti"
198
+ #engine: "delft"
199
+ wapiti:
200
+ # wapiti training parameters, they will be used at training time only
201
+ epsilon: 0.00001
202
+ window: 20
203
+ delft:
204
+ # deep learning parameters
205
+ architecture: "BidLSTM_ChainCRF_FEATURES"
206
+ useELMo: false
207
+ runtime:
208
+ # parameters used at runtime/prediction (for this model, use same max_sequence_length as training)
209
+ max_sequence_length: 3000
210
+ batch_size: 2
211
+ training:
212
+ # parameters used for training
213
+ max_sequence_length: 3000
214
+ batch_size: 10
215
+
216
+ - name: "name-header"
217
+ engine: "wapiti"
218
+ #engine: "delft"
219
+ delft:
220
+ # deep learning parameters
221
+ architecture: "BidLSTM_CRF_FEATURES"
222
+
223
+ - name: "name-citation"
224
+ engine: "wapiti"
225
+ #engine: "delft"
226
+ delft:
227
+ # deep learning parameters
228
+ architecture: "BidLSTM_CRF_FEATURES"
229
+
230
+ - name: "date"
231
+ engine: "wapiti"
232
+ #engine: "delft"
233
+ delft:
234
+ # deep learning parameters
235
+ architecture: "BidLSTM_CRF_FEATURES"
236
+
237
+ - name: "figure"
238
+ engine: "wapiti"
239
+ #engine: "delft"
240
+ wapiti:
241
+ # wapiti training parameters, they will be used at training time only
242
+ epsilon: 0.00001
243
+ window: 20
244
+ delft:
245
+ # deep learning parameters
246
+ architecture: "BidLSTM_CRF"
247
+
248
+ - name: "table"
249
+ engine: "wapiti"
250
+ #engine: "delft"
251
+ wapiti:
252
+ # wapiti training parameters, they will be used at training time only
253
+ epsilon: 0.00001
254
+ window: 20
255
+ delft:
256
+ # deep learning parameters
257
+ architecture: "BidLSTM_CRF"
258
+
259
+ - name: "affiliation-address"
260
+ engine: "wapiti"
261
+ #engine: "delft"
262
+ delft:
263
+ # deep learning parameters
264
+ architecture: "BidLSTM_CRF_FEATURES"
265
+
266
+ - name: "citation"
267
+ engine: "wapiti"
268
+ #engine: "delft"
269
+ wapiti:
270
+ # wapiti training parameters, they will be used at training time only
271
+ epsilon: 0.00001
272
+ window: 50
273
+ nbMaxIterations: 3000
274
+ delft:
275
+ # deep learning parameters
276
+ architecture: "BidLSTM_CRF_FEATURES"
277
+ #architecture: "BERT_CRF"
278
+ #transformer: "michiyasunaga/LinkBERT-base"
279
+ useELMo: false
280
+ runtime:
281
+ # parameters used at runtime/prediction
282
+ max_sequence_length: 500
283
+ batch_size: 30
284
+ training:
285
+ # parameters used for training
286
+ max_sequence_length: 500
287
+ batch_size: 50
288
+
289
+ - name: "patent-citation"
290
+ engine: "wapiti"
291
+ #engine: "delft"
292
+ wapiti:
293
+ # wapiti training parameters, they will be used at training time only
294
+ epsilon: 0.0001
295
+ window: 20
296
+ delft:
297
+ # deep learning parameters
298
+ architecture: "BidLSTM_CRF_FEATURES"
299
+ #architecture: "BERT_CRF"
300
+ runtime:
301
+ # parameters used at runtime/prediction
302
+ max_sequence_length: 800
303
+ batch_size: 20
304
+ training:
305
+ # parameters used for training
306
+ max_sequence_length: 1000
307
+ batch_size: 40
308
+
309
+ - name: "funding-acknowledgement"
310
+ engine: "wapiti"
311
+ #engine: "delft"
312
+ wapiti:
313
+ # wapiti training parameters, they will be used at training time only
314
+ epsilon: 0.00001
315
+ window: 50
316
+ nbMaxIterations: 2000
317
+ delft:
318
+ # deep learning parameters
319
+ architecture: "BidLSTM_CRF_FEATURES"
320
+ #architecture: "BERT_CRF"
321
+ #transformer: "michiyasunaga/LinkBERT-base"
322
+ useELMo: false
323
+ runtime:
324
+ # parameters used at runtime/prediction
325
+ max_sequence_length: 800
326
+ batch_size: 20
327
+ training:
328
+ # parameters used for training
329
+ max_sequence_length: 500
330
+ batch_size: 40
331
+
332
+ - name: "copyright"
333
+ # at this time, we only have a DeLFT implementation,
334
+ # use "wapiti" if the deep learning library JNI is not available; the model will then be ignored
335
+ #engine: "delft"
336
+ engine: "wapiti"
337
+ delft:
338
+ # deep learning parameters
339
+ architecture: "gru"
340
+ #architecture: "bert"
341
+ #transformer: "allenai/scibert_scivocab_cased"
342
+
343
+ - name: "license"
344
+ # at this time, for being active, it must be DeLFT, no other implementation is available
345
+ # use "wapiti" if the deep learning library JNI is not available; the model will then be ignored
346
+ #engine: "delft"
347
+ engine: "wapiti"
348
+ delft:
349
+ # deep learning parameters
350
+ architecture: "gru"
351
+ #architecture: "bert"
352
+ #transformer: "allenai/scibert_scivocab_cased"
353
+
354
+ # for **service only**: how to load the models,
355
+ # false -> models are loaded when needed, which avoids loading unused models into memory (only in case of CRF) but slows down
356
+ # significantly the service at first call
357
+ # true -> all the models are loaded into memory at server startup (default), which slows the start of the service
358
+ # and unused models will take some more memory (only in case of CRF), but the server is immediately warm and ready
359
+ modelPreload: true
360
+
361
+ server:
362
+ type: custom
363
+ applicationConnectors:
364
+ - type: http
365
+ port: 8070
366
+ adminConnectors:
367
+ - type: http
368
+ port: 8071
369
+ registerDefaultExceptionMappers: false
370
+ # change the following to have all HTTP requests logged
371
+ requestLog:
372
+ appenders: []
373
+
374
+ # these logging settings apply to the Grobid service usage mode
375
+ logging:
376
+ level: INFO
377
+ loggers:
378
+ org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
379
+ org.glassfish.jersey.internal: "OFF"
380
+ com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
381
+ appenders:
382
+ - type: console
383
+ threshold: INFO
384
+ timeZone: UTC
385
+ # uncomment to have the logs in json format
386
+ #layout:
387
+ # type: json