Daniel Ecer committed on
Commit
4e6720f
·
1 Parent(s): 2c8a4e0

Copy default config file and set preload to true

Browse files
Files changed (2) hide show
  1. Dockerfile +4 -0
  2. config.yml +133 -0
Dockerfile CHANGED
@@ -1 +1,5 @@
1
  FROM elifesciences/sciencebeam-parser:0.1.8
 
 
 
 
 
1
  FROM elifesciences/sciencebeam-parser:0.1.8
2
+
3
+ COPY \
4
+ ./config.yml \
5
+ /opt/sciencebeam_parser/sciencebeam_parser/resources/default_config/config.yml
config.yml ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ logging:
2
+ # Python logging config (passed to dictConfig)
3
+ version: 1
4
+ formatters:
5
+ default:
6
+ format: '[%(asctime)s] %(levelname)s in %(name)s:%(lineno)s: %(message)s'
7
+ handlers:
8
+ wsgi:
9
+ level: INFO
10
+ formatter: default
11
+ class: logging.StreamHandler
12
+ stream: 'ext://flask.logging.wsgi_errors_stream'
13
+ log_file:
14
+ level: INFO
15
+ formatter: default
16
+ class: logging.handlers.RotatingFileHandler
17
+ filename: 'logs/server.log'
18
+ mode: 'a'
19
+ maxBytes: 67108864 # 64 MB
20
+ backupCount: 10
21
+ loggers:
22
+ # root logger
23
+ '':
24
+ level: INFO
25
+ handlers:
26
+ - wsgi
27
+ - log_file
28
+ sciencebeam_parser:
29
+ level: DEBUG
30
+ __main__:
31
+ level: DEBUG
32
+ delft:
33
+ level: INFO
34
+ sciencebeam_trainer_delft:
35
+ level: INFO
36
+
37
+ # The download directory for ScienceBeam Parser resources
38
+ # Note: the CV model may download resources to `~/.torch/iopath_cache`,
39
+ # unless FVCORE_CACHE is set. (See `iopath.common.file_io.get_cache_dir`)
40
+ download_dir: '~/.cache/sciencebeam-parser/downloads'
41
+
42
+ pdfalto:
43
+ path: https://github.com/kermitt2/pdfalto/files/6104204/pdfalto-4b4e983413278a07bb4cc4b2836de03adc8ca6dc-dockcross-linux-64.gz
44
+ wapiti:
45
+ install_source: 'https://github.com/kermitt2/Wapiti/archive/5f9a52351fddf21916008daa4becd41d56e7f608.tar.gz'
46
+ xslt:
47
+ tei_to_jats:
48
+ parameters:
49
+ # parameters for tei-to-jats.xsl
50
+ output_parameters: 'false'
51
+ output_bold: 'false'
52
+ output_italic: 'false'
53
+ output_empty_figure_graphic: 'true'
54
+ acknowledgement_target: 'ack'
55
+ annex_target: 'back'
56
+ lookup:
57
+ country:
58
+ paths:
59
+ - https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/countries/CountryCodes.xml
60
+ first_name:
61
+ paths:
62
+ - https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/firstname.5k
63
+ - https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.female
64
+ - https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.male
65
+ last_name:
66
+ paths:
67
+ - https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/lastname.5k
68
+ - https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.family
69
+ processors:
70
+ fulltext:
71
+ merge_raw_authors: false
72
+ use_cv_model: false
73
+ cv_render_dpi: 100
74
+ use_ocr_model: false
75
+ replace_text_by_cv_graphic: false
76
+ max_graphic_distance: 0.3
77
+ models:
78
+ segmentation:
79
+ path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-segmentation-biorxiv-10k-auto-v0.0.23-train-1966-e133.tar.gz'
80
+ use_first_token_of_block: false
81
+ header:
82
+ path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-header-biorxiv-no-word-embedding.tar.gz'
83
+ name_header:
84
+ # path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/grobid-0.6.1/2021-06-21-grobid-0.6.1-name-header-no-word-embedding-no-layout-features-e800.tar.gz'
85
+ path: 'https://github.com/kermitt2/grobid/raw/0.6.0/grobid-home/models/name/header'
86
+ engine: 'wapiti'
87
+ name_citation:
88
+ # path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/grobid-0.6.1/2021-06-28-grobid-0.6.1-name-citation-no-word-embedding-no-layout-features-e500.tar.gz'
89
+ path: 'https://github.com/kermitt2/grobid/raw/0.6.2/grobid-home/models/name/citation'
90
+ engine: 'wapiti'
91
+ affiliation_address:
92
+ path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-affiliation-address-biorxiv-no-word-embedding.tar.gz'
93
+ fulltext:
94
+ path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-fulltext-biorxiv-10k-auto-v0.0.21-train-1986-e159.tar.gz'
95
+ figure:
96
+ path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-figure-biorxiv-10k-auto-v0.0.18-train-1865-e219.tar.gz'
97
+ table:
98
+ path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-table-biorxiv-10k-auto-v0.0.18-train-1865-e569.tar.gz'
99
+ reference_segmenter:
100
+ path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-reference-segmenter-biorxiv-no-word-embedding.tar.gz'
101
+ citation:
102
+ path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-citation-biorxiv-no-word-embedding.tar.gz'
103
+
104
+ cv_models:
105
+ default:
106
+ path: 'lp://efficientdet/PubLayNet'
107
+ engine: 'layout_parser'
108
+ score_threshold: 0.1
109
+
110
+ ocr_models:
111
+ default:
112
+ engine: 'tesserocr'
113
+ lang: 'eng'
114
+ # see https://github.com/sirfz/tesserocr/blob/v2.5.2/tesserocr.pyx#L75-L90
115
+ # (specify literal or int value)
116
+ oem: 'DEFAULT'
117
+ # see https://github.com/sirfz/tesserocr/blob/v2.5.2/tesserocr.pyx#L102-L121
118
+ # (specify literal or int value)
119
+ psm: 'SPARSE_TEXT'
120
+
121
+ doc_to_pdf:
122
+ enabled: true
123
+ listener:
124
+ port: 2003
125
+ process_timeout: 600
126
+ max_uptime: 10
127
+ convert:
128
+ remove_line_no: true
129
+ remove_header_footer: true
130
+ remove_redline: true
131
+
132
+ # preload resources and models on startup
133
+ preload_on_startup: true