Daniel Ecer committed on
Commit ·
4e6720f
1
Parent(s): 2c8a4e0
Copy default config file and set preload to true
Browse files
- Dockerfile +4 -0
- config.yml +133 -0
Dockerfile
CHANGED
|
@@ -1 +1,5 @@
|
|
| 1 |
FROM elifesciences/sciencebeam-parser:0.1.8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
FROM elifesciences/sciencebeam-parser:0.1.8
|
| 2 |
+
|
| 3 |
+
COPY \
|
| 4 |
+
./config.yml \
|
| 5 |
+
/opt/sciencebeam_parser/sciencebeam_parser/resources/default_config/config.yml
|
config.yml
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
logging:
|
| 2 |
+
# Python logging config (passed to dictConfig)
|
| 3 |
+
version: 1
|
| 4 |
+
formatters:
|
| 5 |
+
default:
|
| 6 |
+
format: '[%(asctime)s] %(levelname)s in %(name)s:%(lineno)s: %(message)s'
|
| 7 |
+
handlers:
|
| 8 |
+
wsgi:
|
| 9 |
+
level: INFO
|
| 10 |
+
formatter: default
|
| 11 |
+
class: logging.StreamHandler
|
| 12 |
+
stream: 'ext://flask.logging.wsgi_errors_stream'
|
| 13 |
+
log_file:
|
| 14 |
+
level: INFO
|
| 15 |
+
formatter: default
|
| 16 |
+
class: logging.handlers.RotatingFileHandler
|
| 17 |
+
filename: 'logs/server.log'
|
| 18 |
+
mode: 'a'
|
| 19 |
+
maxBytes: 67108864 # 64 MB
|
| 20 |
+
backupCount: 10
|
| 21 |
+
loggers:
|
| 22 |
+
# root logger
|
| 23 |
+
'':
|
| 24 |
+
level: INFO
|
| 25 |
+
handlers:
|
| 26 |
+
- wsgi
|
| 27 |
+
- log_file
|
| 28 |
+
sciencebeam_parser:
|
| 29 |
+
level: DEBUG
|
| 30 |
+
__main__:
|
| 31 |
+
level: DEBUG
|
| 32 |
+
delft:
|
| 33 |
+
level: INFO
|
| 34 |
+
sciencebeam_trainer_delft:
|
| 35 |
+
level: INFO
|
| 36 |
+
|
| 37 |
+
# The download directory for ScienceBeam Parser resources
|
| 38 |
+
# Note: the CV model may download resources to `~/.torch/iopath_cache`,
|
| 39 |
+
# unless FVCORE_CACHE is set. (See `iopath.common.file_io.get_cache_dir`)
|
| 40 |
+
download_dir: '~/.cache/sciencebeam-parser/downloads'
|
| 41 |
+
|
| 42 |
+
pdfalto:
|
| 43 |
+
path: https://github.com/kermitt2/pdfalto/files/6104204/pdfalto-4b4e983413278a07bb4cc4b2836de03adc8ca6dc-dockcross-linux-64.gz
|
| 44 |
+
wapiti:
|
| 45 |
+
install_source: 'https://github.com/kermitt2/Wapiti/archive/5f9a52351fddf21916008daa4becd41d56e7f608.tar.gz'
|
| 46 |
+
xslt:
|
| 47 |
+
tei_to_jats:
|
| 48 |
+
parameters:
|
| 49 |
+
# parameters for tei-to-jats.xsl
|
| 50 |
+
output_parameters: 'false'
|
| 51 |
+
output_bold: 'false'
|
| 52 |
+
output_italic: 'false'
|
| 53 |
+
output_empty_figure_graphic: 'true'
|
| 54 |
+
acknowledgement_target: 'ack'
|
| 55 |
+
annex_target: 'back'
|
| 56 |
+
lookup:
|
| 57 |
+
country:
|
| 58 |
+
paths:
|
| 59 |
+
- https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/countries/CountryCodes.xml
|
| 60 |
+
first_name:
|
| 61 |
+
paths:
|
| 62 |
+
- https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/firstname.5k
|
| 63 |
+
- https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.female
|
| 64 |
+
- https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.male
|
| 65 |
+
last_name:
|
| 66 |
+
paths:
|
| 67 |
+
- https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/lastname.5k
|
| 68 |
+
- https://raw.githubusercontent.com/kermitt2/grobid/0.6.2/grobid-home/lexicon/names/names.family
|
| 69 |
+
processors:
|
| 70 |
+
fulltext:
|
| 71 |
+
merge_raw_authors: false
|
| 72 |
+
use_cv_model: false
|
| 73 |
+
cv_render_dpi: 100
|
| 74 |
+
use_ocr_model: false
|
| 75 |
+
replace_text_by_cv_graphic: false
|
| 76 |
+
max_graphic_distance: 0.3
|
| 77 |
+
models:
|
| 78 |
+
segmentation:
|
| 79 |
+
path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-segmentation-biorxiv-10k-auto-v0.0.23-train-1966-e133.tar.gz'
|
| 80 |
+
use_first_token_of_block: false
|
| 81 |
+
header:
|
| 82 |
+
path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-header-biorxiv-no-word-embedding.tar.gz'
|
| 83 |
+
name_header:
|
| 84 |
+
# path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/grobid-0.6.1/2021-06-21-grobid-0.6.1-name-header-no-word-embedding-no-layout-features-e800.tar.gz'
|
| 85 |
+
path: 'https://github.com/kermitt2/grobid/raw/0.6.0/grobid-home/models/name/header'
|
| 86 |
+
engine: 'wapiti'
|
| 87 |
+
name_citation:
|
| 88 |
+
# path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/grobid-0.6.1/2021-06-28-grobid-0.6.1-name-citation-no-word-embedding-no-layout-features-e500.tar.gz'
|
| 89 |
+
path: 'https://github.com/kermitt2/grobid/raw/0.6.2/grobid-home/models/name/citation'
|
| 90 |
+
engine: 'wapiti'
|
| 91 |
+
affiliation_address:
|
| 92 |
+
path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-affiliation-address-biorxiv-no-word-embedding.tar.gz'
|
| 93 |
+
fulltext:
|
| 94 |
+
path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-fulltext-biorxiv-10k-auto-v0.0.21-train-1986-e159.tar.gz'
|
| 95 |
+
figure:
|
| 96 |
+
path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-figure-biorxiv-10k-auto-v0.0.18-train-1865-e219.tar.gz'
|
| 97 |
+
table:
|
| 98 |
+
path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-table-biorxiv-10k-auto-v0.0.18-train-1865-e569.tar.gz'
|
| 99 |
+
reference_segmenter:
|
| 100 |
+
path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-reference-segmenter-biorxiv-no-word-embedding.tar.gz'
|
| 101 |
+
citation:
|
| 102 |
+
path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-citation-biorxiv-no-word-embedding.tar.gz'
|
| 103 |
+
|
| 104 |
+
cv_models:
|
| 105 |
+
default:
|
| 106 |
+
path: 'lp://efficientdet/PubLayNet'
|
| 107 |
+
engine: 'layout_parser'
|
| 108 |
+
score_threshold: 0.1
|
| 109 |
+
|
| 110 |
+
ocr_models:
|
| 111 |
+
default:
|
| 112 |
+
engine: 'tesserocr'
|
| 113 |
+
lang: 'eng'
|
| 114 |
+
# see https://github.com/sirfz/tesserocr/blob/v2.5.2/tesserocr.pyx#L75-L90
|
| 115 |
+
# (specify literal or int value)
|
| 116 |
+
oem: 'DEFAULT'
|
| 117 |
+
# see https://github.com/sirfz/tesserocr/blob/v2.5.2/tesserocr.pyx#L102-L121
|
| 118 |
+
# (specify literal or int value)
|
| 119 |
+
psm: 'SPARSE_TEXT'
|
| 120 |
+
|
| 121 |
+
doc_to_pdf:
|
| 122 |
+
enabled: true
|
| 123 |
+
listener:
|
| 124 |
+
port: 2003
|
| 125 |
+
process_timeout: 600
|
| 126 |
+
max_uptime: 10
|
| 127 |
+
convert:
|
| 128 |
+
remove_line_no: true
|
| 129 |
+
remove_header_footer: true
|
| 130 |
+
remove_redline: true
|
| 131 |
+
|
| 132 |
+
# preload resources and models on startup
|
| 133 |
+
preload_on_startup: true
|