# Logging configuration.
# NOTE(review): the layout (version / formatters / handlers / loggers) matches
# Python's `logging.config.dictConfig` schema - confirm against the loader.
logging:
  version: 1
  formatters:
    default:
      format: '[%(asctime)s] %(levelname)s in %(name)s:%(lineno)s: %(message)s'
  handlers:
    # Stream handler targeting sys.stdout.
    console:
      level: INFO
      formatter: default
      class: logging.StreamHandler
      stream: ext://sys.stdout
    # Rotating file handler: 67108864 bytes (64 MiB) per file, 10 backups.
    log_file:
      level: INFO
      formatter: default
      class: logging.handlers.RotatingFileHandler
      filename: 'logs/server.log'
      mode: 'a'
      maxBytes: 67108864
      backupCount: 10
  loggers:
    # The empty logger name is the only entry that attaches handlers;
    # the named loggers below only set a level.
    '':
      level: INFO
      handlers:
        - console
        - log_file
    sciencebeam_parser:
      level: DEBUG
    __main__:
      level: DEBUG
    delft:
      level: INFO
    sciencebeam_trainer_delft:
      level: INFO
# Directory used for downloads.
# NOTE(review): presumably the cache for the remote URLs configured in the
# other sections of this file - confirm against the consumer.
download_dir: '/data/.sciencebeam-parser/.cache'
# Location of the pdfalto binary (gzip-compressed Linux 64-bit build, per the
# file name in the URL).
pdfalto:
  path: 'https://github.com/kermitt2/pdfalto/files/6104204/pdfalto-4b4e983413278a07bb4cc4b2836de03adc8ca6dc-dockcross-linux-64.gz'
# Wapiti source archive, pinned to a specific commit hash.
wapiti:
  install_source: 'https://github.com/kermitt2/Wapiti/archive/a9c25d2bcccd60f1a54a7019689bd5229e866f00.tar.gz'
# XSLT settings for the TEI to JATS conversion.
xslt:
  tei_to_jats:
    parameters:
      # Values are quoted on purpose so they load as strings ('true'/'false')
      # rather than YAML booleans.
      output_parameters: 'false'
      output_bold: 'false'
      output_italic: 'false'
      output_empty_figure_graphic: 'true'
      acknowledgement_target: 'ack'
      annex_target: 'back'
# Lexicon files used for lookups, all taken from the GROBID 0.8.2 tree.
lookup:
  country:
    paths:
      - 'https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/countries/CountryCodes.xml'
  first_name:
    paths:
      - 'https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/firstname.5k'
      - 'https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/names.female'
      - 'https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/names.male'
  last_name:
    paths:
      - 'https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/lastname.5k'
      - 'https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/names.family'
# Options for the fulltext processor.
# The computer-vision (cv) and OCR model options are switched off here.
processors:
  fulltext:
    merge_raw_authors: false
    use_cv_model: false
    cv_render_dpi: 100
    use_ocr_model: false
    replace_text_by_cv_graphic: false
    max_graphic_distance: 0.3
# Sequence model locations, one entry per model name.
# Entries with `engine: 'wapiti'` point at GROBID model directories; the
# remaining entries point at sciencebeam-models release archives and do not
# set an engine.
models:
  segmentation:
    path: 'https://github.com/kermitt2/grobid/raw/refs/tags/0.6.2/grobid-home/models/segmentation'
    engine: 'wapiti'
    use_first_token_of_block: false
  header:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-header-biorxiv-no-word-embedding.tar.gz'
  name_header:
    # NOTE(review): pinned to GROBID 0.6.0 while the other wapiti models in
    # this section use 0.6.2 - confirm this is intentional.
    path: 'https://github.com/kermitt2/grobid/raw/0.6.0/grobid-home/models/name/header'
    engine: 'wapiti'
  name_citation:
    path: 'https://github.com/kermitt2/grobid/raw/0.6.2/grobid-home/models/name/citation'
    engine: 'wapiti'
  affiliation_address:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-affiliation-address-biorxiv-no-word-embedding.tar.gz'
  fulltext:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-fulltext-biorxiv-10k-auto-v0.0.21-train-1986-e159.tar.gz'
  figure:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-figure-biorxiv-10k-auto-v0.0.18-train-1865-e219.tar.gz'
  table:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-table-biorxiv-10k-auto-v0.0.18-train-1865-e569.tar.gz'
  reference_segmenter:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-reference-segmenter-biorxiv-no-word-embedding.tar.gz'
  citation:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-citation-biorxiv-no-word-embedding.tar.gz'
# Computer-vision model definitions.
cv_models:
  default:
    path: 'lp://efficientdet/PubLayNet'
    engine: 'layout_parser'
    # NOTE(review): presumably the minimum detection score to keep - confirm.
    score_threshold: 0.1
# OCR model definitions.
ocr_models:
  default:
    engine: 'tesserocr'
    lang: 'eng'
    # NOTE(review): `oem` / `psm` look like Tesseract's OCR engine mode and
    # page segmentation mode names - confirm against the consumer.
    oem: 'DEFAULT'
    psm: 'SPARSE_TEXT'
# Document to PDF conversion settings.
doc_to_pdf:
  enabled: true
  # NOTE(review): `process_timeout` and `max_uptime` are placed under
  # `listener` based on key order; units are not stated here - confirm both.
  listener:
    port: 2003
    process_timeout: 600
    max_uptime: 10
  convert:
    remove_line_no: true
    remove_header_footer: true
    remove_redline: true
# NOTE(review): presumably loads the configured models at application start
# rather than on first use - confirm against the consumer.
preload_on_startup: true