logging:
  # Python logging config (passed to dictConfig)
  version: 1
  formatters:
    default:
      format: '[%(asctime)s] %(levelname)s in %(name)s:%(lineno)s: %(message)s'
  handlers:
    # INFO and above, written to stdout
    console:
      level: INFO
      formatter: default
      class: logging.StreamHandler
      stream: ext://sys.stdout
    # INFO and above, appended to a size-rotated file
    log_file:
      level: INFO
      formatter: default
      class: logging.handlers.RotatingFileHandler
      filename: 'logs/server.log'
      mode: 'a'
      maxBytes: 67108864  # 64 MB
      backupCount: 10
  loggers:
    # root logger
    '':
      level: INFO
      handlers:
        - console
        - log_file
    sciencebeam_parser:
      level: DEBUG
    __main__:
      level: DEBUG
    delft:
      level: INFO
    sciencebeam_trainer_delft:
      level: INFO

# The download directory for ScienceBeam Parser resources
# Note: the CV model may download resources to `~/.torch/iopath_cache`,
#   unless FVCORE_CACHE is set. (See `iopath.common.file_io.get_cache_dir`)
download_dir: '/data/.sciencebeam-parser/.cache'

# Location of the pdfalto binary archive (gzip, linux-64 build per the file name)
pdfalto:
  path: https://github.com/kermitt2/pdfalto/files/6104204/pdfalto-4b4e983413278a07bb4cc4b2836de03adc8ca6dc-dockcross-linux-64.gz

# Wapiti source archive, pinned to a specific commit
wapiti:
  install_source: 'https://github.com/kermitt2/Wapiti/archive/a9c25d2bcccd60f1a54a7019689bd5229e866f00.tar.gz'

xslt:
  tei_to_jats:
    parameters:
      # parameters for tei-to-jats.xsl
      # (values are deliberately quoted strings, not YAML booleans)
      output_parameters: 'false'
      output_bold: 'false'
      output_italic: 'false'
      output_empty_figure_graphic: 'true'
      acknowledgement_target: 'ack'
      annex_target: 'back'

# Lexicon files, all taken from the GROBID 0.8.2 tree
lookup:
  country:
    paths:
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/countries/CountryCodes.xml
  first_name:
    paths:
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/firstname.5k
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/names.female
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/names.male
  last_name:
    paths:
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/lastname.5k
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/names.family
processors:
  fulltext:
    merge_raw_authors: false
    use_cv_model: false
    cv_render_dpi: 100
    use_ocr_model: false
    replace_text_by_cv_graphic: false
    max_graphic_distance: 0.3

# Sequence-labelling models, one entry per model name.
# Each entry has a `path`; some also pin `engine: 'wapiti'`.
# NOTE(review): entries without `engine` rely on the consumer's default engine
#   (presumably DeLFT, going by the archive names) - TODO confirm.
# NOTE(review): the GROBID-hosted models reference different tags
#   (0.6.2 for segmentation / name_citation, 0.6.0 for name_header) - confirm
#   this mix is intentional.
models:
  segmentation:
    path: 'https://github.com/kermitt2/grobid/raw/refs/tags/0.6.2/grobid-home/models/segmentation'
    engine: 'wapiti'
    use_first_token_of_block: false
  header:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-header-biorxiv-no-word-embedding.tar.gz'
  name_header:
    # path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/grobid-0.6.1/2021-06-21-grobid-0.6.1-name-header-no-word-embedding-no-layout-features-e800.tar.gz'
    path: 'https://github.com/kermitt2/grobid/raw/0.6.0/grobid-home/models/name/header'
    engine: 'wapiti'
  name_citation:
    # path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/grobid-0.6.1/2021-06-28-grobid-0.6.1-name-citation-no-word-embedding-no-layout-features-e500.tar.gz'
    path: 'https://github.com/kermitt2/grobid/raw/0.6.2/grobid-home/models/name/citation'
    engine: 'wapiti'
  affiliation_address:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-affiliation-address-biorxiv-no-word-embedding.tar.gz'
  fulltext:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-fulltext-biorxiv-10k-auto-v0.0.21-train-1986-e159.tar.gz'
  figure:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-figure-biorxiv-10k-auto-v0.0.18-train-1865-e219.tar.gz'
  table:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-table-biorxiv-10k-auto-v0.0.18-train-1865-e569.tar.gz'
  reference_segmenter:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-reference-segmenter-biorxiv-no-word-embedding.tar.gz'
  citation:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-citation-biorxiv-no-word-embedding.tar.gz'

# Computer-vision (layout) models
# (`processors.fulltext.use_cv_model` is false in this file)
cv_models:
  default:
    path: 'lp://efficientdet/PubLayNet'
    engine: 'layout_parser'
    score_threshold: 0.1

# OCR models
# (`processors.fulltext.use_ocr_model` is false in this file)
ocr_models:
  default:
    engine: 'tesserocr'
    lang: 'eng'
    # see https://github.com/sirfz/tesserocr/blob/v2.5.2/tesserocr.pyx#L75-L90
    # (specify literal or int value)
    oem: 'DEFAULT'
    # see https://github.com/sirfz/tesserocr/blob/v2.5.2/tesserocr.pyx#L102-L121
    # (specify literal or int value)
    psm: 'SPARSE_TEXT'

# Document-to-PDF conversion
# NOTE(review): units of `process_timeout` / `max_uptime` are not stated in
#   this file - TODO confirm against the consumer.
doc_to_pdf:
  enabled: true
  listener:
    port: 2003
    process_timeout: 600
    max_uptime: 10
  convert:
    remove_line_no: true
    remove_header_footer: true
    remove_redline: true

# preload resources and models on startup
preload_on_startup: true