File size: 5,249 Bytes
4e6720f
 
 
 
 
 
 
f23af6d
4e6720f
 
 
f23af6d
4e6720f
 
 
 
 
 
 
 
 
 
 
 
 
f23af6d
4e6720f
 
 
 
 
 
 
 
 
 
 
 
 
8091970
4e6720f
 
 
 
f23af6d
4e6720f
 
 
 
 
 
 
 
 
 
 
 
 
d946a39
4e6720f
 
d946a39
 
 
4e6720f
 
d946a39
 
4e6720f
 
 
 
 
 
 
 
 
 
b1a6583
69f2423
4e6720f
 
42040aa
4e6720f
42040aa
 
4e6720f
 
42040aa
4e6720f
 
 
42040aa
4e6720f
42040aa
4e6720f
42040aa
4e6720f
42040aa
4e6720f
42040aa
4e6720f
42040aa
4e6720f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Logging setup. Everything below `logging:` is a Python logging dictConfig
# mapping (schema version 1): one formatter, two handlers, and per-logger
# levels.
logging:
  # Python logging config (passed to dictConfig)
  # NOTE(review): `disable_existing_loggers` is not set here; dictConfig treats
  # a missing value as true - confirm that this is the intended behaviour.
  version: 1
  formatters:
    default:
      # Renders as: [<time>] <LEVEL> in <logger name>:<line number>: <message>
      format: '[%(asctime)s] %(levelname)s in %(name)s:%(lineno)s: %(message)s'
  handlers:
    # Writes formatted records to standard output.
    # NOTE(review): both handlers are set to INFO, so DEBUG records from the
    # loggers configured with `level: DEBUG` below will not be emitted by them.
    console:
      level: INFO
      formatter: default
      class: logging.StreamHandler
      stream: ext://sys.stdout
    # Appends to a size-rotated log file.
    # NOTE(review): the file name is a relative path, so it depends on the
    # working directory of the process; presumably the `logs` directory must
    # already exist - confirm.
    log_file:
      level: INFO
      formatter: default
      class: logging.handlers.RotatingFileHandler
      filename: 'logs/server.log'
      mode: 'a'
      # Rotate once the file reaches 64 * 1024 * 1024 bytes.
      maxBytes: 67108864  # 64 MB
      # Number of rotated files kept in addition to the active one.
      backupCount: 10
  loggers:
    # root logger
    # (the empty-string key; it is the only logger with handlers attached in
    # this file)
    '':
      level: INFO
      handlers:
        - console
        - log_file
    # The loggers below only set a level; no handlers are listed for them here.
    sciencebeam_parser:
      level: DEBUG
    __main__:
      level: DEBUG
    delft:
      level: INFO
    sciencebeam_trainer_delft:
      level: INFO

# The download directory for ScienceBeam Parser resources.
# Note: the CV model may download resources to `~/.torch/iopath_cache`
# unless FVCORE_CACHE is set (see `iopath.common.file_io.get_cache_dir`).
# NOTE(review): this is an absolute path under `/data`; presumably it must be
# writable by the server process - confirm for the deployment target.
download_dir: '/data/.sciencebeam-parser/.cache'

# Location of the pdfalto binary.
# The URL points at a gzip-compressed (`.gz`) linux-64 build attached to the
# kermitt2/pdfalto GitHub project; the file name embeds a 40-character hex id
# (presumably the pdfalto commit it was built from - confirm).
pdfalto:
  path: https://github.com/kermitt2/pdfalto/files/6104204/pdfalto-4b4e983413278a07bb4cc4b2836de03adc8ca6dc-dockcross-linux-64.gz
# Source archive for Wapiti.
# The URL is a GitHub source tarball of kermitt2/Wapiti pinned to a specific
# commit hash (presumably built from source at install time - confirm).
wapiti:
  install_source: 'https://github.com/kermitt2/Wapiti/archive/a9c25d2bcccd60f1a54a7019689bd5229e866f00.tar.gz'
# XSLT settings for the TEI to JATS conversion.
xslt:
  tei_to_jats:
    parameters:
      # parameters for tei-to-jats.xsl
      # All values are quoted, so they load as strings ('true' / 'false'),
      # not as YAML booleans. Keep the quotes when editing.
      output_parameters: 'false'
      output_bold: 'false'
      output_italic: 'false'
      output_empty_figure_graphic: 'true'
      # NOTE(review): the two `*_target` values look like names of JATS
      # elements (`ack`, `back`) - confirm against tei-to-jats.xsl.
      acknowledgement_target: 'ack'
      annex_target: 'back'
# Lexicon files used for lookups, grouped by kind. Each kind lists one or more
# source files; all of them come from the GROBID repository (`grobid-home/lexicon`)
# and are pinned to the 0.8.2 tag.
lookup:
  # Country codes (XML).
  country:
    paths:
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/countries/CountryCodes.xml
  # First-name lists.
  first_name:
    paths:
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/firstname.5k
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/names.female
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/names.male
  # Last-name / family-name lists.
  last_name:
    paths:
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/lastname.5k
      - https://raw.githubusercontent.com/kermitt2/grobid/0.8.2/grobid-home/lexicon/names/names.family
# Processor options. Only the `fulltext` processor is configured here.
# NOTE(review): the exact semantics of each flag are defined by the consuming
# code, which is not visible from this file; the notes below are read from the
# key names only.
processors:
  fulltext:
    merge_raw_authors: false
    # The computer vision and OCR models are both switched off. The model
    # definitions themselves live under the top-level `cv_models` and
    # `ocr_models` keys.
    use_cv_model: false
    # presumably the resolution used when rendering pages for the CV model;
    # only relevant when `use_cv_model` is true - confirm
    cv_render_dpi: 100
    use_ocr_model: false
    replace_text_by_cv_graphic: false
    # NOTE(review): unit / scale of this distance is not stated here - confirm
    max_graphic_distance: 0.3
# Sequence model definitions, one entry per model name.
# Each entry has a `path` (a URL to the model files or archive) and optionally
# an `engine`. Entries with `engine: 'wapiti'` point at model directories in the
# GROBID repository; the other entries point at `*-delft-*.tar.gz` release
# archives from elifesciences/sciencebeam-models and do not set an engine
# (presumably the consumer's default engine is used - confirm).
# NOTE(review): the GROBID-hosted models are pinned to different tags
# (segmentation and name_citation: 0.6.2, name_header: 0.6.0), and segmentation
# uses the `raw/refs/tags/<tag>` URL form while the others use `raw/<tag>` -
# confirm that the mix is intentional.
models:
  segmentation:
    path: 'https://github.com/kermitt2/grobid/raw/refs/tags/0.6.2/grobid-home/models/segmentation'
    engine: 'wapiti'
    use_first_token_of_block: false
  header:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-header-biorxiv-no-word-embedding.tar.gz'
  name_header:
    # Alternative model archive, currently disabled:
    # path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/grobid-0.6.1/2021-06-21-grobid-0.6.1-name-header-no-word-embedding-no-layout-features-e800.tar.gz'
    path: 'https://github.com/kermitt2/grobid/raw/0.6.0/grobid-home/models/name/header'
    engine: 'wapiti'
  name_citation:
    # Alternative model archive, currently disabled:
    # path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/grobid-0.6.1/2021-06-28-grobid-0.6.1-name-citation-no-word-embedding-no-layout-features-e500.tar.gz'
    path: 'https://github.com/kermitt2/grobid/raw/0.6.2/grobid-home/models/name/citation'
    engine: 'wapiti'
  affiliation_address:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-affiliation-address-biorxiv-no-word-embedding.tar.gz'
  fulltext:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-fulltext-biorxiv-10k-auto-v0.0.21-train-1986-e159.tar.gz'
  figure:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-figure-biorxiv-10k-auto-v0.0.18-train-1865-e219.tar.gz'
  table:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/biorxiv-grobid/2021-05-11-delft-grobid-table-biorxiv-10k-auto-v0.0.18-train-1865-e569.tar.gz'
  reference_segmenter:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-reference-segmenter-biorxiv-no-word-embedding.tar.gz'
  citation:
    path: 'https://github.com/elifesciences/sciencebeam-models/releases/download/v0.0.1/2020-10-04-delft-grobid-citation-biorxiv-no-word-embedding.tar.gz'

# Computer vision model definitions. Only a `default` entry is defined.
# Whether the model is used at all is controlled separately by
# `processors.fulltext.use_cv_model`.
cv_models:
  default:
    # `lp://` model identifier handled by the `layout_parser` engine
    # (EfficientDet trained on PubLayNet, going by the name).
    path: 'lp://efficientdet/PubLayNet'
    engine: 'layout_parser'
    # presumably detections scoring below this value are discarded - confirm
    score_threshold: 0.1

# OCR model definitions. Only a `default` entry is defined.
# Whether OCR is used at all is controlled separately by
# `processors.fulltext.use_ocr_model`.
ocr_models:
  default:
    engine: 'tesserocr'
    # Language passed to the OCR engine.
    lang: 'eng'
    # OCR engine mode (OEM).
    # see https://github.com/sirfz/tesserocr/blob/v2.5.2/tesserocr.pyx#L75-L90
    # (specify literal or int value)
    oem: 'DEFAULT'
    # Page segmentation mode (PSM).
    # see https://github.com/sirfz/tesserocr/blob/v2.5.2/tesserocr.pyx#L102-L121
    # (specify literal or int value)
    psm: 'SPARSE_TEXT'

# Conversion of documents to PDF (enabled here).
doc_to_pdf:
  enabled: true
  # Settings for the listener process used for the conversion.
  # NOTE(review): the units of `process_timeout` and `max_uptime` are not stated
  # in this file - confirm against the consuming code before changing them.
  listener:
    port: 2003
    process_timeout: 600
    max_uptime: 10
  # Clean-up options applied during conversion; all three are switched on.
  convert:
    remove_line_no: true
    remove_header_footer: true
    remove_redline: true

# Preload resources and models on startup (enabled here).
preload_on_startup: true