pub2tei-dev

Runtime error

App Files Files Community

lfoppiano committed on May 10, 2024

Commit

ad4e920

verified ·

1 Parent(s): bf58cda

Delete grobid.yaml

Browse files

Files changed (1) hide show

grobid.yaml +0 -331

grobid.yaml DELETED Viewed

@@ -1,331 +0,0 @@
-# this is the configuration file for the GROBID instance
-grobid:
-  # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
-  grobidHome: "grobid-home"
-  # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
-  temp: "tmp"
-  # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
-  nativelibrary: "lib"
-  pdf:
-    pdfalto:
-      # path relative to the grobid-home path (e.g. grobid-home/pdfalto), you don't want to change this normally
-      path: "pdfalto"
-      # security for PDF parsing
-      memoryLimitMb: 6096
-      timeoutSec: 120
-    # security relative to the PDF parsing result
-    blocksMax: 200000
-    tokensMax: 1000000
-  consolidation:
-    # define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or
-    # "glutton" for https://github.com/kermitt2/biblio-glutton
-    service: "crossref"
-    #service: "glutton"
-    glutton:
-      url: "https://cloud.science-miner.com/glutton"
-      #url: "http://localhost:8080"
-    crossref:
-      mailto:
-      # to use the Crossref web API, you should normally use it politely and indicate an email address here, e.g.
-      #mailto: "toto@titi.tutu"
-      token:
-      # to use Crossref metadata plus service (available by subscription)
-      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
-  proxy:
-    # proxy to be used when making external calls to the consolidation service
-    host:
-    port:
-  # CORS configuration for the GROBID web API service
-  corsAllowedOrigins: "*"
-  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
-  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
-  # the actual implementation for language recognition to be used
-  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
-  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
-  sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
-  # sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
-  # maximum concurrency allowed to GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
-  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
-  # to get best performance and security
-  concurrency: 10
-  # when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time to wait
-  # when trying to get an engine (in seconds) - normally never change it
-  poolMaxWait: 1
-  delft:
-    # DeLFT global parameters
-    # delft installation path if Deep Learning architectures are used to implement one of the sequence labeling model,
-    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF are used)
-    install: "../delft"
-    pythonVirtualEnv:
-  wapiti:
-    # Wapiti global parameters
-    # number of threads for training the wapiti models (0 to use all available processors)
-    nbThreads: 0
-  models:
-    # we configure here how each sequence labeling model should be implemented
-    # for feature-engineered CRF, use "wapiti" and possible training parameters are window, epsilon and nbMaxIterations
-    # for Deep Learning, use "delft" and select the target DL architecture (see DeLFT library), the training
-    # parameters then depend on this selected DL architecture
-    - name: "segmentation"
-      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, they will be used at training time only
-        epsilon: 0.0000001
-        window: 50
-        nbMaxIterations: 2000
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction
-          max_sequence_length: 3000
-          batch_size: 1
-        training:
-          # parameters used for training
-          max_sequence_length: 3000
-          batch_size: 10
-    - name: "fulltext"
-      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
-      engine: "wapiti"
-      wapiti:
-        # wapiti training parameters, they will be used at training time only
-        epsilon: 0.0001
-        window: 20
-        nbMaxIterations: 1500
-    - name: "header"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, they will be used at training time only
-        epsilon: 0.000001
-        window: 30
-        nbMaxIterations: 1500
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_ChainCRF_FEATURES"
-        #transformer: "allenai/scibert_scivocab_cased"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction
-          #max_sequence_length: 510
-          max_sequence_length: 3000
-          batch_size: 1
-        training:
-          # parameters used for training
-          #max_sequence_length: 510
-          #batch_size: 6
-          max_sequence_length: 3000
-          batch_size: 9
-    - name: "reference-segmenter"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, they will be used at training time only
-        epsilon: 0.00001
-        window: 20
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_ChainCRF_FEATURES"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction (for this model, use same max_sequence_length as training)
-          max_sequence_length: 3000
-          batch_size: 2
-        training:
-          # parameters used for training
-          max_sequence_length: 3000
-          batch_size: 10
-    - name: "name-header"
-      engine: "wapiti"
-      #engine: "delft"
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-    - name: "name-citation"
-      engine: "wapiti"
-      #engine: "delft"
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-    - name: "date"
-      engine: "wapiti"
-      #engine: "delft"
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-    - name: "figure"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, they will be used at training time only
-        epsilon: 0.00001
-        window: 20
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF"
-    - name: "table"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, they will be used at training time only
-        epsilon: 0.00001
-        window: 20
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF"
-    - name: "affiliation-address"
-      engine: "wapiti"
-      #engine: "delft"
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-    - name: "citation"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, they will be used at training time only
-        epsilon: 0.00001
-        window: 50
-        nbMaxIterations: 3000
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-        #architecture: "BERT_CRF"
-        #transformer: "michiyasunaga/LinkBERT-base"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction
-          max_sequence_length: 500
-          batch_size: 30
-        training:
-          # parameters used for training
-          max_sequence_length: 500
-          batch_size: 50
-    - name: "patent-citation"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, they will be used at training time only
-        epsilon: 0.0001
-        window: 20
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-        #architecture: "BERT_CRF"
-        runtime:
-          # parameters used at runtime/prediction
-          max_sequence_length: 800
-          batch_size: 20
-        training:
-          # parameters used for training
-          max_sequence_length: 1000
-          batch_size: 40
-    - name: "funding-acknowledgement"
-      engine: "wapiti"
-      #engine: "delft"
-      wapiti:
-        # wapiti training parameters, they will be used at training time only
-        epsilon: 0.00001
-        window: 50
-        nbMaxIterations: 2000
-      delft:
-        # deep learning parameters
-        architecture: "BidLSTM_CRF_FEATURES"
-        #architecture: "BERT_CRF"
-        #transformer: "michiyasunaga/LinkBERT-base"
-        useELMo: false
-        runtime:
-          # parameters used at runtime/prediction
-          max_sequence_length: 800
-          batch_size: 20
-        training:
-          # parameters used for training
-          max_sequence_length: 500
-          batch_size: 40
-    - name: "copyright"
-      # at this time, we only have a DeLFT implementation,
-      # use "wapiti" if the deep learning library JNI is not available and model will then be ignored
-      #engine: "delft"
-      engine: "wapiti"
-      delft:
-        # deep learning parameters
-        architecture: "gru"
-        #architecture: "bert"
-        #transformer: "allenai/scibert_scivocab_cased"
-    - name: "license"
-      # at this time, for being active, it must be DeLFT, no other implementation is available
-      # use "wapiti" if the deep learning library JNI is not available and model will then be ignored
-      #engine: "delft"
-      engine: "wapiti"
-      delft:
-        # deep learning parameters
-        architecture: "gru"
-        #architecture: "bert"
-        #transformer: "allenai/scibert_scivocab_cased"
-  # for **service only**: how to load the models,
-  # false -> models are loaded when needed, which avoids keeping unused models in memory (only in case of CRF) but
-  #          significantly slows down the service at first call
-  # true -> all the models are loaded into memory at server startup (default), which slows the start of the service,
-  #         and unused models will take some more memory (only in case of CRF), but the server is immediately warm and ready
-  modelPreload: true
-server:
-    type: custom
-    applicationConnectors:
-    - type: http
-      port: 8070
-    adminConnectors:
-    - type: http
-      port: 8071
-    registerDefaultExceptionMappers: false
-    # change the following to have all HTTP requests logged
-    requestLog:
-      appenders: []
-# these logging settings apply to the Grobid service usage mode
-logging:
-  level: INFO
-  loggers:
-    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
-    org.glassfish.jersey.internal: "OFF"
-    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
-  appenders:
-    - type: console
-      threshold: INFO
-      timeZone: UTC
-      # uncomment to have the logs in json format
-      # layout:
-       # type: json