Spaces:

lfoppiano
/

grobid

Running

App Files Files Community

lfoppiano committed on Oct 12, 2025

Commit

169ce83

verified ·

1 Parent(s): 5655972

Create grobid.yaml

Browse files

Files changed (1) hide show

grobid.yaml +387 -0

grobid.yaml ADDED Viewed

	@@ -0,0 +1,387 @@

+# this is the configuration file for the GROBID instance
+grobid:
+  # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
+  grobidHome: "grobid-home"
+  # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
+  temp: "tmp"
+  # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
+  nativelibrary: "lib"
+  # PDF parsing settings
+  pdf:
+    pdfalto:
+      # path relative to the grobid-home path (e.g. grobid-home/pdfalto), you don't want to change this normally
+      path: "pdfalto"
+      # safety limits for PDF parsing: memory limit (in MB) and timeout (in seconds), as the key names indicate
+      memoryLimitMb: 6096
+      timeoutSec: 120
+    # safety limits on the PDF parsing result: maximum number of blocks and maximum number of tokens
+    blocksMax: 200000
+    tokensMax: 1000000
+  consolidation:
+    # define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or
+    # "glutton" for https://github.com/kermitt2/biblio-glutton
+    service: "crossref"
+    #service: "glutton"
+    glutton:
+      # URL of the biblio-glutton service (presumably only used when service is set to "glutton")
+      #url: "https://cloud.science-miner.com/glutton"
+      url: "http://localhost:8080"
+    crossref:
+      # to use crossref web API, you need normally to use it politely and to indicate an email address here, e.g.
+      #mailto: "toto@titi.tutu"
+      mailto: "luca@sciencialab.com"
+      # to use Crossref metadata plus service (available by subscription), put the authorization token here, e.g.
+      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
+      # explicit null: no token is configured
+      token: null
+  proxy:
+    # proxy to be used when doing external call to the consolidation service
+    # (explicit nulls: no proxy host/port is configured)
+    host: null
+    port: null
+  # CORS configuration for the GROBID web API service
+  # the "*" value must stay quoted: an unquoted * would be read by YAML as the start of an alias
+  # NOTE(review): "*" presumably allows any origin - confirm this is intended if the service is publicly exposed
+  corsAllowedOrigins: "*"
+  # comma-separated lists of the HTTP methods and request headers allowed for CORS requests
+  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
+  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
+  # the actual implementation for language recognition to be used
+  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
+  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
+  #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
+  sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
+  # maximum concurrency allowed to GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
+  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
+  # to get best performance and security
+  concurrency: 4
+  # when the pool is full, this is the maximum time (in seconds) a query waits for a Grobid engine to become
+  # available - normally never change it
+  poolMaxWait: 1
+  delft:
+    # DeLFT global parameters
+    # delft installation path if Deep Learning architectures are used to implement one of the sequence labeling models,
+    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF are used)
+    install: "../delft"
+    # explicit null: no Python virtual environment is configured
+    pythonVirtualEnv: null
+  wapiti:
+    # Wapiti global parameters
+    # number of threads for training the wapiti models (0 to use all available processors)
+    nbThreads: 0
+  models:
+    # we configure here how each sequence labeling model should be implemented
+    # for feature-engineered CRF, use "wapiti" and possible training parameters are window, epsilon and nbMaxIterations
+    # for Deep Learning, use "delft" and select the target DL architecture (see DeLFT library), the training
+    # parameters then depend on this selected DL architecture
+    - name: "segmentation"
+      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 3000
+          batch_size: 1
+        training:
+          # parameters used for training
+          max_sequence_length: 3000
+          batch_size: 10
+    - name: "segmentation-article-light"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+    - name: "segmentation-article-light-ref"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+    - name: "segmentation-sdo-ietf"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0000001
+        window: 50
+        nbMaxIterations: 2000
+    - name: "fulltext"
+      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0001
+        window: 20
+        nbMaxIterations: 1500
+    - name: "header"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        #transformer: "allenai/scibert_scivocab_cased"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          #max_sequence_length: 510
+          max_sequence_length: 3000
+          batch_size: 1
+        training:
+          # parameters used for training
+          #max_sequence_length: 510
+          #batch_size: 6
+          max_sequence_length: 3000
+          batch_size: 9
+    - name: "header-article-light"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        useELMo: false
+    - name: "header-article-light-ref"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        useELMo: false
+    - name: "header-sdo-ietf"
+      engine: "wapiti"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.000001
+        window: 30
+        nbMaxIterations: 1500
+    - name: "reference-segmenter"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_ChainCRF_FEATURES"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction (for this model, use same max_sequence_length as training)
+          max_sequence_length: 3000
+          batch_size: 2
+        training:
+          # parameters used for training
+          max_sequence_length: 3000
+          batch_size: 10
+    - name: "name-header"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+    - name: "name-citation"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+    - name: "date"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+    - name: "figure"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF"
+    - name: "table"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF"
+    - name: "affiliation-address"
+      engine: "wapiti"
+      #engine: "delft"
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+    - name: "citation"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 50
+        nbMaxIterations: 3000
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        #architecture: "BERT_CRF"
+        #transformer: "michiyasunaga/LinkBERT-base"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 500
+          batch_size: 30
+        training:
+          # parameters used for training
+          max_sequence_length: 500
+          batch_size: 50
+    - name: "patent-citation"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.0001
+        window: 20
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        #architecture: "BERT_CRF"
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 800
+          batch_size: 20
+        training:
+          # parameters used for training
+          max_sequence_length: 1000
+          batch_size: 40
+    - name: "funding-acknowledgement"
+      engine: "wapiti"
+      #engine: "delft"
+      wapiti:
+        # wapiti training parameters, they will be used at training time only
+        epsilon: 0.00001
+        window: 50
+        nbMaxIterations: 2000
+      delft:
+        # deep learning parameters
+        architecture: "BidLSTM_CRF_FEATURES"
+        #architecture: "BERT_CRF"
+        #transformer: "michiyasunaga/LinkBERT-base"
+        useELMo: false
+        runtime:
+          # parameters used at runtime/prediction
+          max_sequence_length: 800
+          batch_size: 20
+        training:
+          # parameters used for training
+          max_sequence_length: 500
+          batch_size: 40
+    - name: "copyright"
+      # at this time, we only have a DeLFT implementation,
+      # use "wapiti" if the deep learning library JNI is not available and model will then be ignored
+      #engine: "delft"
+      engine: "wapiti"
+      delft:
+        # deep learning parameters
+        architecture: "gru"
+        #architecture: "bert"
+        #transformer: "allenai/scibert_scivocab_cased"
+    - name: "license"
+      # at this time, for being active, it must be DeLFT, no other implementation is available
+      # use "wapiti" if the deep learning library JNI is not available and model will then be ignored
+      #engine: "delft"
+      engine: "wapiti"
+      delft:
+        # deep learning parameters
+        architecture: "gru"
+        #architecture: "bert"
+        #transformer: "allenai/scibert_scivocab_cased"
+  # for **service only**: how to load the models,
+  # false -> models are loaded when needed, which avoids keeping unused models in memory (only in case of CRF) but
+  #          significantly slows down the service at first call
+  # true -> all the models are loaded into memory at the server startup (default), which slows the start of the service
+  #         and unused models will take some more memory (only in case of CRF), but the server is immediately warm and ready
+  modelPreload: true
+# HTTP server settings of the GROBID service
+server:
+  type: custom
+  # connector serving the application, HTTP on port 8070
+  applicationConnectors:
+    - type: http
+      port: 8070
+  # connector serving the admin interface, HTTP on port 8071
+  adminConnectors:
+    - type: http
+      port: 8071
+  registerDefaultExceptionMappers: false
+  # change the following for having all http requests logged
+  # (the appenders list is left empty here)
+  requestLog:
+    appenders: []
+# these logging settings apply to the Grobid service usage mode
+logging:
+  # top-level log level
+  level: INFO
+  # per-logger levels; "OFF" is kept quoted so that YAML 1.1 parsers do not read it as the boolean false
+  loggers:
+    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
+    org.glassfish.jersey.internal: "OFF"
+    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
+  # log outputs: a single console appender with threshold INFO and timestamps in UTC
+  appenders:
+    - type: console
+      threshold: INFO
+      timeZone: UTC
+      # uncomment to have the logs in json format
+      #layout:
+      #  type: json