Docgenie-API / docgenie /generation /constants.py
Ahadhassan-2003
deploy: update HF Space
dc4e6da
# --- Seed images -------------------------------------------------------------
# NOTE(review): presumably a maximum width and an encoder quality setting
# applied to seed images; the units are not visible here -- confirm at usage.
SEED_IMAGE_MAX_WIDTH: int = 500
SEED_IMAGE_QUALITY: int = 80

# --- Pipeline 01: retrieve document HTML --------------------------------------
# Prompt batching limits; the sleep values are in seconds (per their names).
PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_MAX_SIZE: int = 50
PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_POLL_SLEEP_SECONDS_BETWEEN_BATCH: int = 5
PIPELINE_01_RETRIEVE_DOCUMENT_HTML__PROMPT_BATCH_POLL_SLEEP_SECONDS_BETWEEN_ITERATION: int = 5

# --- Pipeline 03: render PDF --------------------------------------------------
# NOTE(review): the timeout unit is not stated in this file (presumably
# seconds) -- confirm against the renderer.
PIPELINE_03_RENDER_PDF__MAX_WORKERS: int = 8
PIPELINE_03_RENDER_PDF__CHROMIUM_CONCURRENCY: int = 10
PIPELINE_03_RENDER_PDF__PER_PDF_RENDER_TIMEOUT: int = 30
PIPELINE_03_RENDER_PDF__PER_PDF_RENDER_MAX_RETRIES: int = 2

# --- Pipeline 04.3 ------------------------------------------------------------
PIPELINE_04_3_SCALE_UP_FACTOR: int = 3

# --- Pipeline 06 --------------------------------------------------------------
# Two differently named stages share the "06" prefix in this file.
PIPELINE_06_GT_VERIFICATION__GT_SIMILARITY_CUTOFF: float = 0.75
# NOTE(review): -1 presumably means "no limit" -- confirm against the consumer.
PIPELINE_06_EXTRACT_HANDWRITING__MAX_WORD_LEN: int = -1

# --- HTML parsing -------------------------------------------------------------
# Alternative value that was noted alongside the original setting: "html.parser".
BS_PARSER: str = "lxml"
# --- Rendering ----------------------------------------------------------------
IMAGE_RENDER_EXT: str = "png"
PDF_DPI: int = 200

# Bounding boxes read from the PDF (and probably also those retrieved via OCR)
# don't fit exactly into the geo extracted via JavaScript.
# NOTE(review): presumably the tolerance used when matching the two; the unit
# is not stated in this file -- confirm against the matching code.
BBOX_TO_GEO_MATCHING_THRESHOLD: int = 25

# --- Handwriting --------------------------------------------------------------
HANDWRITING_DEFAULT_BATCH_SIZE: int = 256
HANDWRITING_CLASS_NAME: str = "handwritten"
SIGNATURE_CLASS_NAME: str = "signature"
# Stored as a string, not a number; keep it that way for existing consumers.
HANDWRITING_FONT_SIZE: str = "26"

# Place all handwritten text 2px to the right to look better.
FIXED_HANDWRITING_X_OFFSET: int = 2

# Upper bounds (per their names) for the random x/y offsets and the random
# rotation in degrees applied to handwritten text.
MAX_HANDWRITING_RAND_X_OFFSET_LEFT: int = 1
MAX_HANDWRITING_RAND_X_OFFSET_RIGHT: int = 2
MAX_HANDWRITING_RAND_Y_OFFSET_UP: int = 1
MAX_HANDWRITING_RAND_Y_OFFSET_DOWN: int = 2
MAX_HANDWRITING_RAND_DEG_ROT: int = 1
# NOTE(review): presumably identifiers of the writer styles available to the
# handwriting generator -- confirm against the model that consumes them.
# The order is preserved in case callers index into the list.
WRITER_STYLES = [404, 347, 156, 253, 354, 166, 320]
# Canonical visual element types. An earlier version of this list was
# ["stamp", "logo", "barcode", "photo", "chart"]; "chart" has since been
# replaced by "figure" and is handled as a synonym below.
VISUAL_ELEMENT_TYPES = ["stamp", "logo", "figure", "barcode", "photo"]

# Maps alternative labels onto one of the canonical types listed above.
VISUAL_ELEMENT_TYPE_SYNONYMS = {
    # Everything chart-like collapses into "figure".
    **dict.fromkeys(
        ("chart", "diagram", "plot", "graph", "illustration", "infographic"),
        "figure",
    ),
    "image": "photo",
    "seal": "stamp",
}