Spaces:
Running on Zero
Running on Zero
| # Rename this file to app_config.env and place it in the config/ folder (i.e. at app_base_folder/config/app_config.env). The app will then load these variables automatically at startup. See tools/config.py for all the config variables you can set, or src/app_settings.qmd for their descriptions. Below are some suggested config variables to start with. | |
| # General app run options | |
| TESSERACT_FOLDER=tesseract/ # Only needed if Tesseract is in a custom folder; not needed if it is on PATH | |
| POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/ # Only needed if Poppler is in a custom folder; not needed if it is on PATH | |
| GRADIO_SERVER_NAME=127.0.0.1 | |
| GRADIO_SERVER_PORT=7860 | |
| # Replace the <ENTER_URL> placeholder with a real URL before use | |
| USER_GUIDE_URL=<ENTER_URL> | |
| RUN_FASTAPI=False | |
| FAVICON_PATH=favicon.png | |
| INTRO_TEXT=intros/short_intro.txt | |
| # GUI options | |
| SHOW_QUICKSTART=False | |
| SHOW_SUMMARISATION=True | |
| SHOW_EXAMPLES=True | |
| SHOW_DIFFICULT_OCR_EXAMPLES=True | |
| SHOW_LANGUAGE_SELECTION=True | |
| # NOTE(review): SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS is assigned again (True) under "AWS Textract options" further down. If the loader keeps the last assignment, the value on the next line is overridden - confirm which value is intended. | |
| SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=False | |
| # NOTE(review): SHOW_COSTS is also assigned (same value) under "Cost code related variables" further down. | |
| SHOW_COSTS=True | |
| SHOW_LOCAL_OCR_MODEL_OPTIONS=True | |
| SHOW_ALL_OUTPUTS_IN_OUTPUT_FOLDER=True | |
| SHOW_PII_IDENTIFICATION_OPTIONS=True | |
| SHOW_LOCAL_PII_DETECTION_OPTIONS=True | |
| SHOW_OCR_GUI_OPTIONS=True | |
| EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT=True | |
| # Model / redaction process options | |
| DEFAULT_LOCAL_OCR_MODEL=tesseract | |
| PREPROCESS_LOCAL_OCR_IMAGES=False # Whether to apply corrections to input images before processing. Will slow down redaction processes | |
| MAX_WORKERS=4 # How many workers should run in parallel for the various text extraction/redaction tasks. Adjust depending on how many CPUs your computer has | |
| EFFICIENT_OCR=True | |
| # Assigned once only: this section previously also set this key to False earlier on, which contradicted the value below | |
| OVERWRITE_EXISTING_OCR_RESULTS=True | |
| INCLUDE_OCR_VISUALISATION_IN_OUTPUT_FILES=True | |
| # Redaction box appearance | |
| CUSTOM_BOX_COLOUR=(128, 128, 128) | |
| USE_GUI_BOX_COLOURS_FOR_OUTPUTS=False | |
| # Image save options | |
| SAVE_PAGE_OCR_VISUALISATIONS=True | |
| SAVE_PREPROCESS_IMAGES=True | |
| # Saving and logging variables | |
| SAVE_LOGS_TO_CSV=True | |
| SESSION_OUTPUT_FOLDER=True # Save outputs into user session folders | |
| DISPLAY_FILE_NAMES_IN_LOGS=False | |
| # PaddleOCR | |
| SHOW_PADDLE_MODEL_OPTIONS=False | |
| LOAD_PADDLE_AT_STARTUP=False | |
| PADDLE_MAX_WORKERS=4 # Number of simultaneous workers for Paddle OCR tasks. Generally advised to keep at 1, but 2 or more may work depending on your system. NOTE(review): the value set here (4) is above the advised 1 - confirm it suits your system | |
| # GUI show VLM/LLM models | |
| SHOW_HYBRID_MODELS=False | |
| SHOW_CUSTOM_VLM_ENTITIES=False | |
| SHOW_VLM_MODEL_OPTIONS=True | |
| SHOW_INFERENCE_SERVER_PII_OPTIONS=False | |
| SHOW_INFERENCE_SERVER_VLM_OPTIONS=False | |
| SHOW_TRANSFORMERS_LLM_PII_DETECTION_OPTIONS=False | |
| # VLM using Transformers options | |
| SELECTED_LOCAL_TRANSFORMERS_VLM_MODEL=Qwen3.5-9B | |
| QUANTISE_VLM_MODELS=False | |
| USE_TRANSFORMERS_VLM_MODEL_AS_LLM=True | |
| LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE=None | |
| QUANTISE_TRANSFORMERS_LLM_MODELS=False | |
| LOAD_TRANSFORMERS_LLM_PII_MODEL_AT_START=False | |
| LOAD_TRANSFORMERS_VLM_MODEL_AT_START=True | |
| # VLM using inference server options (vLLM / Llama.cpp server) | |
| # NOTE(review): the URL below is a private-network (192.168.x.x) address, presumably an example - replace it with the address of your own inference server | |
| INFERENCE_SERVER_API_URL=http://192.168.0.220:8080 | |
| USE_LLAMA_SWAP=True | |
| INFERENCE_SERVER_LLM_PII_MODEL_CHOICE=qwen_3_5_27b | |
| # General VLM / LLM options | |
| VLM_DISABLE_QWEN3_5_THINKING=True | |
| LLM_MAX_NEW_TOKENS=8192 | |
| CUSTOM_VLM_BACKEND=bedrock_vlm # Which model type to use to do face / signature detection. Can choose from "transformers_vlm", "inference_vlm", "bedrock_vlm" | |
| # AWS related variables | |
| RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions. You can remove all the environment variables in the following section if you don't want to use them | |
| AWS_REGION=example-region | |
| DOCUMENT_REDACTION_BUCKET=example-bucket | |
| SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True | |
| SHOW_AWS_PII_DETECTION_OPTIONS=True | |
| SHOW_AWS_EXAMPLES=True | |
| RUN_ALL_EXAMPLES_THROUGH_AWS=True | |
| SAVE_LOGS_TO_DYNAMODB=True | |
| ACCESS_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-access-log | |
| USAGE_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-usage | |
| FEEDBACK_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-feedback | |
| # AWS Textract options | |
| SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True | |
| LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True | |
| TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output | |
| INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION=False | |
| INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION=False | |
| INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION=False | |
| INCLUDE_FACE_IDENTIFICATION_TEXTRACT_OPTION=False # Needs a VLM option available to work | |
| # AWS VLM / LLM options | |
| SHOW_BEDROCK_VLM_MODELS=False | |
| SHOW_AWS_BEDROCK_LLM_MODELS=False | |
| HYBRID_TEXTRACT_BEDROCK_VLM=False | |
| CLOUD_LLM_PII_MODEL_CHOICE=amazon.nova-pro-v1:0 | |
| CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE=anthropic.claude-sonnet-4-6 # Alternative: amazon.nova-pro-v1:0 | |
| CLOUD_VLM_MODEL_CHOICE=amazon.nova-pro-v1:0 # Other possible values: anthropic.claude-sonnet-4-6, qwen.qwen3-vl-235b-a22b | |
| CLOUD_SUMMARISATION_MODEL_CHOICE=amazon.nova-lite-v1:0 | |
| # Cost code related variables | |
| SHOW_COSTS=True | |
| GET_COST_CODES=True | |
| COST_CODES_PATH=config/cost_codes.csv | |
| ENFORCE_COST_CODES=True | |
| DEFAULT_COST_CODE=example_cost_code | |
| # S3 cost codes | |
| S3_COST_CODES_PATH=cost_codes.csv |