# Listing metadata (from the file viewer): file size 1,796 bytes, revision eb53bb5.
"""
Configuration settings for the document text extraction system.
"""
import os
from pathlib import Path
class Config:
    """Global configuration settings.

    All settings are plain class attributes, so they are read as
    ``Config.NAME`` without instantiating the class. The directory
    attributes are ``pathlib.Path`` objects derived from ``PROJECT_ROOT``;
    nothing is created on disk until ``create_directories()`` is called.
    """

    # Project paths
    # Resolve to an absolute path first: ``Path.parent`` is purely lexical, so
    # a relative ``__file__`` such as "config.py" would otherwise collapse to
    # "." rather than the directory above this file's package.
    PROJECT_ROOT = Path(__file__).resolve().parent.parent
    DATA_DIR = PROJECT_ROOT / "data"
    MODELS_DIR = PROJECT_ROOT / "models"
    RESULTS_DIR = PROJECT_ROOT / "results"

    # Data paths
    RAW_DATA_DIR = DATA_DIR / "raw"
    PROCESSED_DATA_DIR = DATA_DIR / "processed"

    # Model settings
    DEFAULT_MODEL_NAME = "distilbert-base-uncased"
    DEFAULT_MODEL_PATH = MODELS_DIR / "document_ner_model"

    # Training settings
    DEFAULT_BATCH_SIZE = 16
    DEFAULT_LEARNING_RATE = 2e-5
    DEFAULT_NUM_EPOCHS = 3
    # NOTE(review): presumably the maximum sequence length in tokens -- confirm
    # against the code that consumes it.
    DEFAULT_MAX_LENGTH = 512

    # OCR settings
    # Read once, at import time. An unset *or empty* variable yields None so
    # that consumers never receive "" as an executable path.
    TESSERACT_PATH = os.getenv("TESSERACT_PATH") or None

    # API settings
    # NOTE(review): presumably used as the server bind address; as a bind
    # address "0.0.0.0" means every IPv4 interface -- confirm this exposure is
    # intended for the deployment.
    API_HOST = "0.0.0.0"
    API_PORT = 8000

    # Entity labels: the outside tag 'O' plus a B-/I- pair per entity type.
    ENTITY_LABELS = [
        'O', 'B-NAME', 'I-NAME', 'B-DATE', 'I-DATE',
        'B-INVOICE_NO', 'I-INVOICE_NO', 'B-AMOUNT', 'I-AMOUNT',
        'B-ADDRESS', 'I-ADDRESS', 'B-PHONE', 'I-PHONE',
        'B-EMAIL', 'I-EMAIL',
    ]

    # Supported file formats (lower-case extensions, including the leading dot)
    SUPPORTED_FORMATS = ['.pdf', '.docx', '.png', '.jpg', '.jpeg', '.tiff', '.bmp']

    @classmethod
    def create_directories(cls) -> None:
        """Create the data, model and result directories.

        Creates ``DATA_DIR``, ``RAW_DATA_DIR``, ``PROCESSED_DATA_DIR``,
        ``MODELS_DIR``, ``RESULTS_DIR`` and the ``plots`` and ``metrics``
        sub-directories of ``RESULTS_DIR``. Missing parent directories are
        created as well, and directories that already exist are left as they
        are, so the method can be called repeatedly.

        The paths are looked up on ``cls``, so a subclass that overrides the
        directory attributes gets its own directories created.

        Raises:
            OSError: If a directory cannot be created, for example because of
                missing permissions or because a non-directory file already
                occupies the path.
        """
        directories = [
            cls.DATA_DIR,
            cls.RAW_DATA_DIR,
            cls.PROCESSED_DATA_DIR,
            cls.MODELS_DIR,
            cls.RESULTS_DIR,
            cls.RESULTS_DIR / "plots",
            cls.RESULTS_DIR / "metrics",
        ]
        for directory in directories:
            directory.mkdir(parents=True, exist_ok=True)