File size: 1,796 Bytes
eb53bb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""

Configuration settings for the document text extraction system.

"""

import os
from pathlib import Path


class Config:
    """Class-level settings for the document text extraction system.

    Every setting is a plain class attribute, so values can be read
    directly from the class (for example ``Config.DATA_DIR``) without
    creating an instance.
    """

    # ---- Filesystem layout ----
    # The project root is two directory levels above this file.
    PROJECT_ROOT = Path(__file__).parent.parent
    DATA_DIR = PROJECT_ROOT.joinpath("data")
    MODELS_DIR = PROJECT_ROOT.joinpath("models")
    RESULTS_DIR = PROJECT_ROOT.joinpath("results")

    # Sub-folders of the data directory.
    RAW_DATA_DIR = DATA_DIR.joinpath("raw")
    PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")

    # ---- Model ----
    DEFAULT_MODEL_NAME = "distilbert-base-uncased"
    DEFAULT_MODEL_PATH = MODELS_DIR.joinpath("document_ner_model")

    # ---- Training defaults ----
    DEFAULT_BATCH_SIZE = 16
    DEFAULT_LEARNING_RATE = 2e-5
    DEFAULT_NUM_EPOCHS = 3
    DEFAULT_MAX_LENGTH = 512

    # ---- OCR ----
    # Taken from the TESSERACT_PATH environment variable when the class is
    # defined; None when the variable is not set.
    TESSERACT_PATH = os.environ.get("TESSERACT_PATH")

    # ---- API ----
    API_HOST = "0.0.0.0"
    API_PORT = 8000

    # ---- Entity labels ----
    # 'O' first, then a B-/I- pair for each entity type.
    ENTITY_LABELS = [
        "O",
        "B-NAME", "I-NAME",
        "B-DATE", "I-DATE",
        "B-INVOICE_NO", "I-INVOICE_NO",
        "B-AMOUNT", "I-AMOUNT",
        "B-ADDRESS", "I-ADDRESS",
        "B-PHONE", "I-PHONE",
        "B-EMAIL", "I-EMAIL",
    ]

    # ---- Supported file formats (extensions, including the leading dot) ----
    SUPPORTED_FORMATS = [
        ".pdf",
        ".docx",
        ".png",
        ".jpg",
        ".jpeg",
        ".tiff",
        ".bmp",
    ]

    @classmethod
    def create_directories(cls):
        """Create the data, model and result folders if they are missing.

        Covers the data directory with its raw/processed sub-folders, the
        models directory, and the results directory with its ``plots`` and
        ``metrics`` sub-folders. Missing parents are created and existing
        directories are left alone, so repeated calls are harmless.
        """
        results_root = cls.RESULTS_DIR
        targets = (
            cls.DATA_DIR,
            cls.RAW_DATA_DIR,
            cls.PROCESSED_DATA_DIR,
            cls.MODELS_DIR,
            results_root,
            results_root / "plots",
            results_root / "metrics",
        )

        for target in targets:
            target.mkdir(parents=True, exist_ok=True)