File size: 6,729 Bytes
dc4e6da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
Configuration settings for DocGenie API
"""
import os
from typing import Optional, List


# Spellings (compared case-insensitively, surrounding whitespace ignored)
# that switch a boolean flag on.
_TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_bool(name: str, default: bool = False) -> bool:
    """Read environment variable *name* as a boolean flag.

    Returns *default* when the variable is unset. When it is set, the flag is
    True only for one of ``_TRUTHY_ENV_VALUES``; any other value, including
    the empty string, is False.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in _TRUTHY_ENV_VALUES


def _env_number(name: str, default, cast):
    """Read environment variable *name* and convert it with *cast*.

    Returns *default* when the variable is unset or blank, so an empty value
    coming from a templated env file does not abort the import.

    Raises:
        ValueError: if the value is present but *cast* rejects it. The
            message names the variable so the misconfiguration is easy to find.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return cast(raw.strip())
    except ValueError as exc:
        raise ValueError(
            f"Environment variable {name} must be a valid {cast.__name__}, got {raw!r}"
        ) from exc


def _env_int(name: str, default: int) -> int:
    """Read environment variable *name* as an ``int`` (see ``_env_number``)."""
    return _env_number(name, default, int)


def _env_float(name: str, default: float) -> float:
    """Read environment variable *name* as a ``float`` (see ``_env_number``)."""
    return _env_number(name, default, float)


class Settings:
    """API configuration settings.

    Every setting is a class attribute whose value is read from the
    environment once, when this class body executes (i.e. at import time).
    Later changes to ``os.environ`` are not picked up.
    """

    # ==================== LLM Configuration ====================
    ANTHROPIC_API_KEY: str = os.getenv("ANTHROPIC_API_KEY", "")
    CLAUDE_MODEL: str = os.getenv("CLAUDE_MODEL", "claude-sonnet-4-5-20250929")
    # Backward compatibility: LLM_MODEL falls back to CLAUDE_MODEL when unset.
    LLM_MODEL: str = os.getenv("LLM_MODEL", CLAUDE_MODEL)

    # ==================== Handwriting Service (Stage 3) ====================
    HANDWRITING_SERVICE_URL: str = os.getenv(
        "HANDWRITING_SERVICE_URL",
        "http://localhost:8080",
    )
    RUNPOD_API_KEY: str = os.getenv("RUNPOD_API_KEY", "")
    HANDWRITING_SERVICE_TIMEOUT: int = _env_int("HANDWRITING_SERVICE_TIMEOUT", 300)
    HANDWRITING_SERVICE_MAX_RETRIES: int = _env_int("HANDWRITING_SERVICE_MAX_RETRIES", 3)
    HANDWRITING_SERVICE_ENABLED: bool = _env_bool("HANDWRITING_SERVICE_ENABLED", False)
    HANDWRITING_SERVICE_SUPPORTS_BATCH: bool = _env_bool("HANDWRITING_SERVICE_SUPPORTS_BATCH", True)

    # ==================== OCR Service (Stage 4) ====================
    OCR_SERVICE_URL: str = os.getenv("OCR_SERVICE_URL", "http://localhost:8000")
    OCR_SERVICE_TIMEOUT: int = _env_int("OCR_SERVICE_TIMEOUT", 30)
    OCR_SERVICE_ENABLED: bool = _env_bool("OCR_SERVICE_ENABLED", False)
    OCR_ENGINE: str = os.getenv("OCR_ENGINE", "microsoft_di")
    OCR_DPI: int = _env_int("OCR_DPI", 300)  # DPI for PDF to image conversion

    # Local Tesseract OCR (alternative to remote service)
    OCR_USE_LOCAL: bool = _env_bool("OCR_USE_LOCAL", False)
    OCR_TESSERACT_LANG: str = os.getenv("OCR_TESSERACT_LANG", "eng")  # Tesseract language
    OCR_TESSERACT_CONFIG: str = os.getenv("OCR_TESSERACT_CONFIG", "--psm 3")  # Tesseract config

    # ==================== Stage 5: Dataset Packaging ====================
    # Stage 16: BBox normalization
    BBOX_NORMALIZATION_ENABLED: bool = _env_bool("BBOX_NORMALIZATION_ENABLED", False)
    BBOX_NORMALIZATION_SCALE: str = os.getenv("BBOX_NORMALIZATION_SCALE", "0-1")  # "0-1" or "0-1000"

    # Stage 17: GT verification
    GT_VERIFICATION_ENABLED: bool = _env_bool("GT_VERIFICATION_ENABLED", False)
    GT_VERIFICATION_SIMILARITY_CUTOFF: float = _env_float("GT_VERIFICATION_SIMILARITY_CUTOFF", 0.8)
    GT_VERIFICATION_OVERLAP_THRESHOLD: float = _env_float("GT_VERIFICATION_OVERLAP_THRESHOLD", 0.5)

    # Stage 18: Analysis
    ANALYSIS_ENABLED: bool = _env_bool("ANALYSIS_ENABLED", False)
    ANALYSIS_MIN_ANNOTATION_COUNT: int = _env_int("ANALYSIS_MIN_ANNOTATION_COUNT", 1)

    # Stage 19: Debug visualization
    DEBUG_VISUALIZATION_ENABLED: bool = _env_bool("DEBUG_VISUALIZATION_ENABLED", False)
    DEBUG_SHOW_TEXT_IN_BBOX: bool = _env_bool("DEBUG_SHOW_TEXT_IN_BBOX", True)
    DEBUG_BBOX_COLOR_RGB: str = os.getenv("DEBUG_BBOX_COLOR_RGB", "255,0,0")  # Red default

    # Dataset export
    DATASET_EXPORT_ENABLED: bool = _env_bool("DATASET_EXPORT_ENABLED", False)
    DATASET_EXPORT_FORMAT: str = os.getenv("DATASET_EXPORT_FORMAT", "msgpack")  # msgpack, coco, huggingface
    DATASET_EXPORT_DIR: str = os.getenv("DATASET_EXPORT_DIR", "/tmp/docgenie_datasets")
    DATASET_RESIZE_IMAGES: bool = _env_bool("DATASET_RESIZE_IMAGES", False)
    DATASET_CLIP_BBOXES_TO_FOREGROUND: bool = _env_bool("DATASET_CLIP_BBOXES_TO_FOREGROUND", False)

    # ==================== API Server Configuration ====================
    API_HOST: str = os.getenv("API_HOST", "0.0.0.0")
    API_PORT: int = _env_int("API_PORT", 8000)
    DEBUG_MODE: bool = _env_bool("DEBUG_MODE", False)

    # ==================== CORS Configuration ====================
    # Comma-separated list; blank entries are dropped, and an effectively
    # empty value falls back to the wildcard.
    CORS_ORIGINS: List[str] = [
        origin.strip()
        for origin in os.getenv("CORS_ORIGINS", "*").split(",")
        if origin.strip()
    ] or ["*"]

    # ==================== File Storage ====================
    TEMP_DIR: str = os.getenv("TEMP_DIR", "/tmp/docgenie_api")

    # ==================== Logging ====================
    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")

    # ==================== Database (Optional) ====================
    DATABASE_URL: Optional[str] = os.getenv("DATABASE_URL")
    REDIS_URL: Optional[str] = os.getenv("REDIS_URL", "redis://localhost:6379/0")

    # ==================== Supabase ====================
    SUPABASE_URL: str = os.getenv("SUPABASE_URL", "")
    SUPABASE_KEY: str = os.getenv("SUPABASE_KEY", "")

    # ==================== Background Jobs ====================
    RQ_QUEUE_NAME: str = os.getenv("RQ_QUEUE_NAME", "docgenie")
    BATCH_POLL_INTERVAL: int = _env_int("BATCH_POLL_INTERVAL", 30)  # seconds
    BATCH_PROMPT_CHUNK_SIZE: int = _env_int("BATCH_PROMPT_CHUNK_SIZE", 4)  # documents per prompt
    BATCH_DATA_DIR: str = os.getenv("BATCH_DATA_DIR", "/tmp/docgenie_batches")
    MESSAGE_DATA_DIR: str = os.getenv("MESSAGE_DATA_DIR", "/tmp/docgenie_messages")

    # ==================== Google Drive ====================
    GOOGLE_DRIVE_FOLDER_NAME: str = os.getenv("GOOGLE_DRIVE_FOLDER_NAME", "DocGenie Documents")
    GOOGLE_CLIENT_ID: Optional[str] = os.getenv("GOOGLE_CLIENT_ID")  # For token refresh only
    GOOGLE_CLIENT_SECRET: Optional[str] = os.getenv("GOOGLE_CLIENT_SECRET")  # For token refresh only

    # ==================== Monitoring ====================
    SENTRY_DSN: Optional[str] = os.getenv("SENTRY_DSN")
    ENABLE_METRICS: bool = _env_bool("ENABLE_METRICS", False)
    METRICS_PORT: int = _env_int("METRICS_PORT", 9090)

    # ==================== AWS (Optional) ====================
    AWS_ACCESS_KEY_ID: Optional[str] = os.getenv("AWS_ACCESS_KEY_ID")
    AWS_SECRET_ACCESS_KEY: Optional[str] = os.getenv("AWS_SECRET_ACCESS_KEY")
    AWS_REGION: str = os.getenv("AWS_REGION", "us-east-1")
    S3_BUCKET: Optional[str] = os.getenv("S3_BUCKET")

    @classmethod
    def validate(cls) -> bool:
        """Validate required settings.

        Returns:
            True when all required settings are present.

        Raises:
            ValueError: if ``ANTHROPIC_API_KEY`` is empty.
        """
        if not cls.ANTHROPIC_API_KEY:
            raise ValueError("ANTHROPIC_API_KEY environment variable is required")
        return True

    @classmethod
    def get_cors_origins(cls) -> List[str]:
        """Return the configured CORS origins.

        A fresh list is returned on every call so that callers mutating the
        result cannot alter the shared class-level ``CORS_ORIGINS``.
        """
        return list(cls.CORS_ORIGINS)


# Shared module-level instance. Every setting is a class attribute evaluated
# when the Settings class body runs, so this instance simply exposes those
# same values through attribute lookup.
settings = Settings()