"""
Validation module constants.

This module defines all magic numbers used across the validation system,
providing clear documentation and a single source of truth for tunable
parameters.
"""

# =============================================================================
# Text Quality Thresholds
# =============================================================================

# Minimum ratio of ASCII characters for valid text output.
# Below this threshold, output is considered corrupted or non-English.
MIN_ASCII_RATIO: float = 0.7

# Maximum ratio of repeated n-grams in text.
# Above this threshold, output is considered too repetitive.
MAX_REPETITION_RATIO: float = 0.5

# Minimum average sample length (characters) for valid output.
# Shorter outputs may indicate model collapse or truncation issues.
MIN_SAMPLE_LENGTH: float = 5.0

# Grammar score threshold for quality alerts.
# Scores below this trigger warnings in validation callbacks.
LOW_GRAMMAR_SCORE_THRESHOLD: float = 0.5

# Target grammar score for production-ready outputs.
# This is the aspirational quality level (>95% grammatically correct).
TARGET_GRAMMAR_SCORE: float = 0.95

# =============================================================================
# LanguageTool API Configuration
# =============================================================================

# Maximum text length sent to LanguageTool API (characters).
# Longer texts are truncated to prevent timeouts and rate limit issues.
MAX_API_TEXT_LENGTH: int = 500

# Request timeout for LanguageTool API calls (seconds).
# Prevents hanging on slow/unresponsive API.
API_TIMEOUT_SECONDS: float = 2.0

# Minimum interval between API requests (seconds).
# Rate limiting for free tier: 20 req/min = 3s interval.
API_MIN_INTERVAL: float = 3.0

# Health check timeout for LanguageTool availability (seconds).
API_HEALTH_CHECK_TIMEOUT: float = 1.0

# =============================================================================
# Text Generation Parameters
# =============================================================================

# Maximum length for validation sample generation (tokens).
# Short generations for fast validation (<1s overhead).
VALIDATION_MAX_LENGTH: int = 30

# Temperature for validation sample generation.
# Moderate temperature balances diversity and coherence.
VALIDATION_TEMPERATURE: float = 0.8

# Maximum length for knowledge validation answers (tokens).
# Knowledge questions expect concise answers.
KNOWLEDGE_MAX_LENGTH: int = 15

# Temperature for knowledge validation (more deterministic).
# Lower temperature for fact-based answers.
KNOWLEDGE_TEMPERATURE: float = 0.5

# =============================================================================
# History and Caching
# =============================================================================

# Maximum number of sample outputs stored in history (deque size).
# Bounded to prevent unbounded memory growth.
SAMPLE_HISTORY_SIZE: int = 20

# Maximum number of grammar scores stored for trend analysis.
# Larger window enables longer-term trend detection.
GRAMMAR_HISTORY_SIZE: int = 50

# Maximum number of timestamps stored for rate limiting.
TIMESTAMP_HISTORY_SIZE: int = 50

# Default window size for trend analysis (number of recent scores).
TREND_ANALYSIS_WINDOW: int = 5

# =============================================================================
# Validation Callback Configuration
# =============================================================================

# Default frequency for fast validation (every N steps).
FAST_VALIDATION_FREQUENCY: int = 100

# Default frequency for grammar validation (every N steps).
GRAMMAR_VALIDATION_FREQUENCY: int = 200

# Maximum number of samples to display in TensorBoard logs.
MAX_TENSORBOARD_SAMPLES: int = 3

# =============================================================================
# N-gram Analysis
# =============================================================================

# N-gram size for repetition detection.
# 3-grams balance sensitivity to repetition vs. false positives.
NGRAM_SIZE: int = 3

# Minimum text length for n-gram analysis (characters).
# Shorter texts use fallback repetition score.
MIN_NGRAM_TEXT_LENGTH: int = 10

# Fallback repetition score for too-short texts.
# Assumes short texts are maximally repetitive (conservative).
FALLBACK_REPETITION_SCORE: float = 1.0

# =============================================================================
# Fallback/Error Values
# =============================================================================

# Fallback grammar score when API is unavailable.
# Set to 0.0 (worst) to signal quality gate failures when LanguageTool
# is unavailable.
FALLBACK_GRAMMAR_SCORE: float = 0.0

# Fallback error count for failed validations.
FALLBACK_ERROR_COUNT: int = 999

# Truncation length for error logging (characters).
ERROR_LOG_TRUNCATE_LENGTH: int = 50