icarus112's picture
Upload src/sem_v6/validation/constants.py with huggingface_hub
3c1b981 verified
"""
Validation module constants.
This module defines all magic numbers used across the validation system,
providing clear documentation and a single source of truth for tunable
parameters.
"""
# =============================================================================
# Text Quality Thresholds
# =============================================================================
# Lowest acceptable share of ASCII characters in a text output.
# Output whose ratio falls under this value is treated as corrupted
# or non-English.
MIN_ASCII_RATIO: float = 0.7

# Highest acceptable share of repeated n-grams in a text output.
# Output whose ratio exceeds this value is treated as too repetitive.
MAX_REPETITION_RATIO: float = 0.5

# Lowest acceptable average sample length, in characters.
# Anything shorter may point to model collapse or truncation problems.
MIN_SAMPLE_LENGTH: float = 5.0

# Grammar score that triggers quality alerts: validation callbacks
# emit warnings for scores under this value.
LOW_GRAMMAR_SCORE_THRESHOLD: float = 0.5

# Grammar score targeted for production-ready outputs; the aspirational
# quality level (>95% grammatically correct).
TARGET_GRAMMAR_SCORE: float = 0.95
# =============================================================================
# LanguageTool API Configuration
# =============================================================================
# Upper bound, in characters, on the text submitted to the LanguageTool API.
# Anything longer is truncated to avoid timeouts and rate-limit problems.
MAX_API_TEXT_LENGTH: int = 500

# Timeout, in seconds, applied to each LanguageTool API request so that a
# slow or unresponsive API cannot hang the caller.
API_TIMEOUT_SECONDS: float = 2.0

# Smallest allowed gap, in seconds, between consecutive API requests.
# Derived from the free-tier rate limit: 20 req/min -> one request per 3s.
API_MIN_INTERVAL: float = 3.0

# Timeout, in seconds, for the LanguageTool availability health check.
API_HEALTH_CHECK_TIMEOUT: float = 1.0
# =============================================================================
# Text Generation Parameters
# =============================================================================
# Token limit when generating validation samples.
# Kept short so validation stays fast (<1s overhead).
VALIDATION_MAX_LENGTH: int = 30

# Sampling temperature when generating validation samples.
# A moderate value that trades off diversity against coherence.
VALIDATION_TEMPERATURE: float = 0.8

# Token limit for knowledge-validation answers.
# Knowledge questions are expected to have concise answers.
KNOWLEDGE_MAX_LENGTH: int = 15

# Sampling temperature for knowledge validation.
# Lower than the general validation temperature so fact-based answers
# are more deterministic.
KNOWLEDGE_TEMPERATURE: float = 0.5
# =============================================================================
# History and Caching
# =============================================================================
# Capacity of the sample-output history (deque size).
# The cap keeps memory usage from growing without bound.
SAMPLE_HISTORY_SIZE: int = 20

# Capacity of the grammar-score history used for trend analysis.
# A larger window allows longer-term trends to be detected.
GRAMMAR_HISTORY_SIZE: int = 50

# Capacity of the timestamp history used for rate limiting.
TIMESTAMP_HISTORY_SIZE: int = 50

# Default trend-analysis window: how many of the most recent scores
# are considered.
TREND_ANALYSIS_WINDOW: int = 5
# =============================================================================
# Validation Callback Configuration
# =============================================================================
# Default interval, in steps, between fast validation runs.
FAST_VALIDATION_FREQUENCY: int = 100

# Default interval, in steps, between grammar validation runs.
GRAMMAR_VALIDATION_FREQUENCY: int = 200

# Upper bound on how many samples are shown in TensorBoard logs.
MAX_TENSORBOARD_SAMPLES: int = 3
# =============================================================================
# N-gram Analysis
# =============================================================================
# Size of the n-grams used to detect repetition.
# 3-grams trade off sensitivity to repetition against false positives.
NGRAM_SIZE: int = 3

# Shortest text, in characters, that is run through n-gram analysis.
# Texts below this length get the fallback repetition score instead.
MIN_NGRAM_TEXT_LENGTH: int = 10

# Repetition score assigned to texts too short for n-gram analysis.
# Conservative choice: short texts are assumed to be maximally repetitive.
FALLBACK_REPETITION_SCORE: float = 1.0
# =============================================================================
# Fallback/Error Values
# =============================================================================
# Grammar score reported when the API is unavailable.
# Deliberately 0.0 (the worst score) so that quality gates fail visibly
# whenever LanguageTool cannot be reached.
FALLBACK_GRAMMAR_SCORE: float = 0.0

# Error count reported for validations that failed.
FALLBACK_ERROR_COUNT: int = 999

# Number of characters kept when truncating text for error logging.
ERROR_LOG_TRUNCATE_LENGTH: int = 50