Upload src/sem_v6/validation/constants.py with huggingface_hub

3c1b981 verified 4 months ago

4.88 kB

	"""
	Validation module constants.

	This module defines all magic numbers used across the validation system,
	providing clear documentation and a single source of truth for tunable
	parameters.
	"""

	# =============================================================================
	# Text Quality Thresholds
	# =============================================================================

	# Lower bound on the fraction of ASCII characters in text output.
	# Output falling under this ratio is treated as corrupted or non-English.
	MIN_ASCII_RATIO: float = 0.7

	# Upper bound on the fraction of repeated n-grams in text.
	# Output exceeding this ratio is treated as too repetitive.
	MAX_REPETITION_RATIO: float = 0.5

	# Lower bound on the average sample length, measured in characters.
	# Shorter outputs can point to model collapse or truncation problems.
	MIN_SAMPLE_LENGTH: float = 5.0

	# Grammar score used as the quality-alert cutoff.
	# Validation callbacks emit warnings for scores under this value.
	LOW_GRAMMAR_SCORE_THRESHOLD: float = 0.5

	# Grammar score aimed for in production-ready outputs.
	# Aspirational level: more than 95% grammatically correct.
	TARGET_GRAMMAR_SCORE: float = 0.95

	# =============================================================================
	# LanguageTool API Configuration
	# =============================================================================

	# Cap on the number of characters of text sent to the LanguageTool API.
	# Anything longer is truncated to avoid timeouts and rate-limit problems.
	MAX_API_TEXT_LENGTH: int = 500

	# How long (seconds) a LanguageTool API call may take before timing out.
	# Guards against hanging on a slow or unresponsive API.
	API_TIMEOUT_SECONDS: float = 2.0

	# Smallest allowed gap (seconds) between consecutive API requests.
	# Free tier rate limit is 20 requests per minute: 60 s / 20 = 3 s interval.
	API_MIN_INTERVAL: float = 60.0 / 20

	# Timeout (seconds) for the LanguageTool availability health check.
	API_HEALTH_CHECK_TIMEOUT: float = 1.0

	# =============================================================================
	# Text Generation Parameters
	# =============================================================================

	# Token cap for samples generated during validation.
	# Generations are kept short so validation stays fast (<1s overhead).
	VALIDATION_MAX_LENGTH: int = 30

	# Sampling temperature for validation samples.
	# A moderate value trades off diversity against coherence.
	VALIDATION_TEMPERATURE: float = 0.8

	# Token cap for answers generated during knowledge validation.
	# Knowledge questions are expected to have concise answers.
	KNOWLEDGE_MAX_LENGTH: int = 15

	# Sampling temperature for knowledge validation.
	# Lower than the validation temperature so fact-based answers are more
	# deterministic.
	KNOWLEDGE_TEMPERATURE: float = 0.5

	# =============================================================================
	# History and Caching
	# =============================================================================

	# Capacity of the sample-output history (used as the deque size).
	# A fixed cap keeps memory use from growing without limit.
	SAMPLE_HISTORY_SIZE: int = 20

	# Capacity of the grammar-score history kept for trend analysis.
	# A larger window allows longer-term trends to be detected.
	GRAMMAR_HISTORY_SIZE: int = 50

	# Capacity of the timestamp history kept for rate limiting.
	TIMESTAMP_HISTORY_SIZE: int = 50

	# Default number of most recent scores examined by trend analysis.
	TREND_ANALYSIS_WINDOW: int = 5

	# =============================================================================
	# Validation Callback Configuration
	# =============================================================================

	# Default cadence of fast validation: once every N steps.
	FAST_VALIDATION_FREQUENCY: int = 100

	# Default cadence of grammar validation: once every N steps.
	GRAMMAR_VALIDATION_FREQUENCY: int = 200

	# Cap on how many samples are displayed in TensorBoard logs.
	MAX_TENSORBOARD_SAMPLES: int = 3

	# =============================================================================
	# N-gram Analysis
	# =============================================================================

	# Size of the n-grams used to detect repetition.
	# 3-grams trade off sensitivity to repetition against false positives.
	NGRAM_SIZE: int = 3

	# Shortest text, in characters, on which n-gram analysis is run.
	# Anything shorter receives the fallback repetition score instead.
	MIN_NGRAM_TEXT_LENGTH: int = 10

	# Repetition score assigned to texts too short to analyse.
	# Conservative choice: short texts are assumed maximally repetitive.
	FALLBACK_REPETITION_SCORE: float = 1.0

	# =============================================================================
	# Fallback/Error Values
	# =============================================================================

	# Grammar score reported when the API is unavailable.
	# Deliberately 0.0 (the worst score) so that quality gates register a
	# failure whenever LanguageTool cannot be reached.
	FALLBACK_GRAMMAR_SCORE: float = 0.0

	# Error count reported for validations that failed.
	FALLBACK_ERROR_COUNT: int = 999

	# Number of characters kept when truncating text for error logging.
	ERROR_LOG_TRUNCATE_LENGTH: int = 50