| """
|
| Validation functions for data models and user input
|
| """
|
|
|
| from typing import Dict, List, Tuple, Any, Optional
|
| import re
|
| from datetime import datetime
|
| from corpus_collection_engine.models.data_models import UserContribution, CorpusEntry, ActivitySession, ActivityType
|
| from corpus_collection_engine.config import VALIDATION_CONFIG, SUPPORTED_LANGUAGES
|
|
|
|
|
class ValidationError(Exception):
    """Exception type used to report validation failures."""
|
|
|
|
|
class DataValidator:
    """Validators for the corpus data models and for raw user input.

    Field-level validators return ``(is_valid, message)``. Model-level
    validators return ``(is_valid, errors)`` and collect every problem they
    find instead of stopping at the first one.
    """

    @staticmethod
    def validate_text_content(
        text: str,
        min_length: Optional[int] = None,
        max_length: Optional[int] = None,
    ) -> Tuple[bool, str]:
        """Validate the length and basic shape of a piece of text.

        Args:
            text: The text to check. Surrounding whitespace is ignored.
            min_length: Minimum allowed length; when ``None`` the value of
                ``VALIDATION_CONFIG['min_text_length']`` is used.
            max_length: Maximum allowed length; when ``None`` the value of
                ``VALIDATION_CONFIG['max_text_length']`` is used.

        Returns:
            ``(True, message)`` when the text is acceptable, otherwise
            ``(False, reason)``.
        """
        if not text:
            return False, "Text content cannot be empty"
        if not isinstance(text, str):
            # Report a failure instead of crashing in .strip() / re.search().
            return False, "Text content must be a string"

        stripped = text.strip()
        if not stripped:
            return False, "Text content cannot be empty"

        # Compare against None so an explicit 0 is honoured rather than being
        # silently replaced by the configured default.
        min_len = VALIDATION_CONFIG['min_text_length'] if min_length is None else min_length
        max_len = VALIDATION_CONFIG['max_text_length'] if max_length is None else max_length

        if len(stripped) < min_len:
            return False, f"Text must be at least {min_len} characters long"

        if len(stripped) > max_len:
            return False, f"Text must not exceed {max_len} characters"

        # One character followed by ten or more repeats of itself.
        if re.search(r'(.)\1{10,}', stripped):
            return False, "Text contains suspicious repeated patterns"

        return True, "Valid text content"

    @staticmethod
    def validate_language_code(language: str) -> Tuple[bool, str]:
        """Check that *language* is a key of ``SUPPORTED_LANGUAGES``.

        Returns:
            ``(True, message)`` naming the matching ``SUPPORTED_LANGUAGES``
            entry, or ``(False, reason)`` for an empty or unknown code.
        """
        if not language:
            return False, "Language code cannot be empty"

        if language not in SUPPORTED_LANGUAGES:
            return False, f"Unsupported language code: {language}"

        return True, f"Valid language: {SUPPORTED_LANGUAGES[language]}"

    @staticmethod
    def validate_image_data(image_data: bytes, max_size: Optional[int] = None) -> Tuple[bool, str]:
        """Validate the size and leading signature bytes of an image payload.

        Args:
            image_data: Raw image bytes.
            max_size: Maximum allowed size in bytes; when ``None`` the value of
                ``VALIDATION_CONFIG['max_image_size']`` is used.

        Returns:
            ``(True, message)`` for a JPEG, PNG or WEBP payload within the size
            limit, otherwise ``(False, reason)``.
        """
        if not image_data:
            return False, "Image data cannot be empty"

        # Compare against None so an explicit 0 is not replaced by the default.
        limit = VALIDATION_CONFIG['max_image_size'] if max_size is None else max_size

        size = len(image_data)
        if size > limit:
            bytes_per_mb = 1024 * 1024
            return False, (
                f"Image size ({size / bytes_per_mb:.1f}MB) "
                f"exceeds maximum ({limit / bytes_per_mb:.1f}MB)"
            )

        jpeg_png_signatures = (b'\xff\xd8\xff', b'\x89PNG\r\n\x1a\n')
        # "RIFF" alone also matches WAV/AVI files; a WEBP file additionally
        # carries the form type "WEBP" at bytes 8-12.
        is_webp = image_data[:4] == b'RIFF' and image_data[8:12] == b'WEBP'

        if not (is_webp or image_data.startswith(jpeg_png_signatures)):
            return False, "Invalid image format. Supported: JPEG, PNG, WEBP"

        return True, "Valid image data"

    @staticmethod
    def validate_cultural_context(context: Dict[str, Any]) -> Tuple[bool, str]:
        """Validate the cultural-context dictionary of a contribution.

        The keys ``'region'`` and ``'cultural_significance'`` must be present.
        A non-empty region must be a string of at least two characters once
        surrounding whitespace is removed.

        Returns:
            ``(True, message)`` when valid, otherwise ``(False, reason)``.
        """
        if not isinstance(context, dict):
            return False, "Cultural context must be a dictionary"

        required_fields = ['region', 'cultural_significance']
        missing_fields = [name for name in required_fields if name not in context]
        if missing_fields:
            return False, f"Missing required cultural context fields: {missing_fields}"

        region = context['region']
        # NOTE: an empty/falsy region is accepted here; only non-empty values
        # are checked further.
        if region:
            if not isinstance(region, str):
                # Report a failure instead of crashing in .strip().
                return False, "Region must be a string"
            if len(region.strip()) < 2:
                return False, "Region must be at least 2 characters long"

        return True, "Valid cultural context"

    @staticmethod
    def _is_in_future(moment: datetime) -> bool:
        """Return True when *moment* is later than the current time.

        "Now" is created with the same ``tzinfo`` as *moment*, so naive values
        are compared with naive local time and timezone-aware values with an
        aware "now"; mixing the two would raise ``TypeError``.
        """
        return moment > datetime.now(moment.tzinfo)

    @classmethod
    def validate_user_contribution(cls, contribution: UserContribution) -> Tuple[bool, List[str]]:
        """Run every check that applies to a ``UserContribution``.

        Checks the session id, activity type, language code, the
        activity-specific content, the cultural context and the timestamp.

        Returns:
            ``(is_valid, errors)`` where ``errors`` lists all failures found.
        """
        errors: List[str] = []

        if not contribution.user_session:
            errors.append("User session ID is required")

        if not isinstance(contribution.activity_type, ActivityType):
            errors.append("Invalid activity type")

        is_valid_lang, lang_msg = cls.validate_language_code(contribution.language)
        if not is_valid_lang:
            errors.append(lang_msg)

        errors.extend(
            cls._validate_activity_content(
                contribution.activity_type,
                contribution.content_data,
            )
        )

        is_valid_context, context_msg = cls.validate_cultural_context(contribution.cultural_context)
        if not is_valid_context:
            errors.append(context_msg)

        if cls._is_in_future(contribution.timestamp):
            errors.append("Timestamp cannot be in the future")

        return not errors, errors

    @classmethod
    def _validate_text_field(
        cls,
        content_data: Dict[str, Any],
        field: str,
        label: str,
        min_length: Optional[int] = None,
    ) -> List[str]:
        """Validate one required text field of an activity's content.

        Returns a list holding a single error message when *field* is missing
        from *content_data* or fails ``validate_text_content``; otherwise an
        empty list.
        """
        if field not in content_data:
            return [f"{label} content must include {field}"]

        is_valid, msg = cls.validate_text_content(content_data[field], min_length=min_length)
        return [] if is_valid else [f"{label} {field}: {msg}"]

    @classmethod
    def _validate_activity_content(cls, activity_type: ActivityType, content_data: Dict[str, Any]) -> List[str]:
        """Validate the content fields required by a given activity type.

        Activity types other than MEME, RECIPE, FOLKLORE and LANDMARK are not
        checked and yield an empty list.

        Returns:
            A list of error messages; empty when nothing is wrong.
        """
        if activity_type == ActivityType.MEME:
            return cls._validate_text_field(content_data, 'text', 'Meme')

        if activity_type == ActivityType.RECIPE:
            errors: List[str] = []
            for field in ('title', 'ingredients', 'instructions'):
                if field not in content_data:
                    errors.append(f"Recipe content must include {field}")
                elif not content_data[field]:
                    errors.append(f"Recipe {field} cannot be empty")
            return errors

        if activity_type == ActivityType.FOLKLORE:
            # Stories have a fixed minimum length of 50 characters.
            return cls._validate_text_field(content_data, 'story', 'Folklore', min_length=50)

        if activity_type == ActivityType.LANDMARK:
            return cls._validate_text_field(content_data, 'description', 'Landmark')

        return []

    @classmethod
    def validate_corpus_entry(cls, entry: CorpusEntry) -> Tuple[bool, List[str]]:
        """Run every check that applies to a ``CorpusEntry``.

        Requires a contribution id and at least one of text or image content,
        validates whichever content is present, the language code, and that
        the quality score lies within ``[0.0, 1.0]``.

        Returns:
            ``(is_valid, errors)`` where ``errors`` lists all failures found.
        """
        errors: List[str] = []

        if not entry.contribution_id:
            errors.append("Contribution ID is required")

        if not entry.text_content and not entry.image_content:
            errors.append("Corpus entry must have either text or image content")

        if entry.text_content:
            is_valid, msg = cls.validate_text_content(entry.text_content)
            if not is_valid:
                errors.append(f"Text content: {msg}")

        if entry.image_content:
            is_valid, msg = cls.validate_image_data(entry.image_content)
            if not is_valid:
                errors.append(f"Image content: {msg}")

        is_valid_lang, lang_msg = cls.validate_language_code(entry.language)
        if not is_valid_lang:
            errors.append(lang_msg)

        if not 0.0 <= entry.quality_score <= 1.0:
            errors.append("Quality score must be between 0.0 and 1.0")

        return not errors, errors

    @classmethod
    def validate_activity_session(cls, session: ActivitySession) -> Tuple[bool, List[str]]:
        """Run every check that applies to an ``ActivitySession``.

        Checks the session id, the activity type and that the start time is
        not in the future.

        Returns:
            ``(is_valid, errors)`` where ``errors`` lists all failures found.
        """
        errors: List[str] = []

        if not session.session_id:
            errors.append("Session ID is required")

        if not isinstance(session.activity_type, ActivityType):
            errors.append("Invalid activity type")

        if cls._is_in_future(session.start_time):
            errors.append("Start time cannot be in the future")

        return not errors, errors