# Page residue from the hosting site, preserved as a comment:
#   "singarajusaiteja's picture" / "core app upload" / "dad1de9 verified"
"""
Validation functions for data models and user input
"""
from typing import Dict, List, Tuple, Any, Optional
import re
from datetime import datetime
from corpus_collection_engine.models.data_models import UserContribution, CorpusEntry, ActivitySession, ActivityType
from corpus_collection_engine.config import VALIDATION_CONFIG, SUPPORTED_LANGUAGES
class ValidationError(Exception):
    """Custom exception for validation errors.

    Plain ``Exception`` subclass with no extra state; construct it with a
    human-readable message, e.g. ``ValidationError("text too short")``.
    """
class DataValidator:
    """Validator class for all data models and user input.

    The single-value validators return ``(is_valid, message)``; the model
    validators return ``(is_valid, errors)`` where ``errors`` is a list of
    messages. Malformed input (wrong types, missing values) is reported
    through those return values rather than by raising.
    """

    @staticmethod
    def validate_text_content(
        text: str,
        min_length: Optional[int] = None,
        max_length: Optional[int] = None,
    ) -> Tuple[bool, str]:
        """Validate text content length and basic format.

        Args:
            text: The text to check. Length limits apply to the text after
                surrounding whitespace has been stripped.
            min_length: Minimum allowed length; ``None`` falls back to
                ``VALIDATION_CONFIG['min_text_length']``.
            max_length: Maximum allowed length; ``None`` falls back to
                ``VALIDATION_CONFIG['max_text_length']``.

        Returns:
            ``(True, "Valid text content")`` on success, otherwise
            ``(False, reason)``.
        """
        if not text:
            return False, "Text content cannot be empty"
        if not isinstance(text, str):
            # Non-string values have no meaningful length/strip semantics.
            return False, "Text content must be a string"

        stripped = text.strip()
        if not stripped:
            return False, "Text content cannot be empty"

        # ``is None`` (not ``or``) so that an explicit 0 is honoured.
        min_len = min_length if min_length is not None else VALIDATION_CONFIG['min_text_length']
        max_len = max_length if max_length is not None else VALIDATION_CONFIG['max_text_length']

        if len(stripped) < min_len:
            return False, f"Text must be at least {min_len} characters long"
        if len(stripped) > max_len:
            return False, f"Text must not exceed {max_len} characters"

        # Basic spam detection: one character repeated 11+ times in a row.
        if re.search(r'(.)\1{10,}', stripped):
            return False, "Text contains suspicious repeated patterns"

        return True, "Valid text content"

    @staticmethod
    def validate_language_code(language: str) -> Tuple[bool, str]:
        """Validate language code against ``SUPPORTED_LANGUAGES``.

        Returns:
            ``(True, "Valid language: <value>")`` where ``<value>`` is the
            ``SUPPORTED_LANGUAGES`` entry for the code, otherwise
            ``(False, reason)``.
        """
        if not language:
            return False, "Language code cannot be empty"
        if language not in SUPPORTED_LANGUAGES:
            return False, f"Unsupported language code: {language}"
        return True, f"Valid language: {SUPPORTED_LANGUAGES[language]}"

    @staticmethod
    def validate_image_data(image_data: bytes, max_size: Optional[int] = None) -> Tuple[bool, str]:
        """Validate image data size and basic format.

        Args:
            image_data: Raw image bytes (``bytes`` or ``bytearray``).
            max_size: Maximum size in bytes; ``None`` falls back to
                ``VALIDATION_CONFIG['max_image_size']``.

        Returns:
            ``(True, "Valid image data")`` when the data is within the size
            limit and starts with a JPEG, PNG or WEBP signature, otherwise
            ``(False, reason)``.
        """
        if not image_data:
            return False, "Image data cannot be empty"
        if not isinstance(image_data, (bytes, bytearray)):
            return False, "Image data must be bytes"

        # ``is None`` (not ``or``) so that an explicit 0 is honoured.
        limit = max_size if max_size is not None else VALIDATION_CONFIG['max_image_size']
        size = len(image_data)
        if size > limit:
            size_mb = size / (1024 * 1024)
            max_mb = limit / (1024 * 1024)
            return False, f"Image size ({size_mb:.1f}MB) exceeds maximum ({max_mb:.1f}MB)"

        # Signature check only; the payload itself is not decoded.
        is_jpeg_or_png = image_data.startswith((b'\xff\xd8\xff', b'\x89PNG\r\n\x1a\n'))
        # RIFF is a generic container (WAV, AVI, ...): a WEBP file also
        # carries the "WEBP" FourCC at bytes 8-11, after the 4-byte size.
        is_webp = image_data[:4] == b'RIFF' and image_data[8:12] == b'WEBP'
        if not (is_jpeg_or_png or is_webp):
            return False, "Invalid image format. Supported: JPEG, PNG, WEBP"

        return True, "Valid image data"

    @staticmethod
    def validate_cultural_context(context: Dict[str, Any]) -> Tuple[bool, str]:
        """Validate cultural context data.

        The context must be a dict containing the keys ``region`` and
        ``cultural_significance``. A non-empty ``region`` must be a string of
        at least 2 characters after stripping; an empty one is accepted.

        Returns:
            ``(True, "Valid cultural context")`` or ``(False, reason)``.
        """
        if not isinstance(context, dict):
            return False, "Cultural context must be a dictionary"

        # These keys are required for every contribution.
        required_fields = ['region', 'cultural_significance']
        missing_fields = [field for field in required_fields if field not in context]
        if missing_fields:
            return False, f"Missing required cultural context fields: {missing_fields}"

        region = context['region']
        if region:
            if not isinstance(region, str):
                return False, "Region must be a string"
            if len(region.strip()) < 2:
                return False, "Region must be at least 2 characters long"

        return True, "Valid cultural context"

    @staticmethod
    def _is_in_future(moment: datetime) -> bool:
        """Return True when *moment* is later than the current time.

        Timezone-aware values are compared with an aware "now" in the same
        timezone; naive values are compared with naive local time. Mixing the
        two in a direct comparison would raise ``TypeError``.
        """
        is_aware = moment.tzinfo is not None and moment.utcoffset() is not None
        now = datetime.now(tz=moment.tzinfo) if is_aware else datetime.now()
        return moment > now

    @classmethod
    def _check_text_field(
        cls,
        content_data: Dict[str, Any],
        key: str,
        label: str,
        min_length: Optional[int] = None,
    ) -> List[str]:
        """Validate one required text field of an activity's content.

        Returns a list with at most one message: either that ``key`` is
        missing, or the ``validate_text_content`` failure prefixed with
        ``"<label> <key>: "``.
        """
        if key not in content_data:
            return [f"{label} content must include {key}"]
        is_valid, msg = cls.validate_text_content(content_data[key], min_length=min_length)
        return [] if is_valid else [f"{label} {key}: {msg}"]

    @classmethod
    def validate_user_contribution(cls, contribution: "UserContribution") -> Tuple[bool, List[str]]:
        """Comprehensive validation for UserContribution.

        Checks the session id, activity type, language, activity-specific
        content, cultural context and timestamp, collecting every problem
        found instead of stopping at the first one.

        Returns:
            ``(True, [])`` when valid, otherwise ``(False, errors)``.
        """
        errors: List[str] = []

        # Basic fields
        if not contribution.user_session:
            errors.append("User session ID is required")
        if not isinstance(contribution.activity_type, ActivityType):
            errors.append("Invalid activity type")

        # Language
        is_valid_lang, lang_msg = cls.validate_language_code(contribution.language)
        if not is_valid_lang:
            errors.append(lang_msg)

        # Content data, checked according to the activity type
        errors.extend(
            cls._validate_activity_content(contribution.activity_type, contribution.content_data)
        )

        # Cultural context
        is_valid_context, context_msg = cls.validate_cultural_context(contribution.cultural_context)
        if not is_valid_context:
            errors.append(context_msg)

        # Timestamp
        if not isinstance(contribution.timestamp, datetime):
            errors.append("Timestamp must be a datetime")
        elif cls._is_in_future(contribution.timestamp):
            errors.append("Timestamp cannot be in the future")

        return len(errors) == 0, errors

    @classmethod
    def _validate_activity_content(cls, activity_type: "ActivityType", content_data: Dict[str, Any]) -> List[str]:
        """Validate content data specific to activity type.

        MEME requires ``text``; RECIPE requires non-empty ``title``,
        ``ingredients`` and ``instructions``; FOLKLORE requires ``story`` of
        at least 50 characters; LANDMARK requires ``description``. Any other
        activity type yields no content errors.

        Returns:
            A list of error messages (empty when the content is acceptable).
        """
        if not isinstance(content_data, dict):
            return ["Content data must be a dictionary"]

        errors: List[str] = []

        if activity_type == ActivityType.MEME:
            errors.extend(cls._check_text_field(content_data, 'text', "Meme"))

        elif activity_type == ActivityType.RECIPE:
            for field in ('title', 'ingredients', 'instructions'):
                if field not in content_data:
                    errors.append(f"Recipe content must include {field}")
                elif not content_data[field]:
                    errors.append(f"Recipe {field} cannot be empty")

        elif activity_type == ActivityType.FOLKLORE:
            errors.extend(cls._check_text_field(content_data, 'story', "Folklore", min_length=50))

        elif activity_type == ActivityType.LANDMARK:
            errors.extend(cls._check_text_field(content_data, 'description', "Landmark"))

        return errors

    @classmethod
    def validate_corpus_entry(cls, entry: "CorpusEntry") -> Tuple[bool, List[str]]:
        """Comprehensive validation for CorpusEntry.

        Requires a contribution id, at least one of text or image content
        (each validated when present), a supported language and a quality
        score within ``[0.0, 1.0]``.

        Returns:
            ``(True, [])`` when valid, otherwise ``(False, errors)``.
        """
        errors: List[str] = []

        if not entry.contribution_id:
            errors.append("Contribution ID is required")

        # Must have either text or image content
        if not entry.text_content and not entry.image_content:
            errors.append("Corpus entry must have either text or image content")

        if entry.text_content:
            is_valid, msg = cls.validate_text_content(entry.text_content)
            if not is_valid:
                errors.append(f"Text content: {msg}")

        if entry.image_content:
            is_valid, msg = cls.validate_image_data(entry.image_content)
            if not is_valid:
                errors.append(f"Image content: {msg}")

        is_valid_lang, lang_msg = cls.validate_language_code(entry.language)
        if not is_valid_lang:
            errors.append(lang_msg)

        # A score that cannot be compared with numbers (e.g. None) is
        # reported as out of range instead of raising TypeError.
        try:
            score_in_range = 0.0 <= entry.quality_score <= 1.0
        except TypeError:
            score_in_range = False
        if not score_in_range:
            errors.append("Quality score must be between 0.0 and 1.0")

        return len(errors) == 0, errors

    @classmethod
    def validate_activity_session(cls, session: "ActivitySession") -> Tuple[bool, List[str]]:
        """Comprehensive validation for ActivitySession.

        Requires a session id, a valid ``ActivityType`` and a start time that
        is a datetime not lying in the future.

        Returns:
            ``(True, [])`` when valid, otherwise ``(False, errors)``.
        """
        errors: List[str] = []

        if not session.session_id:
            errors.append("Session ID is required")

        if not isinstance(session.activity_type, ActivityType):
            errors.append("Invalid activity type")

        if not isinstance(session.start_time, datetime):
            errors.append("Start time must be a datetime")
        elif cls._is_in_future(session.start_time):
            errors.append("Start time cannot be in the future")

        return len(errors) == 0, errors