Spaces:
Building
Building
| """ | |
| Smart Defaults Manager for Lexical Sophistication Analysis | |
| Provides intelligent default configurations based on measure types and analysis context. | |
| """ | |
| from typing import Dict, List, Any, Tuple, Optional | |
| import logging | |
| from web_app.schema_validator import SchemaValidator | |
| logger = logging.getLogger(__name__) | |
class DefaultsManager:
    """Manages smart defaults for lexical sophistication analysis.

    The class is a namespace for classification tables plus classmethods that
    derive default measure selections from a reference-list configuration.
    Callers in this file invoke the methods directly on the class.
    """

    # Define measure type patterns for intelligent classification.
    # Dict order matters: the first category with a matching pattern wins.
    MEASURE_PATTERNS = {
        'frequency': ['freq', 'frequency', 'count', 'occurrence'],
        # 'pmi'/'npmi' are listed explicitly because 'mi' is matched as a whole
        # token (see _pattern_matches) and would no longer catch them.
        'association': ['mi', 'pmi', 'npmi', 't_score', 'delta_p', 'ap_collex', 'llr', 'dice'],
        'psycholinguistic': ['concreteness', 'valence', 'arousal', 'dominance', 'imageability', 'familiarity'],
        'range': ['range', 'documents', 'texts', 'dispersion'],
        'rank': ['rank', 'ranking', 'order'],
        'probability': ['probability', 'prob', 'likelihood'],
    }

    # Define appropriate log transformation rules
    LOG_TRANSFORM_RULES = {
        'frequency': True,          # Always log-transform frequency measures
        'association': False,       # Never log-transform association measures
        'psycholinguistic': False,  # Never log-transform ratings/scales
        'range': False,             # Never log-transform range measures
        'rank': False,              # Never log-transform ranks
        'probability': False,       # Never log-transform probabilities
    }

    # Define default measure priorities (higher = more important/commonly used)
    MEASURE_PRIORITIES = {
        'frequency': 100,
        'normalized_freq': 95,
        'mi': 90,
        't_score': 85,
        'concreteness': 80,
        'range': 75,
        'dispersion': 70,
        'delta_p': 65,
        'rank': 60,
        'ap_collex': 55,
    }

    # Column names that hold the lexical item itself rather than a measure.
    _WORD_COLUMNS = frozenset({'word', 'surface_form', 'lemma', 'bigram', 'trigram', 'ngram'})

    # Patterns this short collide with ordinary words when matched as
    # substrings ('mi' occurs inside 'familiarity' and 'dominance'), so they
    # must equal a whole token of the measure name instead.
    _WHOLE_TOKEN_MAX_LEN = 3

    # Score bonuses for common measure families; only the first matching row applies.
    _PATTERN_BONUSES = (
        (('freq', 'frequency'), 50),
        (('mi', 'pmi', 'npmi', 't_score'), 40),
        (('concreteness', 'range'), 30),
    )

    _HIGH_PRIORITY_SCORE = 90   # Measures scoring at least this are always selected
    _MIN_DEFAULT_MEASURES = 2   # The top-scoring measures are taken regardless of type
    _MAX_DEFAULT_MEASURES = 4   # Hard cap on the number of default measures

    # Suffixes that mark the two halves of a token/lemma entry pair.
    _PAIR_SUFFIXES = ('_token', '_lemma')

    @staticmethod
    def _name_tokens(measure_lower: str) -> List[str]:
        """Split a lower-cased measure name into alphanumeric tokens."""
        spaced = ''.join(ch if ch.isalnum() else ' ' for ch in measure_lower)
        return spaced.split()

    @classmethod
    def _pattern_matches(cls, pattern: str, measure_lower: str, tokens: List[str]) -> bool:
        """Return True if ``pattern`` identifies the measure name.

        Short patterns must equal a whole token; longer ones match as substrings.
        """
        if len(pattern) <= cls._WHOLE_TOKEN_MAX_LEN:
            return pattern in tokens
        return pattern in measure_lower

    @classmethod
    def classify_measure_type(cls, measure_name: str) -> str:
        """
        Classify a measure into its type category.

        Args:
            measure_name: Name of the measure to classify

        Returns:
            Category name ('frequency', 'association', 'psycholinguistic',
            'range', 'rank', 'probability'), or 'unknown' when no pattern matches
        """
        measure_lower = measure_name.lower().strip()
        tokens = cls._name_tokens(measure_lower)
        for category, patterns in cls.MEASURE_PATTERNS.items():
            if any(cls._pattern_matches(pattern, measure_lower, tokens) for pattern in patterns):
                return category
        return 'unknown'

    @classmethod
    def get_smart_defaults_for_entry(cls, entry_config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Generate smart defaults for a configuration entry.

        Args:
            entry_config: Configuration entry (old or new schema format); its
                'columns' value is read as a mapping of column name -> index

        Returns:
            Dictionary with smart default fields (see _generate_smart_defaults)
        """
        columns = entry_config.get('columns') or {}
        if not isinstance(columns, dict):
            # Measure names cannot be derived from a non-mapping value, so
            # return empty defaults instead of raising on .items().
            return cls._generate_smart_defaults([])

        # Get all non-word columns as potential measures
        measure_names = [name for name in columns if name.lower() not in cls._WORD_COLUMNS]

        if not measure_names:
            # Fallback: assume all columns except the one at index 0 are measures
            measure_names = [name for name, idx in columns.items() if idx != 0]

        return cls._generate_smart_defaults(measure_names)

    @classmethod
    def _generate_smart_defaults(cls, measure_names: List[str]) -> Dict[str, Any]:
        """
        Generate smart defaults based on measure classification.

        Args:
            measure_names: List of available measure names

        Returns:
            Dictionary with keys 'log_transformable', 'selectable_measures',
            'default_measures', 'default_log_transforms' and
            'measure_classifications'
        """
        measure_classifications = {
            measure: cls.classify_measure_type(measure) for measure in measure_names
        }

        # A measure is log-transformable when its category's rule says so;
        # unknown categories default to False.
        log_transformable = [
            measure for measure, category in measure_classifications.items()
            if cls.LOG_TRANSFORM_RULES.get(category, False)
        ]

        default_measures = cls._select_default_measures(measure_names, measure_classifications)

        # Default log transforms: intersection of defaults and log-transformable
        default_log_transforms = [m for m in default_measures if m in log_transformable]

        return {
            'log_transformable': log_transformable,
            'selectable_measures': list(measure_names),
            'default_measures': default_measures,
            'default_log_transforms': default_log_transforms,
            'measure_classifications': measure_classifications,  # For debugging/UI display
        }

    @classmethod
    def _select_default_measures(cls, measure_names: List[str],
                                 measure_classifications: Dict[str, str]) -> List[str]:
        """
        Select default measures based on priority and type diversity.

        Args:
            measure_names: Available measure names
            measure_classifications: Classification of each measure

        Returns:
            List of default measure names, at most four, highest score first
        """
        # Score = exact-name priority plus the bonus of the first matching family.
        measure_scores = {}
        for measure in measure_names:
            measure_lower = measure.lower()
            tokens = cls._name_tokens(measure_lower)
            score = cls.MEASURE_PRIORITIES.get(measure_lower, 0)
            for patterns, bonus in cls._PATTERN_BONUSES:
                if any(cls._pattern_matches(p, measure_lower, tokens) for p in patterns):
                    score += bonus
                    break
            measure_scores[measure] = score

        # sorted() is stable, so ties keep their original column order.
        ranked = sorted(measure_scores, key=measure_scores.get, reverse=True)

        selected = []
        selected_types = set()
        for measure in ranked:
            if len(selected) >= cls._MAX_DEFAULT_MEASURES:
                break
            measure_type = measure_classifications.get(measure, 'unknown')
            always_include = (measure_scores[measure] >= cls._HIGH_PRIORITY_SCORE
                              or len(selected) < cls._MIN_DEFAULT_MEASURES)
            # Beyond the guaranteed picks, only add measures of a type not yet represented.
            if always_include or measure_type not in selected_types:
                selected.append(measure)
                selected_types.add(measure_type)
        return selected

    @classmethod
    def _pair_base_name(cls, entry_name: str) -> Optional[str]:
        """Return the name without its '_token'/'_lemma' suffix, or None if it has neither."""
        for suffix in cls._PAIR_SUFFIXES:
            if entry_name.endswith(suffix) and len(entry_name) > len(suffix):
                return entry_name[:-len(suffix)]
        return None

    @classmethod
    def get_ui_groupings(cls, config_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """
        Generate UI groupings for reference list entries.
        Groups related token/lemma entries together for display.

        Args:
            config_data: Full configuration data, nested as
                language -> n-gram type -> entry name -> entry config

        Returns:
            Dictionary mapping '<language>_<ngram_type>_<name>' group keys to
            dicts with 'display_name', 'entries', 'type' and 'language'
        """
        groupings: Dict[str, Dict[str, Any]] = {}

        for language, lang_data in config_data.items():
            if not isinstance(lang_data, dict):
                continue
            for ngram_type, type_data in lang_data.items():
                if not isinstance(type_data, dict):
                    continue

                # Tracked per (language, n-gram type): group keys are scoped the
                # same way, so an identical entry name elsewhere is a different entry.
                processed = set()
                for entry_name, entry_config in type_data.items():
                    if entry_name in processed or not isinstance(entry_config, dict):
                        continue

                    # Only new-schema entries (with analysis_type) can pair up.
                    base_name = None
                    if entry_config.get('analysis_type'):
                        base_name = cls._pair_base_name(entry_name)

                    if base_name is not None:
                        token_name = f"{base_name}_token"
                        lemma_name = f"{base_name}_lemma"
                        if token_name in type_data and lemma_name in type_data:
                            groupings[f"{language}_{ngram_type}_{base_name}"] = {
                                'display_name': base_name.replace('_', ' ').title(),
                                'entries': [token_name, lemma_name],
                                'type': ngram_type,
                                'language': language,
                            }
                            processed.update((token_name, lemma_name))
                            continue

                    # Old-schema entry, or new-schema entry without a partner
                    groupings[f"{language}_{ngram_type}_{entry_name}"] = {
                        'display_name': entry_config.get('display_name', entry_name),
                        'entries': [entry_name],
                        'type': ngram_type,
                        'language': language,
                    }
                    processed.add(entry_name)

        return groupings

    @classmethod
    def apply_smart_defaults_to_config(cls, config_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Apply smart defaults to configuration entries that don't have them.

        The input is not modified: each nesting level that receives new fields
        is copied first. Entries left unchanged are shared with the input.

        Args:
            config_data: Configuration data to enhance

        Returns:
            Enhanced configuration data with smart defaults
        """
        enhanced_config = config_data.copy()

        for language, lang_data in config_data.items():
            if not isinstance(lang_data, dict):
                continue
            enhanced_lang = lang_data.copy()
            enhanced_config[language] = enhanced_lang

            for ngram_type, type_data in lang_data.items():
                if not isinstance(type_data, dict):
                    continue
                enhanced_type = type_data.copy()
                enhanced_lang[ngram_type] = enhanced_type

                for entry_name, entry_config in type_data.items():
                    if not isinstance(entry_config, dict):
                        continue
                    # An entry carrying any new-schema field is left untouched.
                    if any(field in entry_config for field in SchemaValidator.NEW_SCHEMA_FIELDS):
                        continue
                    enhanced_entry = entry_config.copy()
                    enhanced_entry.update(cls.get_smart_defaults_for_entry(entry_config))
                    enhanced_type[entry_name] = enhanced_entry
                    logger.info("Applied smart defaults to %s", entry_name)

        return enhanced_config

    @staticmethod
    def _find_entry(entry_name: str, config_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Return the first non-empty dict config stored under ``entry_name``, or None."""
        for lang_data in config_data.values():
            if not isinstance(lang_data, dict):
                continue
            for type_data in lang_data.values():
                if not isinstance(type_data, dict):
                    continue
                candidate = type_data.get(entry_name)
                if candidate and isinstance(candidate, dict):
                    return candidate
        return None

    @classmethod
    def get_default_analysis_config(cls, selected_entries: List[str],
                                    config_data: Dict[str, Any]) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
        """
        Generate default analysis configuration for selected entries.

        Entries that cannot be found in ``config_data`` are omitted from both
        result dictionaries.

        Args:
            selected_entries: List of selected reference list entries
            config_data: Full configuration data

        Returns:
            Tuple of (selected_measures, log_transforms) dictionaries
        """
        selected_measures = {}
        log_transforms = {}

        for entry_name in selected_entries:
            entry_config = cls._find_entry(entry_name, config_data)
            if not entry_config:
                continue

            measures = entry_config.get('default_measures')
            transforms = entry_config.get('default_log_transforms')

            # Generate smart defaults once, and only if a configured value is
            # missing or empty.
            if not measures or not transforms:
                generated = cls.get_smart_defaults_for_entry(entry_config)
                measures = measures or generated['default_measures']
                transforms = transforms or generated['default_log_transforms']

            selected_measures[entry_name] = measures
            log_transforms[entry_name] = transforms

        return selected_measures, log_transforms
def test_smart_defaults():
    """Exercise the smart defaults engine and print the results.

    This is a manual smoke test: it prints classifications and generated
    defaults for two sample column layouts but asserts nothing.

    Returns:
        Tuple of (defaults, assoc_defaults): the smart-default dictionaries
        generated for the frequency-style and association-style sample configs.
    """
    # Section markers are plain ASCII: the decorative glyphs in these strings
    # were mis-decoded and could not be recovered.
    print("=== TESTING SMART DEFAULTS ENGINE ===")

    # Test measure classification
    test_measures = ['frequency', 'MI', 'concreteness', 'range', 'delta_p', 'normalized_freq']
    print("\nMeasure Classification:")
    for measure in test_measures:
        category = DefaultsManager.classify_measure_type(measure)
        should_log = DefaultsManager.LOG_TRANSFORM_RULES.get(category, False)
        print(f"  {measure} -> {category} (log: {should_log})")

    # Test smart defaults generation
    print("\nSmart Defaults Generation:")
    test_config = {
        'columns': {
            'word': 0,
            'frequency': 1,
            'normalized_freq': 2,
            'range': 3,
            'dispersion': 4,
        }
    }
    defaults = DefaultsManager.get_smart_defaults_for_entry(test_config)
    print(f"  Log transformable: {defaults['log_transformable']}")
    print(f"  Default measures: {defaults['default_measures']}")
    print(f"  Default log transforms: {defaults['default_log_transforms']}")

    # Test association measures
    print("\nAssociation Measures Test:")
    assoc_config = {
        'columns': {
            'bigram': 0,
            'frequency': 1,
            'MI': 2,
            'T': 3,
            'delta_p': 4,
        }
    }
    assoc_defaults = DefaultsManager.get_smart_defaults_for_entry(assoc_config)
    print(f"  Log transformable: {assoc_defaults['log_transformable']}")
    print(f"  Default measures: {assoc_defaults['default_measures']}")
    print(f"  Default log transforms: {assoc_defaults['default_log_transforms']}")

    print("\nSmart Defaults Engine working perfectly!")
    return defaults, assoc_defaults
if __name__ == "__main__":
    # Run the smoke test only when executed as a script, not on import.
    test_smart_defaults()