# -*- coding: utf-8 -*-
"""
Cross-Lingual Edit Propagation via Subspace Containment

Transfer high-resource corrections to low-resource languages using
containment scores.

Based on:
Zhang, Y., et al. (2024). "Deep Hierarchical Learning with Nested Subspace
Networks." arXiv preprint. NSN framework for hierarchical representation
learning.
"""

import logging
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import numpy as np

logger = logging.getLogger(__name__)


@dataclass
class ContainmentScore:
    """Subspace containment analysis result for one (source, target, rank)."""

    source_lang: str
    target_lang: str
    rank: int
    containment_score: float  # 0-1, how much target is contained in source
    overlap_dimension: int  # int(compared dimensions * containment_score)
    confidence: float  # 0-1, see _compute_containment_confidence
    propagation_recommended: bool  # containment > 0.75 and confidence > 0.7


@dataclass
class PropagationResult:
    """Result of one edit propagation attempt."""

    source_lang: str
    target_lang: str
    rank: int
    edit_vector: np.ndarray  # Edit as supplied by the caller
    propagated_vector: np.ndarray  # All zeros when propagation was refused
    containment_score: float
    success: bool
    quality_score: float  # Predicted quality after propagation
    propagation_path: List[str]  # Languages in propagation chain


class EditPropagationEngine:
    """
    Transfer edits from high-resource to low-resource languages
    using subspace containment analysis.

    Dashboard Extension:
    - Heatmap of containment scores across language pairs
    - Flow arrows showing edit propagation paths
    """

    # Length of every simulated language embedding.
    _EMBEDDING_DIM = 256

    # Draw order matters: embeddings are sampled sequentially from one seeded
    # generator, so reordering this tuple changes every embedding.
    _LANGUAGES = (
        # High-resource languages
        'english', 'chinese', 'spanish', 'french', 'german',
        # Medium-resource languages
        'russian', 'arabic', 'japanese', 'korean', 'portuguese',
        # Low-resource languages
        'indonesian', 'vietnamese', 'thai', 'swahili', 'yoruba',
    )

    # Languages that receive the (simplified) language-family bonus.
    _RELATED_FAMILY = frozenset({'english', 'german', 'french', 'spanish'})

    def __init__(self):
        self.language_embeddings = self._initialize_language_embeddings()
        self.containment_cache: Dict[Tuple[str, str, int], ContainmentScore] = {}
        self.propagation_history: List[PropagationResult] = []

    def _initialize_language_embeddings(self) -> Dict[str, np.ndarray]:
        """Build simulated, unit-norm language embeddings.

        A private ``RandomState(42)`` is used instead of ``np.random.seed(42)``
        so that constructing an engine does not reset the process-wide NumPy
        random state. The legacy generator yields the same stream as the
        seeded global one, so the embedding values are unchanged.

        Returns:
            Dict mapping language name to a 1-D unit-norm embedding.
        """
        # Simulated language embeddings (in practice, learned from data)
        rng = np.random.RandomState(42)
        embeddings: Dict[str, np.ndarray] = {}
        for lang in self._LANGUAGES:
            vector = rng.randn(self._EMBEDDING_DIM)
            embeddings[lang] = vector / np.linalg.norm(vector)
        return embeddings

    @staticmethod
    def _subspace_cosine(
        source_emb: np.ndarray,
        target_emb: np.ndarray,
        rank: int
    ) -> Tuple[float, int]:
        """Cosine similarity of two embeddings truncated to ``rank`` dims.

        The slices are re-normalised after truncation; without that the dot
        product of two truncated unit vectors shrinks toward zero as the rank
        decreases and is not a cosine.

        Returns:
            ``(cosine, dim)`` where ``dim`` is the number of dimensions that
            were actually compared (``rank`` capped by both embedding
            lengths). ``cosine`` is 0.0 when ``dim`` is not positive or when
            either slice has (near-)zero norm.
        """
        dim = min(rank, source_emb.shape[0], target_emb.shape[0])
        if dim <= 0:
            return 0.0, 0

        source_slice = source_emb[:dim]
        target_slice = target_emb[:dim]
        denom = np.linalg.norm(source_slice) * np.linalg.norm(target_slice)
        if denom < 1e-12:
            return 0.0, dim

        cosine = float(np.dot(source_slice, target_slice) / denom)
        # Rounding can push the ratio a hair outside [-1, 1].
        return float(np.clip(cosine, -1.0, 1.0)), dim

    def evaluate_subspace_containment(
        self,
        source_lang: str,
        target_lang: str,
        rank: int
    ) -> ContainmentScore:
        """
        Evaluate how much target language subspace is contained in source.

        Args:
            source_lang: High-resource source language
            target_lang: Low-resource target language
            rank: NSN rank for analysis (number of leading embedding
                dimensions to compare)

        Returns:
            ContainmentScore with containment metrics. For an unknown
            language or a non-positive rank, a zero score with
            ``propagation_recommended=False`` is returned (and not cached).
        """
        cache_key = (source_lang, target_lang, rank)
        cached = self.containment_cache.get(cache_key)
        if cached is not None:
            return cached

        source_emb = self.language_embeddings.get(source_lang)
        target_emb = self.language_embeddings.get(target_lang)

        invalid = False
        if source_emb is None or target_emb is None:
            logger.warning(
                "Unknown language: %s or %s", source_lang, target_lang
            )
            invalid = True
        elif rank <= 0:
            # A negative rank would otherwise slice from the end of the
            # embedding, and rank 0 would compare two empty vectors.
            logger.warning("Invalid rank: %s", rank)
            invalid = True

        if invalid:
            return ContainmentScore(
                source_lang=source_lang,
                target_lang=target_lang,
                rank=rank,
                containment_score=0.0,
                overlap_dimension=0,
                confidence=0.0,
                propagation_recommended=False
            )

        # Containment score: cosine similarity in rank-dimensional subspace
        cosine, compared_dim = self._subspace_cosine(
            source_emb, target_emb, rank
        )
        containment = (cosine + 1.0) / 2.0  # Normalize to [0, 1]

        # Overlap dimension: effective rank of shared subspace, bounded by
        # the number of dimensions that were actually compared.
        overlap_dim = int(compared_dim * containment)

        # Confidence based on rank and language resource levels
        confidence = self._compute_containment_confidence(
            source_lang, target_lang, rank, containment
        )

        # Recommend propagation if containment > 0.75 and confidence > 0.7
        propagation_recommended = containment > 0.75 and confidence > 0.7

        result = ContainmentScore(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            containment_score=containment,
            overlap_dimension=overlap_dim,
            confidence=confidence,
            propagation_recommended=propagation_recommended
        )

        self.containment_cache[cache_key] = result
        return result

    def _compute_containment_confidence(
        self,
        source_lang: str,
        target_lang: str,
        rank: int,
        containment: float
    ) -> float:
        """Compute confidence in containment score.

        Confidence is ``0.5 * rank_factor + 0.4 * containment + family_bonus``
        clipped to [0, 1], where ``rank_factor`` saturates at rank 128 and the
        bonus is 0.1 when both languages are in the related-family set.
        """
        # Higher confidence for:
        # - Higher ranks (more dimensions to analyze)
        # - Higher containment scores
        # - Related language families
        rank_factor = min(rank / 128.0, 1.0)

        # Language family bonus (simplified)
        same_family = (
            source_lang in self._RELATED_FAMILY
            and target_lang in self._RELATED_FAMILY
        )
        family_bonus = 0.1 if same_family else 0.0

        confidence = 0.5 * rank_factor + 0.4 * containment + family_bonus
        return float(np.clip(confidence, 0.0, 1.0))

    def propagate_edit(
        self,
        source_lang: str,
        target_lang: str,
        rank: int,
        edit_vector: np.ndarray
    ) -> PropagationResult:
        """
        Propagate edit from source to target language.

        Every call, successful or not, appends its result to
        ``self.propagation_history``.

        Args:
            source_lang: Source language
            target_lang: Target language
            rank: NSN rank
            edit_vector: Edit vector in source language

        Returns:
            PropagationResult with propagated edit. When containment analysis
            does not recommend propagation, ``success`` is False, the
            propagated vector is all zeros and the quality score is 0.0.
        """
        containment = self.evaluate_subspace_containment(
            source_lang, target_lang, rank
        )

        if not containment.propagation_recommended:
            logger.warning(
                "Propagation not recommended: %s → %s (containment: %.3f)",
                source_lang, target_lang, containment.containment_score
            )
            result = PropagationResult(
                source_lang=source_lang,
                target_lang=target_lang,
                rank=rank,
                edit_vector=edit_vector,
                propagated_vector=np.zeros_like(edit_vector),
                containment_score=containment.containment_score,
                success=False,
                quality_score=0.0,
                propagation_path=[source_lang, target_lang]
            )
            self.propagation_history.append(result)
            return result

        # Propagate edit via subspace projection
        propagated_vector = self._transfer_edit(
            edit_vector, source_lang, target_lang, rank
        )

        quality_score = self._compute_propagation_quality(
            edit_vector, propagated_vector, containment.containment_score
        )

        result = PropagationResult(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            edit_vector=edit_vector,
            propagated_vector=propagated_vector,
            containment_score=containment.containment_score,
            success=True,
            quality_score=quality_score,
            propagation_path=[source_lang, target_lang]
        )
        self.propagation_history.append(result)

        logger.info(
            "Propagated edit: %s → %s (quality: %.3f)",
            source_lang, target_lang, quality_score
        )
        return result

    def _transfer_edit(
        self,
        edit_vector: np.ndarray,
        source_lang: str,
        target_lang: str,
        rank: int
    ) -> np.ndarray:
        """Transfer edit vector from source to target language.

        Simplified transfer: the edit is scaled by the same rank-truncated
        cosine similarity that drives the containment score, so the transfer
        weight and the containment decision stay consistent.

        Raises:
            KeyError: If either language has no embedding.
        """
        source_emb = self.language_embeddings[source_lang]
        target_emb = self.language_embeddings[target_lang]

        # Compute transfer weight (simplified stand-in for a transfer matrix)
        transfer_weight, _ = self._subspace_cosine(source_emb, target_emb, rank)

        return edit_vector * transfer_weight

    def _compute_propagation_quality(
        self,
        original: np.ndarray,
        propagated: np.ndarray,
        containment: float
    ) -> float:
        """Compute quality of propagated edit.

        Quality is ``0.5 * containment + 0.3 * similarity + 0.2 * mag_score``
        clipped to [0, 1]. Returns 0.0 when either vector has (near-)zero
        norm, because cosine similarity and the magnitude ratio are undefined
        in that case.
        """
        # Quality based on:
        # - Containment score
        # - Vector similarity
        # - Magnitude preservation
        original_norm = np.linalg.norm(original)
        propagated_norm = np.linalg.norm(propagated)
        if propagated_norm < 1e-6 or original_norm < 1e-6:
            return 0.0

        # Cosine similarity, mapped from [-1, 1] to [0, 1]
        similarity = np.dot(original, propagated) / (
            original_norm * propagated_norm
        )
        similarity = (similarity + 1.0) / 2.0

        # Magnitude preservation: 1.0 when norms match, falling off linearly
        mag_ratio = propagated_norm / original_norm
        mag_score = 1.0 - abs(1.0 - mag_ratio)

        quality = 0.5 * containment + 0.3 * similarity + 0.2 * mag_score
        return float(np.clip(quality, 0.0, 1.0))

    def compute_containment_heatmap(
        self,
        languages: List[str],
        rank: int
    ) -> np.ndarray:
        """
        Compute containment heatmap for dashboard visualization.

        Args:
            languages: List of languages to analyze
            rank: NSN rank

        Returns:
            Heatmap matrix (languages x languages); entry ``[i, j]`` is the
            containment score with ``languages[i]`` as source and
            ``languages[j]`` as target. The diagonal is fixed at 1.0.
        """
        n = len(languages)
        heatmap = np.zeros((n, n))

        for i, source in enumerate(languages):
            for j, target in enumerate(languages):
                if i == j:
                    heatmap[i, j] = 1.0
                    continue
                score = self.evaluate_subspace_containment(source, target, rank)
                heatmap[i, j] = score.containment_score

        return heatmap

    def find_propagation_paths(
        self,
        source_lang: str,
        target_langs: List[str],
        rank: int,
        min_containment: float = 0.75
    ) -> Dict[str, List[str]]:
        """
        Find optimal propagation paths from source to multiple targets.

        A direct path is used when its containment score reaches
        ``min_containment``. Otherwise every known language is tried as a
        single intermediate hop, scored by the product of the two hop
        containments; the best product that reaches ``min_containment`` wins.

        Args:
            source_lang: Language the edit originates from
            target_langs: Languages to reach
            rank: NSN rank
            min_containment: Minimum (direct or combined) containment score
                for a path to be considered viable

        Returns:
            Dict mapping target language to propagation path; an empty list
            means no viable path was found.
        """
        paths: Dict[str, List[str]] = {}

        for target in target_langs:
            direct = self.evaluate_subspace_containment(
                source_lang, target, rank
            )
            if direct.containment_score >= min_containment:
                paths[target] = [source_lang, target]
                continue

            # Try indirect path through intermediate language
            best_path: List[str] = []  # Stays empty if no viable path
            best_score = 0.0

            for intermediate in self.language_embeddings:
                if intermediate in (source_lang, target):
                    continue

                first_hop = self.evaluate_subspace_containment(
                    source_lang, intermediate, rank
                )
                second_hop = self.evaluate_subspace_containment(
                    intermediate, target, rank
                )
                combined_score = (
                    first_hop.containment_score * second_hop.containment_score
                )

                if combined_score > best_score and combined_score >= min_containment:
                    best_score = combined_score
                    best_path = [source_lang, intermediate, target]

            paths[target] = best_path

        return paths