Spaces:
Build error
Build error
| # -*- coding: utf-8 -*- | |
| """ | |
| Cross-Lingual Edit Propagation via Subspace Containment | |
| Transfer high-resource corrections to low-resource languages using containment scores | |
| Based on: | |
| Zhang, Y., et al. (2024). "Deep Hierarchical Learning with Nested Subspace Networks." | |
| arXiv preprint. NSN framework for hierarchical representation learning. | |
| """ | |
| import numpy as np | |
| from typing import Dict, List, Optional, Tuple | |
| from dataclasses import dataclass | |
| import logging | |
| logger = logging.getLogger(__name__) | |
@dataclass
class ContainmentScore:
    """Subspace containment analysis result for one (source, target, rank) triple.

    Instances are created with keyword arguments, so the class must be a
    dataclass: without the decorator the annotations below would not produce
    an ``__init__`` and construction would raise ``TypeError``.
    """

    # Language the containment is measured from.
    source_lang: str
    # Language the containment is measured into.
    target_lang: str
    # Number of leading embedding components used for the analysis.
    rank: int
    # 0-1, how much target is contained in source.
    containment_score: float
    # Dimension of overlap (integer part of rank * containment_score).
    overlap_dimension: int
    # Confidence in the containment score, in [0, 1].
    confidence: float
    # True when both containment and confidence clear the engine's thresholds.
    propagation_recommended: bool
# eq=False keeps identity-based equality: the auto-generated __eq__ would
# compare the ndarray fields element-wise and raise "truth value of an array
# is ambiguous" for any vector with more than one element.
@dataclass(eq=False)
class PropagationResult:
    """Result of edit propagation from a source to a target language.

    Instances are created with keyword arguments, so the class must be a
    dataclass: without the decorator the annotations below would not produce
    an ``__init__`` and construction would raise ``TypeError``.
    """

    # Language the edit originates from.
    source_lang: str
    # Language the edit was propagated to.
    target_lang: str
    # Number of leading embedding components used for the transfer.
    rank: int
    # The edit as supplied by the caller.
    edit_vector: np.ndarray
    # The edit after transfer; all zeros when propagation was not performed.
    propagated_vector: np.ndarray
    # Containment score of the (source, target, rank) triple.
    containment_score: float
    # False when propagation was skipped because it was not recommended.
    success: bool
    # Predicted quality after propagation.
    quality_score: float
    # Languages in propagation chain.
    propagation_path: List[str]
class EditPropagationEngine:
    """
    Transfer edits from high-resource to low-resource languages using
    subspace containment analysis.

    The language "subspaces" are simulated: every language is a fixed,
    unit-norm random vector, and a rank-``r`` analysis uses the first ``r``
    components of that vector.

    Dashboard Extension:
    - Heatmap of containment scores across language pairs
    - Flow arrows showing edit propagation paths
    """

    # Length of every simulated language embedding.
    _EMBEDDING_DIM = 256
    # Seed of the private generator, so embeddings are reproducible.
    _EMBEDDING_SEED = 42
    # Order matters: vectors are drawn from the generator in this order.
    _LANGUAGES = (
        # High-resource languages
        'english', 'chinese', 'spanish', 'french', 'german',
        # Medium-resource languages
        'russian', 'arabic', 'japanese', 'korean', 'portuguese',
        # Low-resource languages
        'indonesian', 'vietnamese', 'thai', 'swahili', 'yoruba',
    )
    # Languages that earn the (simplified) language-family confidence bonus.
    _FAMILY_BONUS_LANGS = frozenset({'english', 'german', 'french', 'spanish'})
    # Propagation is recommended only when both thresholds are exceeded.
    _MIN_CONTAINMENT = 0.75
    _MIN_CONFIDENCE = 0.7
    # Vectors with a norm below this are treated as zero.
    _NORM_EPS = 1e-6

    def __init__(self):
        """Build the language embeddings and start with empty cache/history."""
        self.language_embeddings = self._initialize_language_embeddings()
        self.containment_cache: Dict[Tuple[str, str, int], ContainmentScore] = {}
        self.propagation_history: List[PropagationResult] = []

    def _initialize_language_embeddings(self) -> Dict[str, np.ndarray]:
        """Initialize language subspace embeddings.

        Returns:
            Dict mapping language name to a unit-norm vector of length 256.
        """
        # Simulated language embeddings (in practice, learned from data).
        # A private RandomState yields the same stream as seeding the global
        # generator with the same seed, without resetting the process-wide
        # RNG state for every other user of np.random.
        rng = np.random.RandomState(self._EMBEDDING_SEED)
        embeddings: Dict[str, np.ndarray] = {}
        for lang in self._LANGUAGES:
            vector = rng.randn(self._EMBEDDING_DIM)
            embeddings[lang] = vector / np.linalg.norm(vector)
        return embeddings

    @staticmethod
    def _zero_containment(
        source_lang: str,
        target_lang: str,
        rank: int
    ) -> "ContainmentScore":
        """Build the all-zero, not-recommended score used for invalid input."""
        return ContainmentScore(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            containment_score=0.0,
            overlap_dimension=0,
            confidence=0.0,
            propagation_recommended=False
        )

    def evaluate_subspace_containment(
        self,
        source_lang: str,
        target_lang: str,
        rank: int
    ) -> "ContainmentScore":
        """
        Evaluate how much target language subspace is contained in source.

        Results for known languages are memoized in ``containment_cache``.
        Unknown languages and non-positive ranks log a warning and yield an
        all-zero, not-recommended score that is not cached.

        Args:
            source_lang: High-resource source language
            target_lang: Low-resource target language
            rank: NSN rank for analysis (number of leading components used)
        Returns:
            ContainmentScore with containment metrics
        """
        cache_key = (source_lang, target_lang, rank)
        cached = self.containment_cache.get(cache_key)
        if cached is not None:
            return cached

        source_emb = self.language_embeddings.get(source_lang)
        target_emb = self.language_embeddings.get(target_lang)
        if source_emb is None or target_emb is None:
            logger.warning(
                "Unknown language: %s or %s", source_lang, target_lang
            )
            return self._zero_containment(source_lang, target_lang, rank)

        if rank <= 0:
            # A negative rank would silently slice from the END of the
            # embedding, and rank 0 leaves no components to compare.
            logger.warning(
                "Invalid rank %s for %s → %s; rank must be positive",
                rank, source_lang, target_lang
            )
            return self._zero_containment(source_lang, target_lang, rank)

        # Truncate to rank dimension
        source_subspace = source_emb[:rank]
        target_subspace = target_emb[:rank]

        # Containment score: dot product of the truncated embeddings.
        # NOTE(review): the truncated vectors are not re-normalized, so this
        # is a true cosine similarity only when rank covers the full vector.
        # It still lies in [-1, 1] because both are slices of unit vectors.
        containment = float(np.dot(source_subspace, target_subspace))
        containment = (containment + 1.0) / 2.0  # Normalize to [0, 1]

        # Overlap dimension: effective rank of shared subspace
        overlap_dim = int(rank * containment)

        confidence = self._compute_containment_confidence(
            source_lang, target_lang, rank, containment
        )
        propagation_recommended = (
            containment > self._MIN_CONTAINMENT
            and confidence > self._MIN_CONFIDENCE
        )

        result = ContainmentScore(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            containment_score=containment,
            overlap_dimension=overlap_dim,
            confidence=confidence,
            propagation_recommended=propagation_recommended
        )
        self.containment_cache[cache_key] = result
        return result

    def _compute_containment_confidence(
        self,
        source_lang: str,
        target_lang: str,
        rank: int,
        containment: float
    ) -> float:
        """Compute confidence in containment score.

        Confidence is ``0.5 * min(rank / 128, 1) + 0.4 * containment`` plus a
        0.1 bonus when both languages are in the family-bonus set, clipped
        to [0, 1].
        """
        # Higher confidence for:
        # - Higher ranks (more dimensions to analyze)
        # - Higher containment scores
        # - Related language families
        rank_factor = min(rank / 128.0, 1.0)

        # Language family bonus (simplified)
        same_family = (
            source_lang in self._FAMILY_BONUS_LANGS
            and target_lang in self._FAMILY_BONUS_LANGS
        )
        family_bonus = 0.1 if same_family else 0.0

        confidence = 0.5 * rank_factor + 0.4 * containment + family_bonus
        return float(np.clip(confidence, 0.0, 1.0))

    def propagate_edit(
        self,
        source_lang: str,
        target_lang: str,
        rank: int,
        edit_vector: np.ndarray
    ) -> "PropagationResult":
        """
        Propagate edit from source to target language.

        Every call, successful or not, appends its result to
        ``propagation_history``.

        Args:
            source_lang: Source language
            target_lang: Target language
            rank: NSN rank
            edit_vector: Edit vector in source language
        Returns:
            PropagationResult with propagated edit. When propagation is not
            recommended, ``success`` is False, ``propagated_vector`` is all
            zeros and ``quality_score`` is 0.0.
        """
        containment = self.evaluate_subspace_containment(
            source_lang, target_lang, rank
        )
        success = containment.propagation_recommended

        if success:
            # Propagate edit via subspace projection
            propagated_vector = self._transfer_edit(
                edit_vector, source_lang, target_lang, rank
            )
            quality_score = self._compute_propagation_quality(
                edit_vector, propagated_vector, containment.containment_score
            )
        else:
            logger.warning(
                "Propagation not recommended: %s → %s (containment: %.3f)",
                source_lang, target_lang, containment.containment_score
            )
            propagated_vector = np.zeros_like(edit_vector)
            quality_score = 0.0

        result = PropagationResult(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            edit_vector=edit_vector,
            propagated_vector=propagated_vector,
            containment_score=containment.containment_score,
            success=success,
            quality_score=quality_score,
            propagation_path=[source_lang, target_lang]
        )
        self.propagation_history.append(result)

        if success:
            logger.info(
                "Propagated edit: %s → %s (quality: %.3f)",
                source_lang, target_lang, quality_score
            )
        return result

    def _transfer_edit(
        self,
        edit_vector: np.ndarray,
        source_lang: str,
        target_lang: str,
        rank: int
    ) -> np.ndarray:
        """Transfer edit vector from source to target language.

        Simplified transfer: the edit is scaled by the dot product of the two
        rank-truncated language embeddings.

        Raises:
            KeyError: If either language has no embedding.
        """
        source_subspace = self.language_embeddings[source_lang][:rank]
        target_subspace = self.language_embeddings[target_lang][:rank]

        # Compute transfer matrix (simplified to a single scalar weight)
        transfer_weight = np.dot(source_subspace, target_subspace)
        return edit_vector * transfer_weight

    def _compute_propagation_quality(
        self,
        original: np.ndarray,
        propagated: np.ndarray,
        containment: float
    ) -> float:
        """Compute quality of propagated edit.

        Quality is ``0.5 * containment + 0.3 * similarity + 0.2 * magnitude``
        clipped to [0, 1], where similarity is the cosine similarity mapped
        to [0, 1] and magnitude is ``1 - |1 - ‖propagated‖ / ‖original‖|``.
        Returns 0.0 when either vector is (numerically) zero.
        """
        original_norm = np.linalg.norm(original)
        propagated_norm = np.linalg.norm(propagated)
        # Both norms are divisors below; a zero original would otherwise
        # produce NaN/inf instead of a usable score.
        if original_norm < self._NORM_EPS or propagated_norm < self._NORM_EPS:
            return 0.0

        # Cosine similarity
        similarity = np.dot(original, propagated) / (
            original_norm * propagated_norm
        )
        similarity = (similarity + 1.0) / 2.0  # Normalize to [0, 1]

        # Magnitude preservation
        mag_ratio = propagated_norm / original_norm
        mag_score = 1.0 - abs(1.0 - mag_ratio)

        # Combined quality
        quality = 0.5 * containment + 0.3 * similarity + 0.2 * mag_score
        return float(np.clip(quality, 0.0, 1.0))

    def compute_containment_heatmap(
        self,
        languages: List[str],
        rank: int
    ) -> np.ndarray:
        """
        Compute containment heatmap for dashboard visualization.

        Row index is the source language, column index the target. The
        diagonal is fixed at 1.0 without evaluating containment.

        Args:
            languages: List of languages to analyze
            rank: NSN rank
        Returns:
            Heatmap matrix (languages x languages)
        """
        n = len(languages)
        heatmap = np.zeros((n, n))
        for i, source in enumerate(languages):
            for j, target in enumerate(languages):
                if i == j:
                    heatmap[i, j] = 1.0
                    continue
                score = self.evaluate_subspace_containment(source, target, rank)
                heatmap[i, j] = score.containment_score
        return heatmap

    def find_propagation_paths(
        self,
        source_lang: str,
        target_langs: List[str],
        rank: int,
        min_containment: float = 0.75
    ) -> Dict[str, List[str]]:
        """
        Find optimal propagation paths from source to multiple targets.

        A direct path is used when its containment reaches
        ``min_containment``. Otherwise every other known language is tried as
        a single intermediate hop, scored by the product of the two hop
        containments; the best hop reaching ``min_containment`` wins.

        Args:
            source_lang: Language the edit starts from
            target_langs: Languages to reach
            rank: NSN rank
            min_containment: Minimum (direct or combined) containment score
        Returns:
            Dict mapping target language to propagation path; an empty list
            means no viable path.
        """
        paths: Dict[str, List[str]] = {}
        for target in target_langs:
            # Direct path
            direct = self.evaluate_subspace_containment(
                source_lang, target, rank
            )
            if direct.containment_score >= min_containment:
                paths[target] = [source_lang, target]
                continue

            # An unknown endpoint scores 0.0 on every hop, so no intermediate
            # can help; skip the search (and its per-hop warnings).
            if (source_lang not in self.language_embeddings
                    or target not in self.language_embeddings):
                paths[target] = []
                continue

            # Try indirect path through intermediate language
            best_path: List[str] = []
            best_score = 0.0
            for intermediate in self.language_embeddings:
                if intermediate == source_lang or intermediate == target:
                    continue
                first_hop = self.evaluate_subspace_containment(
                    source_lang, intermediate, rank
                )
                second_hop = self.evaluate_subspace_containment(
                    intermediate, target, rank
                )
                combined_score = (
                    first_hop.containment_score * second_hop.containment_score
                )
                if combined_score > best_score and combined_score >= min_containment:
                    best_score = combined_score
                    best_path = [source_lang, intermediate, target]
            paths[target] = best_path  # Empty list: no viable path
        return paths