# Source snapshot: revision 517f71b, file size 14,005 bytes
# -*- coding: utf-8 -*-
"""
Cross-Lingual Edit Propagation via Subspace Containment
Transfer high-resource corrections to low-resource languages using containment scores
Based on:
Zhang, Y., et al. (2024). "Deep Hierarchical Learning with Nested Subspace Networks."
arXiv preprint. NSN framework for hierarchical representation learning.
"""
import numpy as np
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
import logging
# Module-level logger, named after this module; EditPropagationEngine uses it
# for its warning and info messages.
logger = logging.getLogger(__name__)
@dataclass
class ContainmentScore:
    """
    Outcome of a subspace containment analysis for one language pair.

    Attributes:
        source_lang: Language whose subspace acts as the container.
        target_lang: Language whose subspace is checked against the source.
        rank: Rank at which the pair was analysed.
        containment_score: Value in [0, 1] stating how much of the target is
            contained in the source.
        overlap_dimension: Dimension of the overlap between both subspaces.
        confidence: Confidence attached to ``containment_score``.
        propagation_recommended: Whether propagating edits along this pair
            is advised.
    """

    source_lang: str
    target_lang: str
    rank: int
    containment_score: float
    overlap_dimension: int
    confidence: float
    propagation_recommended: bool
@dataclass
class PropagationResult:
    """
    Result of propagating one edit from a source to a target language.

    Attributes:
        source_lang: Language the edit originates from.
        target_lang: Language the edit was propagated to.
        rank: Rank used for the propagation.
        edit_vector: Edit vector in the source language.
        propagated_vector: Edit vector after transfer to the target language.
        containment_score: Containment score of the language pair.
        success: Whether the edit was actually propagated.
        quality_score: Predicted quality after propagation.
        propagation_path: Languages in the propagation chain, in order.
    """

    source_lang: str
    target_lang: str
    rank: int
    edit_vector: np.ndarray
    propagated_vector: np.ndarray
    containment_score: float
    success: bool
    quality_score: float
    propagation_path: List[str]

    def __eq__(self, other: object) -> bool:
        """
        Compare two results field by field.

        The dataclass-generated ``__eq__`` compares the fields as a tuple,
        which applies ``==`` to the NumPy arrays and raises ``ValueError``
        ("truth value of an array ... is ambiguous") for multi-element
        vectors. The two vector fields are therefore compared with
        ``np.array_equal``; all other fields use plain equality.

        Returns:
            True when every field matches, False otherwise, and
            ``NotImplemented`` when ``other`` is not a ``PropagationResult``.
        """
        if other.__class__ is not self.__class__:
            return NotImplemented
        own_scalars = (
            self.source_lang,
            self.target_lang,
            self.rank,
            self.containment_score,
            self.success,
            self.quality_score,
            self.propagation_path,
        )
        other_scalars = (
            other.source_lang,
            other.target_lang,
            other.rank,
            other.containment_score,
            other.success,
            other.quality_score,
            other.propagation_path,
        )
        return (
            own_scalars == other_scalars
            and np.array_equal(self.edit_vector, other.edit_vector)
            and np.array_equal(self.propagated_vector, other.propagated_vector)
        )
class EditPropagationEngine:
    """
    Transfer edits from high-resource to low-resource languages using
    subspace containment analysis.

    Each language is represented by a simulated unit-norm embedding. The
    containment of a language pair at a given rank is the cosine similarity
    of the first ``rank`` coordinates of both embeddings, mapped to [0, 1].

    Dashboard Extension:
    - Heatmap of containment scores across language pairs
    - Flow arrows showing edit propagation paths
    """

    # Propagation is recommended only above both thresholds.
    CONTAINMENT_THRESHOLD = 0.75
    CONFIDENCE_THRESHOLD = 0.7

    def __init__(self):
        """Create the language embeddings and start with an empty cache and history."""
        self.language_embeddings = self._initialize_language_embeddings()
        self.containment_cache: Dict[Tuple[str, str, int], "ContainmentScore"] = {}
        self.propagation_history: List["PropagationResult"] = []

    def _initialize_language_embeddings(self) -> Dict[str, np.ndarray]:
        """
        Build one unit-norm, 256-dimensional embedding per supported language.

        The vectors are simulated (in practice they would be learned from
        data). A private ``RandomState`` seeded with 42 produces the same
        values as seeding the global generator would, without resetting the
        process-wide NumPy random stream as a side effect.

        Returns:
            Dict mapping language name to its normalized embedding.
        """
        rng = np.random.RandomState(42)
        language_names = (
            # High-resource languages
            'english', 'chinese', 'spanish', 'french', 'german',
            # Medium-resource languages
            'russian', 'arabic', 'japanese', 'korean', 'portuguese',
            # Low-resource languages
            'indonesian', 'vietnamese', 'thai', 'swahili', 'yoruba',
        )
        embeddings = {}
        for name in language_names:
            vector = rng.randn(256)
            embeddings[name] = vector / np.linalg.norm(vector)
        return embeddings

    @staticmethod
    def _subspace_cosine(
        source_emb: np.ndarray,
        target_emb: np.ndarray,
        rank: int
    ) -> float:
        """
        Cosine similarity of two embeddings restricted to their first ``rank`` coordinates.

        The truncated vectors are re-normalized: the embeddings are unit-norm
        only over their full length, so a plain dot product of the truncated
        vectors would shrink with the rank instead of measuring the angle.

        Returns:
            Value in [-1, 1]; 0.0 when ``rank`` is not positive or one of the
            truncated vectors is (numerically) zero.
        """
        if rank <= 0:
            return 0.0
        source_subspace = source_emb[:rank]
        target_subspace = target_emb[:rank]
        denominator = np.linalg.norm(source_subspace) * np.linalg.norm(target_subspace)
        if denominator < 1e-12:
            return 0.0
        cosine = np.dot(source_subspace, target_subspace) / denominator
        # Clip away floating-point overshoot so derived scores stay in range.
        return float(np.clip(cosine, -1.0, 1.0))

    @staticmethod
    def _zero_containment(
        source_lang: str,
        target_lang: str,
        rank: int
    ) -> "ContainmentScore":
        """Build the all-zero, not-recommended score used when a pair cannot be analysed."""
        return ContainmentScore(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            containment_score=0.0,
            overlap_dimension=0,
            confidence=0.0,
            propagation_recommended=False
        )

    def evaluate_subspace_containment(
        self,
        source_lang: str,
        target_lang: str,
        rank: int
    ) -> "ContainmentScore":
        """
        Evaluate how much target language subspace is contained in source.

        Results for known languages and a positive rank are cached per
        ``(source_lang, target_lang, rank)``. Unknown languages and
        non-positive ranks log a warning and yield a zero score that is not
        cached.

        Args:
            source_lang: High-resource source language
            target_lang: Low-resource target language
            rank: NSN rank for analysis (number of leading embedding
                coordinates compared)

        Returns:
            ContainmentScore with containment metrics
        """
        cache_key = (source_lang, target_lang, rank)
        cached = self.containment_cache.get(cache_key)
        if cached is not None:
            return cached

        source_emb = self.language_embeddings.get(source_lang)
        target_emb = self.language_embeddings.get(target_lang)
        if source_emb is None or target_emb is None:
            logger.warning(f"Unknown language: {source_lang} or {target_lang}")
            return self._zero_containment(source_lang, target_lang, rank)
        if rank <= 0:
            logger.warning(f"Invalid rank {rank}: expected a positive integer")
            return self._zero_containment(source_lang, target_lang, rank)

        # Containment score: cosine similarity in the rank-dimensional
        # subspace, mapped from [-1, 1] to [0, 1].
        cosine = self._subspace_cosine(source_emb, target_emb, rank)
        containment = (cosine + 1.0) / 2.0

        # A rank above the embedding size cannot add dimensions, so the
        # overlap is bounded by the number of coordinates actually compared.
        effective_rank = min(rank, len(source_emb), len(target_emb))
        overlap_dim = int(effective_rank * containment)

        confidence = self._compute_containment_confidence(
            source_lang, target_lang, rank, containment
        )
        propagation_recommended = (
            containment > self.CONTAINMENT_THRESHOLD
            and confidence > self.CONFIDENCE_THRESHOLD
        )

        result = ContainmentScore(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            containment_score=containment,
            overlap_dimension=overlap_dim,
            confidence=confidence,
            propagation_recommended=propagation_recommended
        )
        self.containment_cache[cache_key] = result
        return result

    def _compute_containment_confidence(
        self,
        source_lang: str,
        target_lang: str,
        rank: int,
        containment: float
    ) -> float:
        """
        Compute confidence in a containment score.

        Confidence is ``0.5 * rank_factor + 0.4 * containment + family_bonus``
        clipped to [0, 1], where ``rank_factor`` grows linearly with the rank
        and saturates at rank 128, and ``family_bonus`` is 0.1 when both
        languages belong to the (simplified) English/German/French/Spanish
        group.
        """
        related_languages = ('english', 'german', 'french', 'spanish')
        rank_factor = min(rank / 128.0, 1.0)
        both_related = (
            source_lang in related_languages and target_lang in related_languages
        )
        family_bonus = 0.1 if both_related else 0.0
        confidence = 0.5 * rank_factor + 0.4 * containment + family_bonus
        return float(np.clip(confidence, 0.0, 1.0))

    def propagate_edit(
        self,
        source_lang: str,
        target_lang: str,
        rank: int,
        edit_vector: np.ndarray
    ) -> "PropagationResult":
        """
        Propagate edit from source to target language.

        The edit is transferred only when the containment analysis recommends
        it; otherwise a failed result with a zero propagated vector and zero
        quality is produced. Every result, successful or not, is appended to
        ``propagation_history``.

        Args:
            source_lang: Source language
            target_lang: Target language
            rank: NSN rank
            edit_vector: Edit vector in source language

        Returns:
            PropagationResult with propagated edit
        """
        containment = self.evaluate_subspace_containment(
            source_lang, target_lang, rank
        )

        if containment.propagation_recommended:
            propagated_vector = self._transfer_edit(
                edit_vector, source_lang, target_lang, rank
            )
            quality_score = self._compute_propagation_quality(
                edit_vector, propagated_vector, containment.containment_score
            )
            success = True
        else:
            logger.warning(
                f"Propagation not recommended: {source_lang} → {target_lang} "
                f"(containment: {containment.containment_score:.3f})"
            )
            propagated_vector = np.zeros_like(edit_vector)
            quality_score = 0.0
            success = False

        result = PropagationResult(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            edit_vector=edit_vector,
            propagated_vector=propagated_vector,
            containment_score=containment.containment_score,
            success=success,
            quality_score=quality_score,
            propagation_path=[source_lang, target_lang]
        )
        self.propagation_history.append(result)

        if success:
            logger.info(
                f"Propagated edit: {source_lang} → {target_lang} "
                f"(quality: {quality_score:.3f})"
            )
        return result

    def _transfer_edit(
        self,
        edit_vector: np.ndarray,
        source_lang: str,
        target_lang: str,
        rank: int
    ) -> np.ndarray:
        """
        Transfer edit vector from source to target language.

        Simplified transfer: the edit is scaled by the cosine similarity of
        the two languages in the rank-dimensional subspace, i.e. the same
        quantity the containment score is derived from.

        Raises:
            KeyError: If either language has no embedding.
        """
        source_emb = self.language_embeddings[source_lang]
        target_emb = self.language_embeddings[target_lang]
        transfer_weight = self._subspace_cosine(source_emb, target_emb, rank)
        return np.asarray(edit_vector) * transfer_weight

    def _compute_propagation_quality(
        self,
        original: np.ndarray,
        propagated: np.ndarray,
        containment: float
    ) -> float:
        """
        Compute quality of a propagated edit.

        Quality is ``0.5 * containment + 0.3 * similarity + 0.2 * mag_score``
        clipped to [0, 1], where ``similarity`` is the cosine similarity of
        both vectors mapped to [0, 1] and ``mag_score`` penalizes deviation
        of the norm ratio from 1.

        Returns:
            0.0 when either vector is (numerically) zero, since neither
            direction nor magnitude ratio is defined then.
        """
        original_norm = np.linalg.norm(original)
        propagated_norm = np.linalg.norm(propagated)
        if propagated_norm < 1e-6 or original_norm < 1e-6:
            return 0.0

        # Cosine similarity, normalized to [0, 1]
        similarity = np.dot(original, propagated) / (original_norm * propagated_norm)
        similarity = (similarity + 1.0) / 2.0

        # Magnitude preservation
        mag_ratio = propagated_norm / original_norm
        mag_score = 1.0 - abs(1.0 - mag_ratio)

        quality = 0.5 * containment + 0.3 * similarity + 0.2 * mag_score
        return float(np.clip(quality, 0.0, 1.0))

    def compute_containment_heatmap(
        self,
        languages: List[str],
        rank: int
    ) -> np.ndarray:
        """
        Compute containment heatmap for dashboard visualization.

        Entry ``[i, j]`` is the containment score with ``languages[i]`` as
        source and ``languages[j]`` as target; the diagonal is fixed at 1.0.

        Args:
            languages: List of languages to analyze
            rank: NSN rank

        Returns:
            Heatmap matrix (languages x languages)
        """
        n = len(languages)
        heatmap = np.zeros((n, n))
        for i, source in enumerate(languages):
            for j, target in enumerate(languages):
                if i == j:
                    heatmap[i, j] = 1.0
                    continue
                score = self.evaluate_subspace_containment(source, target, rank)
                heatmap[i, j] = score.containment_score
        return heatmap

    def find_propagation_paths(
        self,
        source_lang: str,
        target_langs: List[str],
        rank: int,
        min_containment: float = 0.75
    ) -> Dict[str, List[str]]:
        """
        Find optimal propagation paths from source to multiple targets.

        A direct path is used when the pair's containment score reaches
        ``min_containment``. Otherwise every other known language is tried as
        a single intermediate hop, scored by the product of the two hop
        containments; the best hop whose product reaches ``min_containment``
        wins. Only containment scores are checked here, not confidence.

        Args:
            source_lang: Language the edit starts from
            target_langs: Languages to reach
            rank: NSN rank
            min_containment: Minimum (combined) containment for a viable path

        Returns:
            Dict mapping target language to propagation path; the path is an
            empty list when no viable path exists.
        """
        paths: Dict[str, List[str]] = {}
        for target in target_langs:
            direct = self.evaluate_subspace_containment(source_lang, target, rank)
            if direct.containment_score >= min_containment:
                paths[target] = [source_lang, target]
                continue

            # Try indirect path through one intermediate language
            best_path: List[str] = []
            best_score = 0.0
            for intermediate in self.language_embeddings:
                if intermediate in (source_lang, target):
                    continue
                first_hop = self.evaluate_subspace_containment(
                    source_lang, intermediate, rank
                )
                second_hop = self.evaluate_subspace_containment(
                    intermediate, target, rank
                )
                combined_score = (
                    first_hop.containment_score * second_hop.containment_score
                )
                if combined_score > best_score and combined_score >= min_containment:
                    best_score = combined_score
                    best_path = [source_lang, intermediate, target]
            paths[target] = best_path
        return paths
|