# quantum-nsn-integration / edit_propagation_engine.py
# Nurcholish's picture
# Upload 17 files
# 517f71b verified
# -*- coding: utf-8 -*-
"""
Cross-Lingual Edit Propagation via Subspace Containment
Transfer high-resource corrections to low-resource languages using containment scores
Based on:
Zhang, Y., et al. (2024). "Deep Hierarchical Learning with Nested Subspace Networks."
arXiv preprint. NSN framework for hierarchical representation learning.
"""
import numpy as np
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
import logging
logger = logging.getLogger(__name__)
@dataclass
class ContainmentScore:
    """
    Outcome of a subspace containment analysis for one (source, target) pair.

    Instances are plain value records; all fields are set by the caller.
    """

    # Language treated as the containing (source) subspace
    source_lang: str
    # Language whose subspace is tested for containment
    target_lang: str
    # Rank at which the analysis was carried out
    rank: int
    # Value in 0-1: how much of the target is contained in the source
    containment_score: float
    # Dimension of the overlap between the two subspaces
    overlap_dimension: int
    # Confidence attached to containment_score
    confidence: float
    # True when propagating edits across this pair is advised
    propagation_recommended: bool
@dataclass
class PropagationResult:
    """
    Record describing one attempt to propagate an edit between two languages.
    """

    # Language the edit originates from
    source_lang: str
    # Language the edit is transferred to
    target_lang: str
    # Rank used for the transfer
    rank: int
    # Edit vector as supplied for the source language
    edit_vector: np.ndarray
    # Edit vector after transfer to the target language
    propagated_vector: np.ndarray
    # Containment score of the (source, target) pair at this rank
    containment_score: float
    # Whether the propagation was carried out
    success: bool
    # Predicted quality after propagation
    quality_score: float
    # Languages in the propagation chain, in order
    propagation_path: List[str]
class EditPropagationEngine:
"""
Transfer edits from high-resource to low-resource languages using
subspace containment analysis.
Dashboard Extension:
- Heatmap of containment scores across language pairs
- Flow arrows showing edit propagation paths
"""
def __init__(self) -> None:
    """
    Create an engine with freshly built embeddings, an empty containment
    cache and an empty propagation history.
    """
    # Per-language embedding vectors, keyed by language name
    self.language_embeddings = self._initialize_language_embeddings()
    # Memoised containment results, keyed by (source_lang, target_lang, rank)
    self.containment_cache: Dict[Tuple[str, str, int], ContainmentScore] = {}
    # Propagation results collected over the lifetime of this engine
    self.propagation_history: List[PropagationResult] = []
def _initialize_language_embeddings(self) -> Dict[str, np.ndarray]:
    """
    Build the simulated, unit-norm embedding vector of every supported language.

    The vectors are placeholders (in practice they would be learned from
    data). They are drawn from a private generator seeded with 42, so the
    values are reproducible and equal to what ``np.random.seed(42)`` followed
    by successive ``np.random.randn(256)`` calls produces, while the
    process-wide NumPy random state is left untouched.

    Returns:
        Dict mapping language name to a 256-dimensional vector of L2 norm 1,
        in the tier order listed below.
    """
    embedding_dim = 256
    # Every tier uses the same dimensionality; the grouping is informational.
    language_names = (
        # High-resource languages
        'english', 'chinese', 'spanish', 'french', 'german',
        # Medium-resource languages
        'russian', 'arabic', 'japanese', 'korean', 'portuguese',
        # Low-resource languages
        'indonesian', 'vietnamese', 'thai', 'swahili', 'yoruba',
    )
    # A local RandomState avoids reseeding the global RNG as a side effect of
    # constructing an engine; it yields the same stream as the seeded global.
    rng = np.random.RandomState(42)
    embeddings: Dict[str, np.ndarray] = {}
    for lang in language_names:
        vector = rng.randn(embedding_dim)
        # Normalize to unit length
        embeddings[lang] = vector / np.linalg.norm(vector)
    return embeddings
def evaluate_subspace_containment(
    self,
    source_lang: str,
    target_lang: str,
    rank: int
) -> ContainmentScore:
    """
    Evaluate how much target language subspace is contained in source.

    The containment score is the cosine similarity between the first
    ``rank`` components of the two language embeddings, mapped from
    [-1, 1] to [0, 1]. The truncated slices are re-normalized before the
    comparison, so a language compared with itself scores 1.0.

    Results for known languages are memoised in ``self.containment_cache``
    under the key ``(source_lang, target_lang, rank)``.

    Args:
        source_lang: High-resource source language
        target_lang: Low-resource target language
        rank: NSN rank for analysis. Values outside the embedding size are
            clamped to [0, embedding size] for the computation; the
            returned score still reports the requested rank.
    Returns:
        ContainmentScore with containment metrics. If either language is
        unknown, a warning is logged and an all-zero, not-recommended
        score is returned (and not cached).
    """
    cache_key = (source_lang, target_lang, rank)
    cached = self.containment_cache.get(cache_key)
    if cached is not None:
        return cached

    # Get language embeddings
    source_emb = self.language_embeddings.get(source_lang)
    target_emb = self.language_embeddings.get(target_lang)
    if source_emb is None or target_emb is None:
        logger.warning("Unknown language: %s or %s", source_lang, target_lang)
        return ContainmentScore(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            containment_score=0.0,
            overlap_dimension=0,
            confidence=0.0,
            propagation_recommended=False
        )

    # Clamp the rank: a negative value would otherwise slice from the end of
    # the vector, and a value above the embedding size would overstate the
    # overlap dimension.
    max_rank = min(len(source_emb), len(target_emb))
    effective_rank = max(0, min(rank, max_rank))

    # Truncate to rank dimension
    source_subspace = source_emb[:effective_rank]
    target_subspace = target_emb[:effective_rank]

    # Cosine similarity in the rank-dimensional subspace. The slices of a
    # unit vector are not unit length themselves, so divide by their norms.
    norm_product = float(
        np.linalg.norm(source_subspace) * np.linalg.norm(target_subspace)
    )
    if norm_product > 0.0:
        cosine = float(np.dot(source_subspace, target_subspace)) / norm_product
        # Guard against rounding pushing the value just outside [-1, 1]
        cosine = float(np.clip(cosine, -1.0, 1.0))
    else:
        # Empty or all-zero slice: no directional information, stay neutral
        cosine = 0.0
    containment = (cosine + 1.0) / 2.0  # Normalize to [0, 1]

    # Overlap dimension: effective rank of shared subspace
    overlap_dim = int(effective_rank * containment)

    # Confidence based on rank and language resource levels
    confidence = self._compute_containment_confidence(
        source_lang, target_lang, effective_rank, containment
    )

    # Recommend propagation if containment > 0.75 and confidence > 0.7
    propagation_recommended = containment > 0.75 and confidence > 0.7

    result = ContainmentScore(
        source_lang=source_lang,
        target_lang=target_lang,
        rank=rank,
        containment_score=containment,
        overlap_dimension=overlap_dim,
        confidence=confidence,
        propagation_recommended=propagation_recommended
    )
    self.containment_cache[cache_key] = result
    return result
def _compute_containment_confidence(
    self,
    source_lang: str,
    target_lang: str,
    rank: int,
    containment: float
) -> float:
    """
    Score how much a containment value can be trusted.

    The result is a weighted sum, clipped to [0, 1]:
      - 0.5 x rank factor (rank / 128, capped at 1.0)
      - 0.4 x the containment value itself
      - a flat 0.1 bonus when both languages belong to the
        english/german/french/spanish group (simplified family check)

    Args:
        source_lang: Source language name
        target_lang: Target language name
        rank: Rank the containment was computed at
        containment: Containment value being assessed
    Returns:
        Confidence as a Python float in [0, 1]
    """
    related_group = ('english', 'german', 'french', 'spanish')
    both_related = source_lang in related_group and target_lang in related_group
    family_bonus = 0.1 if both_related else 0.0

    # More dimensions analysed means more trust, saturating at rank 128
    rank_factor = min(rank / 128.0, 1.0)

    raw_confidence = 0.5 * rank_factor + 0.4 * containment + family_bonus
    return float(np.clip(raw_confidence, 0.0, 1.0))
def propagate_edit(
    self,
    source_lang: str,
    target_lang: str,
    rank: int,
    edit_vector: np.ndarray
) -> PropagationResult:
    """
    Propagate edit from source to target language.

    The language pair is first scored with
    ``evaluate_subspace_containment``. When propagation is not recommended
    a warning is logged and a failed result (zero propagated vector,
    quality 0.0) is returned. Otherwise the edit is transferred and its
    quality estimated. In both cases the result is appended to
    ``self.propagation_history``.

    Args:
        source_lang: Source language
        target_lang: Target language
        rank: NSN rank
        edit_vector: Edit vector in source language
    Returns:
        PropagationResult with propagated edit
    """
    # Evaluate containment
    containment = self.evaluate_subspace_containment(
        source_lang, target_lang, rank
    )
    recommended = containment.propagation_recommended

    if recommended:
        # Propagate edit via subspace projection
        propagated_vector = self._transfer_edit(
            edit_vector, source_lang, target_lang, rank
        )
        # Compute quality score
        quality_score = self._compute_propagation_quality(
            edit_vector, propagated_vector, containment.containment_score
        )
    else:
        # The language names are separated by an arrow so the message stays
        # readable (they were previously printed back to back).
        logger.warning(
            "Propagation not recommended: %s -> %s (containment: %.3f)",
            source_lang, target_lang, containment.containment_score
        )
        propagated_vector = np.zeros_like(edit_vector)
        quality_score = 0.0

    result = PropagationResult(
        source_lang=source_lang,
        target_lang=target_lang,
        rank=rank,
        edit_vector=edit_vector,
        propagated_vector=propagated_vector,
        containment_score=containment.containment_score,
        success=recommended,
        quality_score=quality_score,
        propagation_path=[source_lang, target_lang]
    )
    self.propagation_history.append(result)

    if recommended:
        logger.info(
            "Propagated edit: %s -> %s (quality: %.3f)",
            source_lang, target_lang, quality_score
        )
    return result
def _transfer_edit(
    self,
    edit_vector: np.ndarray,
    source_lang: str,
    target_lang: str,
    rank: int
) -> np.ndarray:
    """
    Transfer an edit vector from the source to the target language.

    Simplified transfer: the edit is multiplied by a single scalar weight,
    the dot product of the first ``rank`` components of the two language
    embeddings.

    Args:
        edit_vector: Edit vector in the source language
        source_lang: Source language (must be a key of language_embeddings)
        target_lang: Target language (must be a key of language_embeddings)
        rank: Number of leading embedding components to compare
    Returns:
        The scaled edit vector
    """
    embeddings = self.language_embeddings
    source_slice = embeddings[source_lang][:rank]
    target_slice = embeddings[target_lang][:rank]

    # Scalar overlap between the two truncated embeddings
    weight = np.dot(source_slice, target_slice)
    return edit_vector * weight
def _compute_propagation_quality(
    self,
    original: np.ndarray,
    propagated: np.ndarray,
    containment: float
) -> float:
    """
    Estimate the quality of a propagated edit.

    Weighted sum, clipped to [0, 1], of:
      - 0.5 x containment score
      - 0.3 x cosine similarity of original and propagated (mapped to [0, 1])
      - 0.2 x magnitude preservation, 1 - |1 - ||propagated|| / ||original|||

    A propagated vector with norm below 1e-6 scores 0.0 outright.

    Args:
        original: Edit vector before propagation
        propagated: Edit vector after propagation
        containment: Containment score of the language pair
    Returns:
        Quality as a Python float in [0, 1]
    """
    propagated_norm = np.linalg.norm(propagated)
    if propagated_norm < 1e-6:
        return 0.0
    original_norm = np.linalg.norm(original)

    # Directional agreement, shifted from [-1, 1] to [0, 1]
    cosine = np.dot(original, propagated) / (original_norm * propagated_norm)
    similarity = (cosine + 1.0) / 2.0

    # Penalise any change in length, whether shrinkage or growth
    mag_ratio = propagated_norm / original_norm
    mag_score = 1.0 - abs(1.0 - mag_ratio)

    combined = 0.5 * containment + 0.3 * similarity + 0.2 * mag_score
    return float(np.clip(combined, 0.0, 1.0))
def compute_containment_heatmap(
    self,
    languages: List[str],
    rank: int
) -> np.ndarray:
    """
    Build the pairwise containment matrix used by the dashboard heatmap.

    Entry [i, j] holds the containment score with languages[i] as source
    and languages[j] as target. Diagonal entries are fixed at 1.0 without
    evaluating the pair.

    Args:
        languages: List of languages to analyze
        rank: NSN rank
    Returns:
        Heatmap matrix (languages x languages)
    """
    size = len(languages)
    matrix = np.zeros((size, size))
    for row, source in enumerate(languages):
        for col, target in enumerate(languages):
            # The conditional keeps the diagonal from triggering an evaluation
            matrix[row, col] = (
                1.0 if row == col
                else self.evaluate_subspace_containment(
                    source, target, rank
                ).containment_score
            )
    return matrix
def find_propagation_paths(
    self,
    source_lang: str,
    target_langs: List[str],
    rank: int,
    min_containment: float = 0.75
) -> Dict[str, List[str]]:
    """
    Find a propagation path from the source to each target language.

    A direct path is used when the pair's containment score reaches
    ``min_containment``. Otherwise every other known language is tried as
    a single intermediate hop, scored by the product of the two
    containment scores; the best hop whose product reaches
    ``min_containment`` wins (the first one found wins ties).

    Args:
        source_lang: Language the edit starts from
        target_langs: Languages to reach
        rank: NSN rank
        min_containment: Minimum score a direct or two-leg path must reach
    Returns:
        Dict mapping target language to propagation path; an empty list
        means no viable path was found
    """
    routes = {}
    for target in target_langs:
        direct = self.evaluate_subspace_containment(source_lang, target, rank)
        if direct.containment_score >= min_containment:
            routes[target] = [source_lang, target]
            continue

        # No direct route: look for the best single intermediate language
        top_route = None
        top_score = 0.0
        for hop in self.language_embeddings.keys():
            if hop == source_lang or hop == target:
                continue
            first_leg = self.evaluate_subspace_containment(
                source_lang, hop, rank
            )
            second_leg = self.evaluate_subspace_containment(
                hop, target, rank
            )
            score = first_leg.containment_score * second_leg.containment_score
            if not (score > top_score and score >= min_containment):
                continue
            top_score = score
            top_route = [source_lang, hop, target]

        routes[target] = top_route if top_route else []  # [] = no viable path
    return routes