File size: 14,005 Bytes
517f71b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
# -*- coding: utf-8 -*-
"""

Cross-Lingual Edit Propagation via Subspace Containment

Transfer high-resource corrections to low-resource languages using containment scores



Based on:

    Zhang, Y., et al. (2024). "Deep Hierarchical Learning with Nested Subspace Networks."

    arXiv preprint. NSN framework for hierarchical representation learning.

"""
import numpy as np
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
import logging

logger = logging.getLogger(__name__)


@dataclass
class ContainmentScore:
    """Outcome of one subspace-containment evaluation for a language pair.

    Built by ``EditPropagationEngine.evaluate_subspace_containment``; when
    either language is unknown, every numeric field is zero and
    ``propagation_recommended`` is False.
    """
    source_lang: str  # Language whose subspace acts as the container
    target_lang: str  # Language whose subspace is tested for containment
    rank: int  # Number of leading embedding coordinates that were compared
    containment_score: float  # 0-1, how much target is contained in source
    overlap_dimension: int  # int(rank * containment_score)
    confidence: float  # Confidence in the score, clipped to [0, 1]
    propagation_recommended: bool  # True when containment > 0.75 and confidence > 0.7


@dataclass
class PropagationResult:
    """Record of a single attempt to propagate an edit between two languages.

    Built by ``EditPropagationEngine.propagate_edit``, which also appends
    every result (successful or not) to the engine's propagation history.
    """
    source_lang: str  # Language the edit originates from
    target_lang: str  # Language the edit was propagated to
    rank: int  # Rank passed to the containment evaluation and the transfer
    edit_vector: np.ndarray  # Edit exactly as supplied by the caller
    propagated_vector: np.ndarray  # Transferred edit; all zeros when propagation was not recommended
    containment_score: float  # Containment score of the (source, target, rank) triple
    success: bool  # False when propagation was not recommended, True otherwise
    quality_score: float  # Predicted quality after propagation (0.0 on failure)
    propagation_path: List[str]  # Languages in propagation chain: [source_lang, target_lang]


class EditPropagationEngine:
    """

    Transfer edits from high-resource to low-resource languages using

    subspace containment analysis.

    

    Dashboard Extension:

    - Heatmap of containment scores across language pairs

    - Flow arrows showing edit propagation paths

    """
    
    def __init__(self):
        self.language_embeddings = self._initialize_language_embeddings()
        self.containment_cache: Dict[Tuple[str, str, int], ContainmentScore] = {}
        self.propagation_history: List[PropagationResult] = []
        
    def _initialize_language_embeddings(self) -> Dict[str, np.ndarray]:
        """Initialize language subspace embeddings"""
        # Simulated language embeddings (in practice, learned from data)
        np.random.seed(42)
        
        languages = {
            # High-resource languages (larger subspaces)
            'english': np.random.randn(256),
            'chinese': np.random.randn(256),
            'spanish': np.random.randn(256),
            'french': np.random.randn(256),
            'german': np.random.randn(256),
            
            # Medium-resource languages
            'russian': np.random.randn(256),
            'arabic': np.random.randn(256),
            'japanese': np.random.randn(256),
            'korean': np.random.randn(256),
            'portuguese': np.random.randn(256),
            
            # Low-resource languages (smaller subspaces)
            'indonesian': np.random.randn(256),
            'vietnamese': np.random.randn(256),
            'thai': np.random.randn(256),
            'swahili': np.random.randn(256),
            'yoruba': np.random.randn(256)
        }
        
        # Normalize embeddings
        for lang in languages:
            languages[lang] = languages[lang] / np.linalg.norm(languages[lang])
        
        return languages
    
    def evaluate_subspace_containment(
        self,
        source_lang: str,
        target_lang: str,
        rank: int
    ) -> ContainmentScore:
        """
        Evaluate how much target language subspace is contained in source.

        The first ``rank`` coordinates of both language embeddings are
        compared by cosine similarity, and the result is mapped from
        [-1, 1] to [0, 1]. The truncated slices are re-normalized before the
        comparison, so a language always scores 1.0 against itself.

        Args:
            source_lang: High-resource source language
            target_lang: Low-resource target language
            rank: NSN rank for analysis. Must be positive; values larger
                than the embedding length compare the full embeddings.

        Returns:
            ContainmentScore with containment metrics. When either language
            is unknown or ``rank`` is not positive, a warning is logged and
            an all-zero, not-recommended score is returned without being
            cached. Valid results are cached per (source, target, rank).
        """
        cache_key = (source_lang, target_lang, rank)
        if cache_key in self.containment_cache:
            return self.containment_cache[cache_key]

        # Get language embeddings
        source_emb = self.language_embeddings.get(source_lang)
        target_emb = self.language_embeddings.get(target_lang)

        problem = None
        if source_emb is None or target_emb is None:
            problem = f"Unknown language: {source_lang} or {target_lang}"
        elif rank <= 0:
            # A non-positive rank would slice from the wrong end (emb[:-k])
            # or compare empty vectors, producing a meaningless score.
            problem = f"Invalid rank: {rank} (must be positive)"

        if problem is not None:
            logger.warning(problem)
            return ContainmentScore(
                source_lang=source_lang,
                target_lang=target_lang,
                rank=rank,
                containment_score=0.0,
                overlap_dimension=0,
                confidence=0.0,
                propagation_recommended=False
            )

        # Truncate to rank dimension
        source_subspace = source_emb[:rank]
        target_subspace = target_emb[:rank]

        # Cosine similarity in the rank-dimensional subspace. The slices of a
        # unit vector are not unit vectors themselves, so divide by their
        # norms explicitly.
        norm_product = float(
            np.linalg.norm(source_subspace) * np.linalg.norm(target_subspace)
        )
        if norm_product > 0.0:
            cosine = float(np.dot(source_subspace, target_subspace)) / norm_product
            # Guard against rounding pushing the value just outside [-1, 1].
            cosine = min(1.0, max(-1.0, cosine))
        else:
            # A zero slice has no direction; treat it as orthogonal.
            cosine = 0.0
        containment = (cosine + 1.0) / 2.0  # Normalize to [0, 1]

        # Overlap dimension: effective rank of shared subspace
        overlap_dim = int(rank * containment)

        # Confidence based on rank and language resource levels
        confidence = self._compute_containment_confidence(
            source_lang, target_lang, rank, containment
        )

        # Recommend propagation if containment > 0.75 and confidence > 0.7
        propagation_recommended = containment > 0.75 and confidence > 0.7

        result = ContainmentScore(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            containment_score=containment,
            overlap_dimension=overlap_dim,
            confidence=confidence,
            propagation_recommended=propagation_recommended
        )

        self.containment_cache[cache_key] = result
        return result
    
    def _compute_containment_confidence(

        self,

        source_lang: str,

        target_lang: str,

        rank: int,

        containment: float

    ) -> float:
        """Compute confidence in containment score"""
        # Higher confidence for:
        # - Higher ranks (more dimensions to analyze)
        # - Higher containment scores
        # - Related language families
        
        rank_factor = min(rank / 128.0, 1.0)
        containment_factor = containment
        
        # Language family bonus (simplified)
        family_bonus = 0.0
        if (source_lang in ['english', 'german', 'french', 'spanish'] and
            target_lang in ['english', 'german', 'french', 'spanish']):
            family_bonus = 0.1
        
        confidence = 0.5 * rank_factor + 0.4 * containment_factor + family_bonus
        return float(np.clip(confidence, 0.0, 1.0))
    
    def propagate_edit(
        self,
        source_lang: str,
        target_lang: str,
        rank: int,
        edit_vector: np.ndarray
    ) -> PropagationResult:
        """
        Propagate edit from source to target language.

        The containment of the language pair is evaluated first. If
        propagation is not recommended, a failed result carrying a zero
        vector is returned; otherwise the edit is transferred and its
        quality is scored. Every result, failed or not, is appended to
        ``self.propagation_history``.

        Args:
            source_lang: Source language
            target_lang: Target language
            rank: NSN rank
            edit_vector: Edit vector in source language

        Returns:
            PropagationResult with propagated edit
        """
        # Evaluate containment
        containment = self.evaluate_subspace_containment(
            source_lang, target_lang, rank
        )

        if not containment.propagation_recommended:
            # The language names are separated explicitly; without the
            # separator the message ran them together (e.g. "englishthai").
            logger.warning(
                f"Propagation not recommended: {source_lang} -> {target_lang} "
                f"(containment: {containment.containment_score:.3f})"
            )

            result = PropagationResult(
                source_lang=source_lang,
                target_lang=target_lang,
                rank=rank,
                edit_vector=edit_vector,
                propagated_vector=np.zeros_like(edit_vector),
                containment_score=containment.containment_score,
                success=False,
                quality_score=0.0,
                propagation_path=[source_lang, target_lang]
            )

            self.propagation_history.append(result)
            return result

        # Propagate edit via subspace projection
        propagated_vector = self._transfer_edit(
            edit_vector, source_lang, target_lang, rank
        )

        # Compute quality score
        quality_score = self._compute_propagation_quality(
            edit_vector, propagated_vector, containment.containment_score
        )

        result = PropagationResult(
            source_lang=source_lang,
            target_lang=target_lang,
            rank=rank,
            edit_vector=edit_vector,
            propagated_vector=propagated_vector,
            containment_score=containment.containment_score,
            success=True,
            quality_score=quality_score,
            propagation_path=[source_lang, target_lang]
        )

        self.propagation_history.append(result)
        logger.info(
            f"Propagated edit: {source_lang} -> {target_lang} "
            f"(quality: {quality_score:.3f})"
        )

        return result
    
    def _transfer_edit(

        self,

        edit_vector: np.ndarray,

        source_lang: str,

        target_lang: str,

        rank: int

    ) -> np.ndarray:
        """Transfer edit vector from source to target language"""
        # Get language embeddings
        source_emb = self.language_embeddings[source_lang]
        target_emb = self.language_embeddings[target_lang]
        
        # Project edit onto shared subspace
        # Simplified: weighted combination based on containment
        source_subspace = source_emb[:rank]
        target_subspace = target_emb[:rank]
        
        # Compute transfer matrix (simplified)
        transfer_weight = np.dot(source_subspace, target_subspace)
        
        # Apply transfer
        propagated = edit_vector * transfer_weight
        
        return propagated
    
    def _compute_propagation_quality(

        self,

        original: np.ndarray,

        propagated: np.ndarray,

        containment: float

    ) -> float:
        """Compute quality of propagated edit"""
        # Quality based on:
        # - Containment score
        # - Vector similarity
        # - Magnitude preservation
        
        if np.linalg.norm(propagated) < 1e-6:
            return 0.0
        
        # Cosine similarity
        similarity = np.dot(original, propagated) / (
            np.linalg.norm(original) * np.linalg.norm(propagated)
        )
        similarity = (similarity + 1.0) / 2.0  # Normalize to [0, 1]
        
        # Magnitude preservation
        mag_ratio = np.linalg.norm(propagated) / np.linalg.norm(original)
        mag_score = 1.0 - abs(1.0 - mag_ratio)
        
        # Combined quality
        quality = 0.5 * containment + 0.3 * similarity + 0.2 * mag_score
        
        return float(np.clip(quality, 0.0, 1.0))
    
    def compute_containment_heatmap(

        self,

        languages: List[str],

        rank: int

    ) -> np.ndarray:
        """

        Compute containment heatmap for dashboard visualization.

        

        Args:

            languages: List of languages to analyze

            rank: NSN rank

            

        Returns:

            Heatmap matrix (languages x languages)

        """
        n = len(languages)
        heatmap = np.zeros((n, n))
        
        for i, source in enumerate(languages):
            for j, target in enumerate(languages):
                if i == j:
                    heatmap[i, j] = 1.0
                else:
                    containment = self.evaluate_subspace_containment(
                        source, target, rank
                    )
                    heatmap[i, j] = containment.containment_score
        
        return heatmap
    
    def find_propagation_paths(

        self,

        source_lang: str,

        target_langs: List[str],

        rank: int,

        min_containment: float = 0.75

    ) -> Dict[str, List[str]]:
        """

        Find optimal propagation paths from source to multiple targets.

        

        Returns:

            Dict mapping target language to propagation path

        """
        paths = {}
        
        for target in target_langs:
            # Direct path
            direct_containment = self.evaluate_subspace_containment(
                source_lang, target, rank
            )
            
            if direct_containment.containment_score >= min_containment:
                paths[target] = [source_lang, target]
            else:
                # Try indirect path through intermediate language
                best_path = None
                best_score = 0.0
                
                for intermediate in self.language_embeddings.keys():
                    if intermediate in [source_lang, target]:
                        continue
                    
                    c1 = self.evaluate_subspace_containment(
                        source_lang, intermediate, rank
                    )
                    c2 = self.evaluate_subspace_containment(
                        intermediate, target, rank
                    )
                    
                    combined_score = c1.containment_score * c2.containment_score
                    
                    if combined_score > best_score and combined_score >= min_containment:
                        best_score = combined_score
                        best_path = [source_lang, intermediate, target]
                
                if best_path:
                    paths[target] = best_path
                else:
                    paths[target] = []  # No viable path
        
        return paths