"""

Samāsa (Compound) Splitter

Detects and splits Sanskrit compound words at their boundaries.

"""

from typing import List, Tuple, Optional
from dataclasses import dataclass

# Import analyzer for Kosha access (absolute imports for HF compatibility)
from analyzer import VidyutAnalyzer, MorphParse
from sandhi_engine import SandhiEngine


@dataclass
class CompoundSplit:
    """Result of compound splitting."""
    surface: str              # Original compound
    components: List[str]     # Split components
    split_points: List[int]   # Character positions of splits
    is_compound: bool         # Was this actually a compound?
    compound_type: Optional[str]  # tatpuruṣa, dvandva, bahuvrīhi, etc.


class SamasaSplitter:
    """

    Splits Sanskrit compound words (samāsa) at their boundaries.

    Uses Kosha lookups to validate potential split points.

    """
    
    # Common compound final elements (uttarapada patterns)
    COMPOUND_FINALS = [
        "kara", "kAra", "kArin", "kft", "kftya",
        "gata", "gati", "gamana",
        "ja", "jAta", "janman",
        "Da", "DAra", "DAraka", "DArin",
        "maya", "mat", "vat",
        "pati", "nATa", "ISvara", "adhipa",
        "Atman", "rUpa", "svarUpa",
        "pada", "pAduka",
        "stha", "sthita", "sthAna",
        "yukta", "hIna", "rahita",
        "priya", "rata", "ASrita",
        "vid", "jYa", "vadin", "pAla",
        "rAja", "indra", "deva", "loka", 
        "karziR", "AkarziRi","ISa",              # Loving/devoted
    ]
    
    # Common compound first elements (pūrvapada patterns)
    COMPOUND_INITIALS = [
        "mahA", "ati", "su", "dur", "sat", "a", "an",  # Prefixes
        "sarva", "viSva", "eka", "bahu",               # All/one/many
        "deva", "brahma", "Atma", "para",              # Divine/supreme
        "rAja", "mahI", "loka",                        # King/earth/world
        "hfd", "manas", "citta",                       # Heart/mind
        "padma", "kamala", "Ananda", "ISa",                            # Lotus
    ]
    
    # Hardcoded protection for high-frequency words that might be over-split
    COMMON_WORDS = {
        "namaH", "namo", "om", "rAmo", "rAmaH", "hariH", "guruH",
        "pArvatI", "Siva", "nArAyaRa", "lokAH", "SivAya","AkarziRi",
    }
    
    def __init__(self, analyzer: Optional[VidyutAnalyzer] = None):
        """Initialize with optional shared analyzer."""
        self.analyzer = analyzer or VidyutAnalyzer(preload_cache=False)
        self.sandhi_engine = SandhiEngine()  # V4: Generative sandhi expansion
    
    # Sandhi reversal rules: (surface_ending, possible_original_endings)
    # These are common consonant/vowel Sandhi transformations to reverse
    SANDHI_REVERSIONS = {
        # Consonant Sandhi (final consonant before vowel)
        'd': ['t', 'd'],      # vidyud -> vidyut
        'g': ['k', 'g'],      # vAg -> vAk
        'b': ['p', 'b'],      # ap -> ab (water)
        'D': ['T', 'D'],
        'j': ['c', 'j'],
        'z': ['s', 'z'],
        # Vowel Sandhi (vowel combinations)
        'A': ['a', 'A'],      # a+a -> A
        'I': ['i', 'I'],      # i+i -> I
        'U': ['u', 'U'],      # u+u -> U
        'e': ['a', 'i'],      # a+i -> e
        'o': ['a', 'u', 'aH'],  # a+u -> o; also aH + voiced -> o (visarga)
        'ai': ['a', 'e'],     # a+e -> ai
        'au': ['a', 'o'],     # a+o -> au
        # Consonant clusters
        'cC': ['t', 'c'],     # t+c -> cC
        'jj': ['d', 'j'],     # d+j -> jj
        'DD': ['D', 'D'],
        # Visarga Sandhi ('aH' -> 'o' is folded into the 'o' entry above;
        # a duplicate 'o' key here would silently overwrite the vowel rule)
        'ar': ['aH'],         # aH + r -> ar
    }
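    # Illustration of how _try_sandhi_reversal (below) applies this table:
    #   "vidyud" -> ["vidyud", "vidyut"]   (final 'd' may revert to 't')
    #   "buddhy" -> also yields "buddhi"   (glide 'y' restored to 'i'),
    #   plus digraph-normalized twins like "buDy"/"buDi".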
    
    def _try_sandhi_reversal(self, surface: str, min_stem_len: int = 3) -> List[str]:
        """

        Try to recover original stems from Sandhi-modified surface forms.

        Returns list of possible original forms, ordered by likelihood.

        """
        candidates = [surface]  # Original form is always a candidate
        
        # TRANSLITERATION NORMALIZATION (lowercase digraph → SLP1 single char)
        # This handles: bh→B, dh→D, gh→G, ph→P, th→T, kh→K, ch→C, jh→J
        TRANSLIT_MAP = [
            ('bh', 'B'), ('dh', 'D'), ('gh', 'G'), ('ph', 'P'),
            ('th', 'T'), ('kh', 'K'), ('ch', 'C'), ('jh', 'J'),
            ('Th', 'W'), ('Dh', 'Q'),  # Retroflex aspirates
        ]
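        # e.g. "buddha" normalizes to "budDa" (lowercase 'dh' -> SLP1 'D').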
        normalized = surface
        for digraph, single in TRANSLIT_MAP:
            normalized = normalized.replace(digraph, single)
        if normalized != surface:
            candidates.append(normalized)
        
        # Try consonant Sandhi at word boundary (last char)
        for form in [surface, normalized]:
            if len(form) >= min_stem_len and form[-1] in self.SANDHI_REVERSIONS:
                for original in self.SANDHI_REVERSIONS[form[-1]]:
                    candidate = form[:-1] + original
                    if candidate not in candidates:
                        candidates.append(candidate)
        
        # Try internal Sandhi (for compound-internal changes)
        # e.g., buddhy -> buddhi (y often represents elided i)
        for form in [surface, normalized]:
            if form.endswith('y') and len(form) >= min_stem_len:
                candidates.append(form[:-1] + 'i')  # Try y -> i
            if form.endswith('v') and len(form) >= min_stem_len:
                candidates.append(form[:-1] + 'u')  # Try v -> u
            
        # Remove duplicates while preserving order
        seen = set()
        unique = []
        for c in candidates:
            if c not in seen:
                seen.add(c)
                unique.append(c)
        
        return unique
    
    def _is_valid_stem(self, surface: str) -> bool:
        """

        Check if a surface form is a valid stem, trying:

        0. COMMON_WORDS protection

        1. Direct Kosha lookup

        2. Visarga/Anusvara base check (rAmaH → rAma)

        3. Sandhi reversal

        4. Pratyaya (suffix) stripping

        """
        if len(surface) < 2:
            return False
        
        # 0. Safety Check for Common Words (protect namaH, rAmo, etc.)
        if surface in self.COMMON_WORDS:
            return True
        
        # 1. Direct Kosha Check
        if self.analyzer._in_kosha(surface):
            return True
        
        # 2. Visarga/Anusvara Check (FIX for rAmaH validation)
        # If sandhi-restored "rAmo" → "rAmaH", accept it if base "rAma" is in kosha
        if surface.endswith('H') and len(surface) > 2:
            base = surface[:-1]
            if self.analyzer._in_kosha(base):
                return True
        if surface.endswith('M') and len(surface) > 2:
            base = surface[:-1]
            if self.analyzer._in_kosha(base):
                return True
        
        # 3. Try all Sandhi reversal candidates
        candidates = self._try_sandhi_reversal(surface)
        for candidate in candidates:
            if self.analyzer._in_kosha(candidate):
                return True
            # Also try vowel adjustments
            if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                return True
            if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
                return True
            if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
                return True
            # Recursive visarga check for candidates too
            if candidate.endswith('H') and len(candidate) > 2:
                if self.analyzer._in_kosha(candidate[:-1]):
                    return True
        
        # Try VIBHAKTI STRIPPING (nominal case endings)
        VIBHAKTI_ENDINGS = [
            'am', 'aH', 'ena', 'Aya', 'At', 'asya', 'e', 'AH',  # Masculine a-stem
            'An', 'EH', 'eBya', 'AnAm', 'ezu',                   # Masculine a-stem plural
            'au', 'OH', 'AByAm',                                  # Dual (AByAm = ābhyām)
            'aye',                                                 # i-stem dative (pataye, munaye)
            'ave',                                                 # u-stem dative (vizRave, gurave)
        ]
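        # Worked example of this stripping path (assuming "rAma" is in the
        # kosha): "rAmasya" ends in 'asya' -> stem "rAm" misses, but
        # "rAm" + 'a' = "rAma" hits, so the form validates.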
        for ending in sorted(VIBHAKTI_ENDINGS, key=len, reverse=True):
            if surface.endswith(ending) and len(surface) > len(ending) + 2:
                stem = surface[:-len(ending)]
                if self.analyzer._in_kosha(stem):
                    return True
                # Try with 'a' restoration (munipuMgavam → munipuMgava)
                if self.analyzer._in_kosha(stem + 'a'):
                    return True
                
                # SPECIAL CASE: 'aye' ending implies 'i' stem (pataye → pati)
                if ending == 'aye' and self.analyzer._in_kosha(stem + 'i'):
                    return True
                
                # SPECIAL CASE: 'ave' ending implies 'u' stem (gurave → guru)
                if ending == 'ave' and self.analyzer._in_kosha(stem + 'u'):
                    return True
        
        # Try PRATYAYA STRIPPING (grammatical suffix removal)
        # This is Panini's kRt/taddhita system - generalizes to ALL Sanskrit
        PRATYAYAS = [
            ('ana', 3),   # lyuT: action noun (karaNa from kR)
            ('Ana', 3),   # śānac: present participle
            ('tva', 3),   # tva: abstract noun (devatva from deva)
            ('tA', 2),    # tal: abstract noun (sundaratA)
            ('ya', 2),    # yat: fitness/gerundive
            ('ta', 2),    # kta: past participle
            ('ti', 2),    # ktin: action noun
            ('in', 2),    # ṇini: possessor
            ('ika', 3),   # ṭhak: related to
            ('Iya', 3),   # cha: related to
            # Feminine/agent kṛdanta suffixes (Fix 2)
            ('iRi', 3),   # iṇī: feminine agent (ākarṣiṇī)
            ('iRI', 3),   # iṇī: alt spelling
            ('inI', 3),   # inī: feminine possessor (yoginī)
            ('ikA', 3),   # ikā: feminine derivative (nāyikā)
            ('trI', 3),   # trī: feminine agent (kartrī)
        ]
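        # Example of suffix stripping (assuming "deva" is a kosha stem):
        # "devatva" ends in 'tva' with a long-enough root, so root "deva"
        # is looked up directly and validates the whole form.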
        
        for suffix, min_root in PRATYAYAS:
            if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
                root = surface[:-len(suffix)]
                # Try the root in Kosha
                if self.analyzer._in_kosha(root):
                    return True
                # Try with guṇa 'a' restoration
                if self.analyzer._in_kosha(root + 'a'):
                    return True
                # Try R→f transliteration (MW uses f for ṛ: kartRI → kartf)
                root_f = root.replace('R', 'f')
                if root_f != root and self.analyzer._in_kosha(root_f):
                    return True
                # Try Sandhi reversal on root
                for r in self._try_sandhi_reversal(root):
                    if self.analyzer._in_kosha(r):
                        return True
        
        return False
    
    def _count_kosha_heads(self, surface: str, min_head_len: int = 5) -> int:
        """

        FIX 2: Count how many valid kosha stems exist inside a long string.

        Used to detect mega-tokens that swallowed multiple stems.

        """
        if len(surface) < min_head_len * 2:
            return 1 if self._is_valid_stem(surface) else 0
        
        heads = 0
        i = 0
        while i < len(surface) - min_head_len + 1:
            # Try to find a valid stem starting at position i
            for j in range(min(len(surface), i + 15), i + min_head_len - 1, -1):
                candidate = surface[i:j]
                if len(candidate) >= min_head_len and self._is_valid_stem(candidate):
                    heads += 1
                    i = j  # Skip past this head
                    break
            else:
                i += 1
        return max(heads, 1 if self._is_valid_stem(surface) else 0)
    
    def _is_krdanta(self, surface: str) -> bool:
        """

        FIX 3: Recognize kṛdanta (verbal derivative) forms.

        These should be kept as units, not split further.

        

        Kṛdanta indicators:

        - Ends with participial suffix preceded by verbal root

        - The whole form is in kosha as a recognized derivative

        """
        KRDANTA_SUFFIXES = [
            ('mAna', 4),   # Present participle (ātmanepada)
            ('Ana', 3),    # Present participle 
            ('tavat', 5),  # Past active participle
            ('ta', 2),     # Past passive participle (kta)
            ('in', 2),     # Agent noun (ṇini)
            ('aka', 3),    # Agent noun (ṇvul)
            ('tR', 2),     # Agent noun (tṛc)
        ]
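        # Sketch: "kArin"-style agent forms end in 'in'; the root "kAr" is
        # then checked via sandhi reversal against the kosha. This is a
        # heuristic, not a full kṛt analysis.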
        
        for suffix, min_root in KRDANTA_SUFFIXES:
            if surface.endswith(suffix) and len(surface) > len(suffix) + min_root:
                root = surface[:-len(suffix)]
                # Check if root looks like a valid verbal root
                # Valid roots are usually in kosha
                for candidate in self._try_sandhi_reversal(root):
                    if self.analyzer._in_kosha(candidate):
                        return True
        return False
    
    def _recursive_split(self, word: str, memo: Optional[dict] = None) -> List[str]:
        """

        Recursively split a compound into maximal valid components.

        

        IMPROVED ALGORITHM with three fixes:

        1. FIX 1: Derivational spine continuation - keep collapsing if stem+suffix both valid

        2. FIX 2: Multi-head splitting - if token has multiple kosha heads, force split

        3. FIX 3: Kṛdanta recognition - keep participles as atomic units

        

        Uses memoization to avoid exponential blowup.

        """
        if memo is None:
            memo = {}
        
        if word in memo:
            return memo[word]
        
        # FIX 3: If it's a recognized kṛdanta, keep it atomic
        if self._is_krdanta(word) and self._is_valid_stem(word):
            memo[word] = [word]
            return [word]
        
        # FIX 2: Force split if token is long and contains multiple kosha heads
        MAX_TOKEN_LEN = 15  # Tokens longer than this that have multiple heads must split
        if len(word) > MAX_TOKEN_LEN:
            head_count = self._count_kosha_heads(word)
            if head_count > 1:
                # Don't return early - we MUST try to split this
                pass  # Continue to splitting logic
            else:
                # Single head or no heads - if valid, keep it
                if self._is_valid_stem(word):
                    memo[word] = [word]
                    return [word]
        else:
            # Base case: if word itself is valid AND not too long, return it
            if self._is_valid_stem(word):
                memo[word] = [word]
                return [word]
        
        # Base case: too short to split
        if len(word) < 4:
            memo[word] = [word]
            return [word]
        
        best_parse = [word]  # Default: no split
        best_score = -1000  # Start negative to ensure any valid split wins
        
        min_len = 3  # Minimum 3 chars to prevent rA, nA splits
        
        # Try all split points
        for i in range(min_len, len(word) - min_len + 1):
            left = word[:i]
            right = word[i:]
            
            # Check if left is valid (with Sandhi reversal)
            if self._is_valid_stem(left):
                # FIX 1: Derivational spine continuation
                # If left is a valid stem, check if left+next_suffix also forms a valid stem
                # This prevents over-splitting inside known words like bhAvanA
                spine_continued = False
                for ext_len in range(3, min(len(right) + 1, 8)):  # Try extending by 3-7 chars
                    extended = left + right[:ext_len]
                    if self._is_valid_stem(extended):
                        # The spine continues! Don't split here, try a longer left
                        spine_continued = True
                        break
                
                # Only split if spine doesn't continue OR if we're at a very long boundary
                if spine_continued and len(left) < 10:
                    continue  # Skip this split point, try longer
                
                # Recursively split the right side
                right_parse = self._recursive_split(right, memo)
                
                # Count valid components in this parse
                full_parse = [left] + right_parse
                valid_count = sum(1 for comp in full_parse if self._is_valid_stem(comp))
                
                # IMPROVED SCORING:
                # 1. Reward valid components heavily
                # 2. PENALIZE many components (prefer fewer, longer splits)
                # 3. PENALIZE short components (< 5 chars)
                # 4. REWARD if components are known kosha stems (not just valid via suffix)
                num_components = len(full_parse)
                avg_len = sum(len(c) for c in full_parse) / num_components
                short_penalty = sum(1 for c in full_parse if len(c) < 5)
                
                # Bonus for components that are DIRECTLY in kosha (not via suffix stripping)
                direct_kosha_bonus = sum(10 for c in full_parse 
                                         if self.analyzer._in_kosha(c) or 
                                         any(self.analyzer._in_kosha(x) for x in self._try_sandhi_reversal(c)))
                
                # Score formula: favor valid + long + few components + direct kosha
                score = (valid_count * 100  # Valid components matter most
                         - num_components * 15  # Penalize many splits (reduced from 20)
                         + avg_len * 5  # Reward longer components
                         - short_penalty * 40  # Penalize short fragments (reduced from 50)
                         + direct_kosha_bonus)  # Bonus for direct kosha stems
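                # Worked example (assuming "deva" and "datta" are both valid
                # stems and directly in the kosha): full_parse = ["deva", "datta"]
                # gives 2*100 - 2*15 + 4.5*5 - 1*40 + 20 = 172.5.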
                
                if score > best_score:
                    best_score = score
                    best_parse = full_parse
        
        memo[word] = best_parse
        return best_parse
    
    def _longest_left_split(self, word: str) -> Optional[Tuple[str, str]]:
        """

        Find the longest valid left stem greedily WITH SANDHI REVERSAL.

        

        For unknown prefixes, tries consonant/vowel Sandhi reversions:

        - vidyud -> vidyut (d -> t before vowel)

        - buddhy -> buddhi (y -> i for elided vowel)

        """
        min_len = 3  # Minimum valid stem length
        
        # Scan from longest left to shortest
        for i in range(len(word) - min_len, min_len - 1, -1):
            left = word[:i]
            right = word[i:]
            
            # Try ALL Sandhi reversal candidates for left
            left_valid = False
            left_candidates = self._try_sandhi_reversal(left)
            for candidate in left_candidates:
                if self.analyzer._in_kosha(candidate):
                    left_valid = True
                    break
                # Also try with vowel adjustments
                if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                    left_valid = True
                    break
                if candidate.endswith('I') and self.analyzer._in_kosha(candidate[:-1] + 'i'):
                    left_valid = True
                    break
                if candidate.endswith('U') and self.analyzer._in_kosha(candidate[:-1] + 'u'):
                    left_valid = True
                    break
            
            if left_valid and len(right) >= min_len:
                # Check if right is valid using Sandhi reversal
                right_valid = False
                right_candidates = self._try_sandhi_reversal(right)
                for candidate in right_candidates:
                    if self.analyzer._in_kosha(candidate):
                        right_valid = True
                        break
                    # Try with vowel adjustments
                    if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                        right_valid = True
                        break
                
                # Try lookahead on right (for compound remainders)
                if not right_valid:
                    for j in range(min_len, min(len(right), 15)):
                        prefix = right[:j]
                        # Try all Sandhi reversals on the prefix
                        prefix_candidates = self._try_sandhi_reversal(prefix)
                        for candidate in prefix_candidates:
                            if self.analyzer._in_kosha(candidate):
                                right_valid = True
                                break
                            if candidate.endswith('A') and self.analyzer._in_kosha(candidate[:-1] + 'a'):
                                right_valid = True
                                break
                        if right_valid:
                            break
                
                # Sandhi restoration: if left ended with long vowel, right may need prefix
                if not right_valid and left.endswith('A') and right[0] not in 'aAiIuUeEoO':
                    restored = 'A' + right
                    restored_candidates = self._try_sandhi_reversal(restored)
                    for candidate in restored_candidates:
                        if self.analyzer._in_kosha(candidate):
                            right_valid = True
                            break
                    if not right_valid:
                        for j in range(min_len, min(len(restored), 12)):
                            if self.analyzer._in_kosha(restored[:j]):
                                right_valid = True
                                break
                
                if right_valid:
                    return (left, right)
        
        return None
    
    def _find_split_candidates(self, word: str) -> List[int]:
        """Find potential split points based on stem cache validation."""
        candidates = []
        min_component = 2  # Minimum component length
        
        # Endings to strip when validating
        ENDINGS = ["M", "H", "aM", "am", "aH", "At", "ena", "Aya", "asya", 
                   "e", "O", "AnAm", "A", "I", "U", "AN", "An", "i"]
        
        for i in range(min_component, len(word) - min_component + 1):
            left = word[:i]
            right = word[i:]
            
            # Check left side (try as-is, then with vowel additions/normalization)
            left_valid = self.analyzer._in_kosha(left)
            if not left_valid:
                for suffix in ["a", "A", "i", "I", "u", "U"]:
                    if self.analyzer._in_kosha(left + suffix):
                        left_valid = True
                        break
            # Sandhi reversal: if left ends with long vowel, try normalizing
            if not left_valid and left.endswith('A'):
                if self.analyzer._in_kosha(left[:-1] + 'a'):
                    left_valid = True
            if not left_valid and left.endswith('I'):
                if self.analyzer._in_kosha(left[:-1] + 'i'):
                    left_valid = True
            if not left_valid and left.endswith('U'):
                if self.analyzer._in_kosha(left[:-1] + 'u'):
                    left_valid = True
            
            # Check right side (try as-is, strip endings, add vowels)
            right_valid = self.analyzer._in_kosha(right)
            if not right_valid:
                # Try stripping endings
                for ending in sorted(ENDINGS, key=len, reverse=True):
                    if right.endswith(ending) and len(right) > len(ending) + 1:
                        stripped = right[:-len(ending)]
                        if self.analyzer._in_kosha(stripped):
                            right_valid = True
                            break
                        # Also try with vowel additions
                        for suffix in ["a", "A"]:
                            if self.analyzer._in_kosha(stripped + suffix):
                                right_valid = True
                                break
                        if right_valid:
                            break
            
            if not right_valid:
                # Try vowel additions
                for suffix in ["a", "A", "i", "I"]:
                    if self.analyzer._in_kosha(right + suffix):
                        right_valid = True
                        break
            
            # Sandhi reversal for right side: if left ends with long vowel,
            # the vowel may have absorbed initial vowel of right.
            # Try restoring: AtmA|bhAsa -> check A+bhAsa = AbhAsa
            if not right_valid and len(right) > 2:
                # Check if left ends with long vowel that could have eaten something
                if left.endswith('A') and right[0] not in 'aAiIuUeEoO':
                    # Right starts with consonant - maybe initial A was eaten
                    restored = 'A' + right
                    if self.analyzer._in_kosha(restored):
                        right_valid = True
                    elif len(restored) > 3:
                        # Try lookahead on restored
                        for j in range(3, min(len(restored), 12)):
                            if self.analyzer._in_kosha(restored[:j]):
                                right_valid = True
                                break
                elif left.endswith('I') and right[0] not in 'aAiIuUeEoO':
                    restored = 'I' + right
                    if self.analyzer._in_kosha(restored):
                        right_valid = True
                elif left.endswith('U') and right[0] not in 'aAiIuUeEoO':
                    restored = 'U' + right
                    if self.analyzer._in_kosha(restored):
                        right_valid = True
            
            # Also check if right itself starts a sub-compound (Recursive Lookahead)
            if not right_valid and len(right) > 3:
                # Try to find ANY valid item at start of right
                # Check prefixes of length 3 to 12
                for j in range(3, min(len(right), 15)):
                    prefix = right[:j]
                    if self.analyzer._in_kosha(prefix):
                        right_valid = True
                        break
                    # Sandhi normalization: if prefix ends with long vowel, try short
                    # AtmA -> Atma, prAtI -> prAti, etc.
                    if prefix.endswith('A'):
                        normalized = prefix[:-1] + 'a'
                        if self.analyzer._in_kosha(normalized):
                            right_valid = True
                            break
                    elif prefix.endswith('I'):
                        normalized = prefix[:-1] + 'i'
                        if self.analyzer._in_kosha(normalized):
                            right_valid = True
                            break
                    elif prefix.endswith('U'):
                        normalized = prefix[:-1] + 'u'
                        if self.analyzer._in_kosha(normalized):
                            right_valid = True
                            break
                
                # If still not found, check known initials
                if not right_valid:
                    for initial in self.COMPOUND_INITIALS + list(self.COMPOUND_FINALS):
                        if right.startswith(initial) and len(initial) >= 2:
                            right_valid = True
                            break
            
            # DEBUG
            # if "sopAdhika" in word:
            #    print(f"Check {left} | {right} -> L:{left_valid} R:{right_valid}")

            if left_valid and right_valid:
                candidates.append(i)
        
        return candidates
    
    def _score_split(self, word: str, components: List[str]) -> float:
        """
        Standalone scoring helper (tuned variant of the nested scorer in
        split_v4). Takes the original word explicitly so the sandhi
        expansion penalty can be computed.
        """
        # Base: Squared length favors fewer, longer components
        score = sum(len(c)**2 for c in components)

        # --- PENALTIES ---
        for c in components:
            if len(c) < 4:
                if not self._is_valid_stem(c):
                    score -= 50
                else:
                    score -= 5

        if len(components) > 2:
            score -= (len(components) - 2) * 20

        # --- BONUSES ---

        # 1. VALIDITY BONUS (THE FIX)
        # Old value: 30. New value: 100.
        # This ensures that 164 (split score) + 200 (bonus) > 289 (garbage score)
        valid_count = sum(1 for c in components if self._is_valid_stem(c))
        score += valid_count * 100

        # 2. SURVIVAL BONUS (Protects rAmo, namaH)
        if len(components) == 1:
            if self._is_valid_stem(components[0]):
                score += 50

        # 3. Compound Pattern Bonus
        if len(components) >= 2:
            left = components[0]
            right = components[-1]

            if left in self.COMPOUND_INITIALS:
                score += 15

            # Check Right Final
            r_stem, _ = self.analyzer._extract_vibhakti(right)
            if r_stem in self.COMPOUND_FINALS:
                score += 25
            elif right in self.COMPOUND_FINALS:
                score += 25

            if abs(len(left) - len(right)) <= 1:
                score += 10

        # 4. Expansion penalty (RELAXED)
        # We removed the "elif expansion == 0: score += 20" trap.
        total_len = sum(len(c) for c in components)
        expansion = total_len - len(word)
        if expansion > 1:
            score -= (expansion - 1) * 25
        return score
    
    def split(self, word: str, max_components: int = 4) -> CompoundSplit:
        """

        Split a compound word into its components.

        

        Uses greedy algorithm with Kosha validation.

        Returns original word if no valid split found.

        """
        if len(word) < 4:
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )
        
        # Check if word itself is in Kosha (might not be compound)
        # KEY FIX: If word is already a known stem (lexicalized), DO NOT SPLIT
        # This protects 'paramAtma', 'kzetrajYa', 'sopAdhika' from being broken down
        if self.analyzer._in_kosha(word):
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )
        
        # Use RECURSIVE COMPOSITIONAL algorithm
        # Tries ALL split points, recursively parses right sides,
        # returns parse with MOST valid components
        components = self._recursive_split(word)
        
        if len(components) <= 1:
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )
        
        # Calculate split points from components
        split_points = []
        pos = 0
        for comp in components[:-1]:
            pos += len(comp)
            split_points.append(pos)
        
        return CompoundSplit(
            surface=word, components=components,
            split_points=split_points, is_compound=True,
            compound_type=None  # We don't classify samāsa types
        )
    
    def _split_dp(self, word: str, memo: Optional[dict] = None) -> List[List[str]]:
        """

        V4 Algorithm: Memoized Dynamic Programming with Sandhi Expansion.

        

        Returns all valid splits, cached by suffix.

        Handles coalescent sandhi (e=a+i, o=a+u, etc.) that V3 misses.

        """
        if memo is None:
            memo = {}
        
        if word in memo:
            return memo[word]
        
        # Base: too short to split
        if len(word) <= 2:
            if self._is_valid_stem(word):
                return [[word]]
            return []
        
        valid_splits = []

        # 1. OPTION A: The whole word is a stem (Lexicalized)
        if self._is_valid_stem(word):
            valid_splits.append([word])
            # DO NOT RETURN EARLY. Keep looking for splits!

        # 2. OPTION B: Split it (Generative Sandhi)
        # Try each split position with sandhi expansion
        for i in range(2, len(word) - 1):
            for left, right in self.sandhi_engine.generate_splits(word, i):
                if len(left) < 2 or len(right) < 2:
                    continue
                    
                if self._is_valid_stem(left):
                    # Recurse on right (memoized!)
                    right_splits = self._split_dp(right, memo)
                    for rs in right_splits:
                        valid_splits.append([left] + rs)
        
        memo[word] = valid_splits
        return valid_splits
    
    def split_v4(self, word: str) -> CompoundSplit:
        """

        V4 Split: Uses generative sandhi expansion for coalescent sandhi.

        

        Handles:

        - Vowel coalescence: gaṇeśa → gaṇa + īśa (e = a+i)

        - Visarga sandhi: punarjanma → punaH + janma

        - Vṛddhi: tavaiva → tava + eva

        """
        if len(word) < 4:
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )
        
        # Use V4 DP algorithm
        all_splits = self._split_dp(word)
        
        if not all_splits:
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )
        
        # SCORING STRATEGY:
        # Balance: prefer splits, but penalize over-fragmentation.
        # 1. Penalize short components (< 3 chars) heavily
        # 2. Prefer 2-component splits over 3+ components
        # 3. Single long tokens get moderate penalty
        # V4 Scoring with Compound Pattern Recognition
        def score_split(components):
            # Base: Squared length favors fewer, longer components
            score = sum(len(c)**2 for c in components)
            
            # --- PENALTIES ---
            # 1. Short junk penalty (unless it's a valid stem)
            for c in components:
                if len(c) < 4:
                    if not self._is_valid_stem(c):
                        score -= 50  # Garbage fragment
                    else:
                        score -= 5   # Valid but short (e.g. 'ISa'), slight penalty
            
            # 2. Fragmentation penalty
            if len(components) > 2:
                score -= (len(components) - 2) * 30  # Increased penalty
            
            # 3. 2-component bonus (optimal compound structure)
            if len(components) == 2:
                score += 25
            
            # --- BONUSES ---
            # 0. COMMON_WORDS Protection (namaH, rAmo should stay atomic)
            if len(components) == 1 and components[0] in self.COMMON_WORDS:
                score += 50  # Strong bonus to prevent splitting
            
            # 1. Validity Bonus (Crucial for pataye/rAmo)
            # Use _is_valid_stem so declined words get credit
            valid_count = sum(1 for c in components if self._is_valid_stem(c))
            score += valid_count * 30
            
            # 2. Compound Pattern Bonus (The Fix for gaRapataye)
            if len(components) >= 2:
                left = components[0]
                right = components[-1]
                
                # Check Left against Initials
                if left in self.COMPOUND_INITIALS:
                    score += 15
                
                # Check Right against Finals
                # Need to extract stem to match (pataye -> pati)
                for final in self.COMPOUND_FINALS:
                    if right.startswith(final) or right == final:
                        score += 25  # High bonus for matching pattern like 'pati'
                        break
                    # Try stripping vibhakti
                    if right.endswith('aye') and right[:-3] + 'i' == final:
                        score += 25
                        break
                    if right.endswith('ave') and right[:-3] + 'u' == final:
                        score += 25
                        break
                
                # Balance bonus
                if abs(len(left) - len(right)) <= 1:
                    score += 10
            
            # 4. Expansion penalty (sandhi artifacts add characters)
            # Allow 1 char expansion for sandhi (e → a+I), only penalize 2+ extra chars
            total_len = sum(len(c) for c in components)
            expansion = total_len - len(word)
            if expansion > 1:
                score -= (expansion - 1) * 25  # Stronger penalty
            elif expansion == 0:
                score += 20  # Bonus for exact-length splits (no sandhi artifact)

            return score

        best_split = max(all_splits, key=score_split)
        
        if len(best_split) <= 1:
            return CompoundSplit(
                surface=word, components=[word],
                split_points=[], is_compound=False, compound_type=None
            )
        
        return CompoundSplit(
            surface=word, components=best_split,
            split_points=[], is_compound=True, compound_type=None
        )
    
    def split_multiple(self, words: List[str]) -> List[CompoundSplit]:
        """Split multiple words."""
        return [self.split(w) for w in words]


# --- TEST ---
if __name__ == "__main__":
    print("Testing SamasaSplitter...")
    splitter = SamasaSplitter()
    
    test_compounds = [
        "hfdpadma",
        "paramAtma", 
        "mahArAja",
        "devadatta",
        "rAjakumAra",
        "sopAdhika",
    ]
    
    for word in test_compounds:
        result = splitter.split(word)
        if result.is_compound:
            print(f"  {word:20}{' + '.join(result.components)}")
        else:
            print(f"  {word:20} → (not split)")