File size: 4,193 Bytes
80ff7e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""Replace each occurrence of a word with N consecutive slots, each
carrying its own phoneme, duration share, and a copy of the note_pitch.

Goal: force the model to articulate at word boundaries by putting a
plosive (K, P, T) at the end of slot A and the start of slot B.

Example: "broken" → [("brok", "en_B-R-OW1-K"), ("ken", "en_K-AH0-N")]

Usage:
    python split_word.py --in IN.json --out OUT.json --old broken \
        --pieces 'brok:en_B-R-OW1-K' 'ken:en_K-AH0-N' \
        [--total-duration 0.50]
"""
import argparse
import json


def split_seg(seg: dict, old: str, pieces: list[tuple[str, str]],
              total_duration: float | None = None) -> dict:
    text = seg['text'].split()
    phon = seg['phoneme'].split()
    durs = [float(x) for x in seg['duration'].split()]
    pitches = seg['note_pitch'].split()
    types = seg['note_type'].split()
    n = len(text)
    assert len(phon) == n == len(durs) == len(pitches) == len(types)

    new_text, new_phon, new_durs, new_pitches, new_types = [], [], [], [], []
    n_split = 0

    for i in range(n):
        if text[i].lower() == old.lower():
            # Determine the duration to redistribute. Either fixed (--total-duration)
            # or original duration of this slot, possibly augmented by stealing from
            # the *next* <SP> rest.
            target_dur = total_duration or durs[i]
            steal_idx = None
            if total_duration is not None and total_duration > durs[i]:
                want = total_duration - durs[i]
                if i + 1 < n and text[i + 1] == '<SP>' and durs[i + 1] > want + 0.05:
                    steal_idx = i + 1
                    durs[i + 1] -= want
                    print(f"  slot {i}: stole {want:.2f}s from <SP> rest at {i+1}")
                else:
                    print(f"  WARN slot {i}: cannot reach {total_duration:.2f}s (no slack)")
                    target_dur = durs[i]
            per_piece = target_dur / len(pieces)
            for j, (piece_text, piece_phon) in enumerate(pieces):
                new_text.append(piece_text)
                new_phon.append(piece_phon)
                new_durs.append(per_piece)
                new_pitches.append(pitches[i])  # same pitch for all pieces
                # Articulation hints: 1=onset on first piece, 3=end on last, 2=mid sustain
                if len(pieces) == 1:
                    new_types.append(types[i])
                elif j == 0:
                    new_types.append('1')
                elif j == len(pieces) - 1:
                    new_types.append('3')
                else:
                    new_types.append('2')
            n_split += 1
        else:
            new_text.append(text[i])
            new_phon.append(phon[i])
            new_durs.append(durs[i])
            new_pitches.append(pitches[i])
            new_types.append(types[i])

    print(f"  split {n_split} occurrence(s) of '{old}' into {len(pieces)} pieces")
    print(f"  new slot count: {len(new_text)} (was {n})")

    out = dict(seg)
    out['text'] = ' '.join(new_text)
    out['phoneme'] = ' '.join(new_phon)
    out['duration'] = ' '.join(f"{d:.2f}" for d in new_durs)
    out['note_pitch'] = ' '.join(new_pitches)
    out['note_type'] = ' '.join(new_types)
    return out


def main():
    """Command-line entry point: split a word in every segment of a JSON file.

    Reads a JSON list of segment dicts from ``--in``, applies ``split_seg``
    to each one with the word given by ``--old`` and the ``text:phoneme``
    pieces given by ``--pieces``, and writes the resulting list to ``--out``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--in', dest='inp', required=True)
    ap.add_argument('--out', dest='out', required=True)
    ap.add_argument('--old', required=True)
    ap.add_argument('--pieces', nargs='+', required=True,
                    help="Each piece as 'text:phoneme', e.g. 'brok:en_B-R-OW1-K'")
    ap.add_argument('--total-duration', type=float, default=None,
                    help="Force total slot duration (steals from next <SP> rest)")
    args = ap.parse_args()

    pieces = []
    for spec in args.pieces:
        text, sep, phon = spec.partition(':')
        # An empty text or phoneme would vanish when the space-joined field
        # is split again, leaving the parallel fields misaligned.
        if not (sep and text and phon):
            ap.error(f"invalid --pieces value {spec!r}: expected 'text:phoneme'")
        pieces.append((text, phon))

    # Context managers close both files deterministically; explicit UTF-8
    # keeps ensure_ascii=False output independent of the platform locale.
    with open(args.inp, encoding='utf-8') as src:
        data = json.load(src)
    edited = [split_seg(s, args.old, pieces, args.total_duration) for s in data]
    with open(args.out, 'w', encoding='utf-8') as dst:
        json.dump(edited, dst, ensure_ascii=False, indent=2)
    print(f"\nWrote {args.out}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()