alexwengg commited on
Commit
10d29de
·
verified ·
1 Parent(s): 1d7ab20

Upload 40 files

Browse files
Files changed (40) hide show
  1. BatchingModelConvert/.DS_Store +0 -0
  2. BatchingModelConvert/conformer_batch.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  3. BatchingModelConvert/conformer_batch.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  4. BatchingModelConvert/conformer_batch.mlpackage/Manifest.json +18 -0
  5. BatchingModelConvert/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  6. BatchingModelConvert/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  7. BatchingModelConvert/decoder.mlpackage/Manifest.json +18 -0
  8. BatchingModelConvert/joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  9. BatchingModelConvert/joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  10. BatchingModelConvert/joint_decision.mlpackage/Manifest.json +18 -0
  11. BatchingModelConvert/metadata.json +23 -0
  12. BatchingModelConvert/pre_encode.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  13. BatchingModelConvert/pre_encode.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  14. BatchingModelConvert/pre_encode.mlpackage/Manifest.json +18 -0
  15. BatchingModelConvert/preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  16. BatchingModelConvert/preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  17. BatchingModelConvert/preprocessor.mlpackage/Manifest.json +18 -0
  18. BatchingModelConvert/vocab.json +1028 -0
  19. Conversion/convert_parakeet_eou.py +722 -0
  20. Conversion/convert_split_encoder.py +698 -0
  21. Conversion/individual_components.py +250 -0
  22. Inference/debug_nemo_streaming.py +218 -0
  23. Inference/print_config.py +21 -0
  24. Inference/test_full_pytorch_streaming.py +276 -0
  25. README.md +60 -119
  26. StreamingModelConvert/.DS_Store +0 -0
  27. StreamingModelConvert/metadata.json +35 -0
  28. StreamingModelConvert/parakeet_eou_streaming_decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  29. StreamingModelConvert/parakeet_eou_streaming_decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  30. StreamingModelConvert/parakeet_eou_streaming_decoder.mlpackage/Manifest.json +18 -0
  31. StreamingModelConvert/parakeet_eou_streaming_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  32. StreamingModelConvert/parakeet_eou_streaming_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  33. StreamingModelConvert/parakeet_eou_streaming_encoder.mlpackage/Manifest.json +18 -0
  34. StreamingModelConvert/parakeet_eou_streaming_joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  35. StreamingModelConvert/parakeet_eou_streaming_joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  36. StreamingModelConvert/parakeet_eou_streaming_joint_decision.mlpackage/Manifest.json +18 -0
  37. StreamingModelConvert/parakeet_eou_streaming_preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  38. StreamingModelConvert/parakeet_eou_streaming_preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  39. StreamingModelConvert/parakeet_eou_streaming_preprocessor.mlpackage/Manifest.json +18 -0
  40. StreamingModelConvert/vocab.json +1028 -0
BatchingModelConvert/.DS_Store ADDED
Binary file (8.2 kB). View file
 
BatchingModelConvert/conformer_batch.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5401568336b33cd5a07ec4d58b48a21b47136007cd2d435359fe0b8d89d4b8b8
3
+ size 406220
BatchingModelConvert/conformer_batch.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:821d028626949ed20d5c9193909c2b58275f86781d48a7cff84e41ade5b39481
3
+ size 206005056
BatchingModelConvert/conformer_batch.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "3423F522-20D5-4F17-9BB9-B576C03768EC": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "93F5F1E1-D925-43D2-A60A-9DD6CAE60345": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "3423F522-20D5-4F17-9BB9-B576C03768EC"
18
+ }
BatchingModelConvert/decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ae627522d54d773e50aa45fa50cfb8056f6da2a2322a071cd284cc43a4376c7
3
+ size 7265
BatchingModelConvert/decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e7357ba615c3fcca04d8dcb56e9e58a675831af57b39d6175a9dd5c6dcfcb5c
3
+ size 7874944
BatchingModelConvert/decoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "2CF76130-BF8D-480A-986A-85328EB3ECC8": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "E161022C-A73B-4427-89F5-390A26D62C0B": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "E161022C-A73B-4427-89F5-390A26D62C0B"
18
+ }
BatchingModelConvert/joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b22c7d55453c573aaca7576b4cdd9f178265a5a0253a77e3e855244d83df0e5f
3
+ size 8659
BatchingModelConvert/joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
3
+ size 2794182
BatchingModelConvert/joint_decision.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "4849FC14-7F7E-4B92-BCA2-D45FA9790109": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "F412A0B2-8BCE-4597-93CA-73097B855A6E": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "F412A0B2-8BCE-4597-93CA-73097B855A6E"
18
+ }
BatchingModelConvert/metadata.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "nvidia/parakeet_realtime_eou_120m-v1",
3
+ "model_name": "parakeet_realtime_eou_120m-v1-split",
4
+ "streaming_mode": "split_encoder",
5
+ "sample_rate": 16000,
6
+ "mel_dim": 128,
7
+ "hidden_dim": 512,
8
+ "num_layers": 17,
9
+ "mel_frames_per_chunk": 45,
10
+ "vocab_size": 1026,
11
+ "blank_id": 1026,
12
+ "decoder_hidden": 640,
13
+ "decoder_layers": 1,
14
+ "cache_channel_size": 70,
15
+ "cache_time_size": 8,
16
+ "components": {
17
+ "preprocessor": "preprocessor.mlpackage",
18
+ "pre_encode": "pre_encode.mlpackage",
19
+ "conformer": "conformer_batch.mlpackage",
20
+ "decoder": "decoder.mlpackage",
21
+ "joint_decision": "joint_decision.mlpackage"
22
+ }
23
+ }
BatchingModelConvert/pre_encode.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5500eefe496f94bbc1a359d90b432b87bfd20b96e7fe185e1a007b2630a0a1cb
3
+ size 12168
BatchingModelConvert/pre_encode.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:842054a63f229efc5c0bd938c05a044631797a2b856ad1aef27aba0db3177d0e
3
+ size 9472832
BatchingModelConvert/pre_encode.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "5D69C154-6F7F-4494-B043-A650C98A354E": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "90B9EACA-EC76-462E-9532-AB46C9C50373": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "5D69C154-6F7F-4494-B043-A650C98A354E"
18
+ }
BatchingModelConvert/preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b438308779dd446e070e320fb35f4b23fe559f7300364864da8f04f5e13322c8
3
+ size 13747
BatchingModelConvert/preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f257ad1ac11575d73a6ffda555319b2c96b0a224f0dc03ddd8c62950e9b18e53
3
+ size 592384
BatchingModelConvert/preprocessor.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "2FCDB141-775C-4A2A-9F4B-8B59C09CDD0D": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "A29B736B-4AEF-4817-8DC4-4CF66B11BF8C": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "A29B736B-4AEF-4817-8DC4-4CF66B11BF8C"
18
+ }
BatchingModelConvert/vocab.json ADDED
@@ -0,0 +1,1028 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0": "<unk>",
3
+ "1": "▁t",
4
+ "2": "▁th",
5
+ "3": "▁a",
6
+ "4": "▁i",
7
+ "5": "▁the",
8
+ "6": "▁s",
9
+ "7": "re",
10
+ "8": "▁w",
11
+ "9": "▁o",
12
+ "10": "in",
13
+ "11": "at",
14
+ "12": "er",
15
+ "13": "nd",
16
+ "14": "ou",
17
+ "15": "▁c",
18
+ "16": "▁b",
19
+ "17": "▁h",
20
+ "18": "en",
21
+ "19": "on",
22
+ "20": "▁m",
23
+ "21": "▁f",
24
+ "22": "ing",
25
+ "23": "▁p",
26
+ "24": "▁to",
27
+ "25": "▁and",
28
+ "26": "▁d",
29
+ "27": "an",
30
+ "28": "or",
31
+ "29": "es",
32
+ "30": "▁y",
33
+ "31": "▁l",
34
+ "32": "▁of",
35
+ "33": "ll",
36
+ "34": "▁in",
37
+ "35": "ed",
38
+ "36": "it",
39
+ "37": "▁g",
40
+ "38": "is",
41
+ "39": "▁you",
42
+ "40": "▁n",
43
+ "41": "ar",
44
+ "42": "om",
45
+ "43": "as",
46
+ "44": "ve",
47
+ "45": "▁e",
48
+ "46": "ic",
49
+ "47": "▁it",
50
+ "48": "al",
51
+ "49": "us",
52
+ "50": "▁wh",
53
+ "51": "▁we",
54
+ "52": "▁be",
55
+ "53": "ion",
56
+ "54": "ow",
57
+ "55": "le",
58
+ "56": "▁is",
59
+ "57": "et",
60
+ "58": "ent",
61
+ "59": "ot",
62
+ "60": "ut",
63
+ "61": "▁re",
64
+ "62": "▁on",
65
+ "63": "ay",
66
+ "64": "▁ha",
67
+ "65": "ig",
68
+ "66": "▁so",
69
+ "67": "ct",
70
+ "68": "▁he",
71
+ "69": "▁for",
72
+ "70": "ver",
73
+ "71": "ke",
74
+ "72": "ro",
75
+ "73": "▁st",
76
+ "74": "id",
77
+ "75": "▁go",
78
+ "76": "all",
79
+ "77": "se",
80
+ "78": "ly",
81
+ "79": "▁u",
82
+ "80": "ch",
83
+ "81": "st",
84
+ "82": "ld",
85
+ "83": "▁k",
86
+ "84": "ce",
87
+ "85": "ur",
88
+ "86": "▁li",
89
+ "87": "am",
90
+ "88": "▁r",
91
+ "89": "ht",
92
+ "90": "▁j",
93
+ "91": "ith",
94
+ "92": "▁se",
95
+ "93": "ir",
96
+ "94": "▁as",
97
+ "95": "▁an",
98
+ "96": "im",
99
+ "97": "▁do",
100
+ "98": "ad",
101
+ "99": "▁was",
102
+ "100": "ight",
103
+ "101": "th",
104
+ "102": "▁are",
105
+ "103": "▁but",
106
+ "104": "▁sh",
107
+ "105": "ust",
108
+ "106": "ally",
109
+ "107": "▁not",
110
+ "108": "▁or",
111
+ "109": "▁com",
112
+ "110": "▁can",
113
+ "111": "▁me",
114
+ "112": "op",
115
+ "113": "▁mo",
116
+ "114": "▁at",
117
+ "115": "ill",
118
+ "116": "▁ch",
119
+ "117": "▁ne",
120
+ "118": "ant",
121
+ "119": "▁de",
122
+ "120": "▁kn",
123
+ "121": "▁one",
124
+ "122": "il",
125
+ "123": "ol",
126
+ "124": "▁con",
127
+ "125": "ter",
128
+ "126": "▁ab",
129
+ "127": "▁fr",
130
+ "128": "ere",
131
+ "129": "ck",
132
+ "130": "▁al",
133
+ "131": "▁all",
134
+ "132": "qu",
135
+ "133": "▁pro",
136
+ "134": "▁som",
137
+ "135": "ould",
138
+ "136": "▁tw",
139
+ "137": "ul",
140
+ "138": "ra",
141
+ "139": "od",
142
+ "140": "ers",
143
+ "141": "▁su",
144
+ "142": "ive",
145
+ "143": "▁v",
146
+ "144": "use",
147
+ "145": "ate",
148
+ "146": "ge",
149
+ "147": "if",
150
+ "148": "▁ex",
151
+ "149": "ess",
152
+ "150": "pp",
153
+ "151": "▁lo",
154
+ "152": "out",
155
+ "153": "▁if",
156
+ "154": "est",
157
+ "155": "ain",
158
+ "156": "ist",
159
+ "157": "and",
160
+ "158": "ea",
161
+ "159": "very",
162
+ "160": "art",
163
+ "161": "▁wor",
164
+ "162": "▁my",
165
+ "163": "ab",
166
+ "164": "ment",
167
+ "165": "▁bec",
168
+ "166": "un",
169
+ "167": "ity",
170
+ "168": "ri",
171
+ "169": "pe",
172
+ "170": "ions",
173
+ "171": "▁by",
174
+ "172": "ok",
175
+ "173": "our",
176
+ "174": "ort",
177
+ "175": "ind",
178
+ "176": "ink",
179
+ "177": "nt",
180
+ "178": "▁up",
181
+ "179": "um",
182
+ "180": "▁don",
183
+ "181": "▁get",
184
+ "182": "red",
185
+ "183": "▁out",
186
+ "184": "el",
187
+ "185": "ause",
188
+ "186": "res",
189
+ "187": "▁ma",
190
+ "188": "ich",
191
+ "189": "▁us",
192
+ "190": "rou",
193
+ "191": "▁int",
194
+ "192": "em",
195
+ "193": "os",
196
+ "194": "ies",
197
+ "195": "ie",
198
+ "196": "▁pl",
199
+ "197": "▁tr",
200
+ "198": "ven",
201
+ "199": "ous",
202
+ "200": "▁le",
203
+ "201": "▁two",
204
+ "202": "ard",
205
+ "203": "ine",
206
+ "204": "▁co",
207
+ "205": "een",
208
+ "206": "▁now",
209
+ "207": "ty",
210
+ "208": "her",
211
+ "209": "ack",
212
+ "210": "▁pe",
213
+ "211": "ame",
214
+ "212": "▁how",
215
+ "213": "▁who",
216
+ "214": "▁see",
217
+ "215": "▁tim",
218
+ "216": "ect",
219
+ "217": "ast",
220
+ "218": "▁our",
221
+ "219": "ci",
222
+ "220": "ree",
223
+ "221": "ople",
224
+ "222": "gh",
225
+ "223": "▁no",
226
+ "224": "▁had",
227
+ "225": "▁man",
228
+ "226": "▁qu",
229
+ "227": "▁en",
230
+ "228": "ide",
231
+ "229": "ure",
232
+ "230": "ud",
233
+ "231": "so",
234
+ "232": "▁his",
235
+ "233": "▁sa",
236
+ "234": "▁sp",
237
+ "235": "▁say",
238
+ "236": "ose",
239
+ "237": "ther",
240
+ "238": "▁act",
241
+ "239": "▁ta",
242
+ "240": "▁cl",
243
+ "241": "ings",
244
+ "242": "pt",
245
+ "243": "king",
246
+ "244": "▁any",
247
+ "245": "▁has",
248
+ "246": "▁un",
249
+ "247": "iv",
250
+ "248": "▁im",
251
+ "249": "▁ag",
252
+ "250": "▁te",
253
+ "251": "▁fe",
254
+ "252": "one",
255
+ "253": "per",
256
+ "254": "ong",
257
+ "255": "▁po",
258
+ "256": "▁ad",
259
+ "257": "ff",
260
+ "258": "ore",
261
+ "259": "itt",
262
+ "260": "ans",
263
+ "261": "iz",
264
+ "262": "eah",
265
+ "263": "reat",
266
+ "264": "act",
267
+ "265": "own",
268
+ "266": "hing",
269
+ "267": "enty",
270
+ "268": "age",
271
+ "269": "ber",
272
+ "270": "ice",
273
+ "271": "▁am",
274
+ "272": "ple",
275
+ "273": "are",
276
+ "274": "▁per",
277
+ "275": "und",
278
+ "276": "ite",
279
+ "277": "ix",
280
+ "278": "pl",
281
+ "279": "▁way",
282
+ "280": "▁did",
283
+ "281": "▁pr",
284
+ "282": "▁got",
285
+ "283": "ars",
286
+ "284": "▁she",
287
+ "285": "▁let",
288
+ "286": "ag",
289
+ "287": "▁ac",
290
+ "288": "int",
291
+ "289": "▁ar",
292
+ "290": "ry",
293
+ "291": "ign",
294
+ "292": "ish",
295
+ "293": "▁fir",
296
+ "294": "ace",
297
+ "295": "ble",
298
+ "296": "og",
299
+ "297": "ue",
300
+ "298": "▁ye",
301
+ "299": "ap",
302
+ "300": "iff",
303
+ "301": "▁ro",
304
+ "302": "▁her",
305
+ "303": "nder",
306
+ "304": "▁ok",
307
+ "305": "▁res",
308
+ "306": "▁gu",
309
+ "307": "ence",
310
+ "308": "▁may",
311
+ "309": "ated",
312
+ "310": "ip",
313
+ "311": "▁bo",
314
+ "312": "▁him",
315
+ "313": "way",
316
+ "314": "ac",
317
+ "315": "ical",
318
+ "316": "ass",
319
+ "317": "ase",
320
+ "318": "▁dis",
321
+ "319": "able",
322
+ "320": "ick",
323
+ "321": "▁app",
324
+ "322": "ance",
325
+ "323": "▁pre",
326
+ "324": "▁six",
327
+ "325": "▁off",
328
+ "326": "▁new",
329
+ "327": "ia",
330
+ "328": "orm",
331
+ "329": "ank",
332
+ "330": "▁lot",
333
+ "331": "ach",
334
+ "332": "▁fo",
335
+ "333": "inet",
336
+ "334": "ire",
337
+ "335": "ary",
338
+ "336": "ult",
339
+ "337": "▁tal",
340
+ "338": "▁mu",
341
+ "339": "▁bl",
342
+ "340": "ount",
343
+ "341": "sel",
344
+ "342": "vel",
345
+ "343": "▁br",
346
+ "344": "▁imp",
347
+ "345": "ep",
348
+ "346": "cess",
349
+ "347": "ord",
350
+ "348": "▁sc",
351
+ "349": "▁inc",
352
+ "350": "ound",
353
+ "351": "ang",
354
+ "352": "be",
355
+ "353": "ress",
356
+ "354": "uct",
357
+ "355": "▁ind",
358
+ "356": "▁af",
359
+ "357": "ving",
360
+ "358": "▁oh",
361
+ "359": "▁bet",
362
+ "360": "▁use",
363
+ "361": "ome",
364
+ "362": "ens",
365
+ "363": "ys",
366
+ "364": "▁bu",
367
+ "365": "co",
368
+ "366": "ory",
369
+ "367": "ater",
370
+ "368": "ild",
371
+ "369": "ght",
372
+ "370": "ial",
373
+ "371": "▁day",
374
+ "372": "ning",
375
+ "373": "na",
376
+ "374": "ile",
377
+ "375": "▁spe",
378
+ "376": "▁mar",
379
+ "377": "ody",
380
+ "378": "ough",
381
+ "379": "ade",
382
+ "380": "vers",
383
+ "381": "xt",
384
+ "382": "▁fl",
385
+ "383": "▁ke",
386
+ "384": "ian",
387
+ "385": "▁sy",
388
+ "386": "▁put",
389
+ "387": "fore",
390
+ "388": "ub",
391
+ "389": "▁ph",
392
+ "390": "fe",
393
+ "391": "▁em",
394
+ "392": "▁ser",
395
+ "393": "form",
396
+ "394": "ting",
397
+ "395": "te",
398
+ "396": "av",
399
+ "397": "ious",
400
+ "398": "▁rec",
401
+ "399": "ks",
402
+ "400": "▁gr",
403
+ "401": "ces",
404
+ "402": "wn",
405
+ "403": "ors",
406
+ "404": "▁jo",
407
+ "405": "ents",
408
+ "406": "▁des",
409
+ "407": "▁try",
410
+ "408": "▁equ",
411
+ "409": "▁z",
412
+ "410": "▁rem",
413
+ "411": "▁str",
414
+ "412": "self",
415
+ "413": "▁bit",
416
+ "414": "ph",
417
+ "415": "ved",
418
+ "416": "▁why",
419
+ "417": "▁bas",
420
+ "418": "▁hel",
421
+ "419": "▁rel",
422
+ "420": "ath",
423
+ "421": "ject",
424
+ "422": "ail",
425
+ "423": "▁la",
426
+ "424": "ual",
427
+ "425": "▁god",
428
+ "426": "▁nat",
429
+ "427": "erm",
430
+ "428": "day",
431
+ "429": "▁id",
432
+ "430": "ft",
433
+ "431": "▁wr",
434
+ "432": "▁min",
435
+ "433": "ates",
436
+ "434": "▁gen",
437
+ "435": "tain",
438
+ "436": "▁ob",
439
+ "437": "ull",
440
+ "438": "ict",
441
+ "439": "▁tra",
442
+ "440": "▁end",
443
+ "441": "▁hig",
444
+ "442": "▁fif",
445
+ "443": "oth",
446
+ "444": "tern",
447
+ "445": "▁its",
448
+ "446": "vent",
449
+ "447": "▁sm",
450
+ "448": "ons",
451
+ "449": "▁add",
452
+ "450": "iss",
453
+ "451": "▁bel",
454
+ "452": "ful",
455
+ "453": "get",
456
+ "454": "▁ele",
457
+ "455": "▁rep",
458
+ "456": "ak",
459
+ "457": "▁ho",
460
+ "458": "▁pos",
461
+ "459": "▁num",
462
+ "460": "ange",
463
+ "461": "ves",
464
+ "462": "ific",
465
+ "463": "urn",
466
+ "464": "ise",
467
+ "465": "▁cr",
468
+ "466": "▁um",
469
+ "467": "ward",
470
+ "468": "▁reg",
471
+ "469": "ady",
472
+ "470": "ower",
473
+ "471": "uc",
474
+ "472": "▁dec",
475
+ "473": "lic",
476
+ "474": "▁set",
477
+ "475": "▁gon",
478
+ "476": "▁op",
479
+ "477": "▁ear",
480
+ "478": "▁sub",
481
+ "479": "▁sl",
482
+ "480": "les",
483
+ "481": "stem",
484
+ "482": "cial",
485
+ "483": "olog",
486
+ "484": "atch",
487
+ "485": "ily",
488
+ "486": "body",
489
+ "487": "nds",
490
+ "488": "ular",
491
+ "489": "ren",
492
+ "490": "▁own",
493
+ "491": "▁too",
494
+ "492": "cent",
495
+ "493": "ible",
496
+ "494": "pect",
497
+ "495": "ered",
498
+ "496": "ways",
499
+ "497": "teen",
500
+ "498": "▁uh",
501
+ "499": "▁big",
502
+ "500": "▁mod",
503
+ "501": "▁att",
504
+ "502": "▁car",
505
+ "503": "gr",
506
+ "504": "▁acc",
507
+ "505": "ied",
508
+ "506": "mun",
509
+ "507": "ib",
510
+ "508": "▁mon",
511
+ "509": "▁sch",
512
+ "510": "▁pol",
513
+ "511": "▁dat",
514
+ "512": "▁fin",
515
+ "513": "▁sim",
516
+ "514": "▁inv",
517
+ "515": "▁def",
518
+ "516": "ked",
519
+ "517": "▁ent",
520
+ "518": "▁yes",
521
+ "519": "ows",
522
+ "520": "ics",
523
+ "521": "ited",
524
+ "522": "ute",
525
+ "523": "ism",
526
+ "524": "ps",
527
+ "525": "▁ed",
528
+ "526": "▁el",
529
+ "527": "ably",
530
+ "528": "ppen",
531
+ "529": "als",
532
+ "530": "▁ten",
533
+ "531": "ract",
534
+ "532": "ss",
535
+ "533": "▁ass",
536
+ "534": "▁met",
537
+ "535": "gan",
538
+ "536": "▁eng",
539
+ "537": "▁stu",
540
+ "538": "ween",
541
+ "539": "arch",
542
+ "540": "▁gl",
543
+ "541": "▁cor",
544
+ "542": "▁dr",
545
+ "543": "vern",
546
+ "544": "▁ty",
547
+ "545": "▁run",
548
+ "546": "hip",
549
+ "547": "cus",
550
+ "548": "cond",
551
+ "549": "▁ins",
552
+ "550": "irty",
553
+ "551": "▁pub",
554
+ "552": "lud",
555
+ "553": "llow",
556
+ "554": "▁cou",
557
+ "555": "ew",
558
+ "556": "iew",
559
+ "557": "▁sur",
560
+ "558": "ero",
561
+ "559": "ood",
562
+ "560": "ness",
563
+ "561": "▁fun",
564
+ "562": "▁eff",
565
+ "563": "cept",
566
+ "564": "▁ca",
567
+ "565": "▁exp",
568
+ "566": "duct",
569
+ "567": "▁sw",
570
+ "568": "ize",
571
+ "569": "ope",
572
+ "570": "▁par",
573
+ "571": "kes",
574
+ "572": "cy",
575
+ "573": "▁ev",
576
+ "574": "▁ref",
577
+ "575": "ell",
578
+ "576": "▁bus",
579
+ "577": "ug",
580
+ "578": "rib",
581
+ "579": "▁cur",
582
+ "580": "mo",
583
+ "581": "ock",
584
+ "582": "ures",
585
+ "583": "air",
586
+ "584": "▁war",
587
+ "585": "str",
588
+ "586": "▁med",
589
+ "587": "▁wa",
590
+ "588": "▁val",
591
+ "589": "▁sin",
592
+ "590": "blem",
593
+ "591": "▁fam",
594
+ "592": "li",
595
+ "593": "▁far",
596
+ "594": "▁cle",
597
+ "595": "▁col",
598
+ "596": "mon",
599
+ "597": "▁gra",
600
+ "598": "led",
601
+ "599": "ense",
602
+ "600": "tin",
603
+ "601": "ues",
604
+ "602": "its",
605
+ "603": "▁mem",
606
+ "604": "▁inf",
607
+ "605": "▁eas",
608
+ "606": "ideo",
609
+ "607": "▁top",
610
+ "608": "io",
611
+ "609": "pan",
612
+ "610": "▁hum",
613
+ "611": "▁old",
614
+ "612": "ead",
615
+ "613": "▁ord",
616
+ "614": "ric",
617
+ "615": "ants",
618
+ "616": "oy",
619
+ "617": "esn",
620
+ "618": "uck",
621
+ "619": "ason",
622
+ "620": "ced",
623
+ "621": "ool",
624
+ "622": "rat",
625
+ "623": "ouse",
626
+ "624": "▁lar",
627
+ "625": "▁art",
628
+ "626": "▁wee",
629
+ "627": "▁cer",
630
+ "628": "ized",
631
+ "629": "▁mat",
632
+ "630": "con",
633
+ "631": "erg",
634
+ "632": "land",
635
+ "633": "ines",
636
+ "634": "▁chr",
637
+ "635": "▁aut",
638
+ "636": "▁lea",
639
+ "637": "▁sou",
640
+ "638": "oney",
641
+ "639": "tty",
642
+ "640": "▁ple",
643
+ "641": "ulat",
644
+ "642": "oks",
645
+ "643": "▁few",
646
+ "644": "▁sol",
647
+ "645": "▁che",
648
+ "646": "chn",
649
+ "647": "ird",
650
+ "648": "▁bre",
651
+ "649": "▁dur",
652
+ "650": "▁wom",
653
+ "651": "me",
654
+ "652": "izat",
655
+ "653": "eric",
656
+ "654": "ote",
657
+ "655": "▁uni",
658
+ "656": "eren",
659
+ "657": "arn",
660
+ "658": "ross",
661
+ "659": "ices",
662
+ "660": "ten",
663
+ "661": "eral",
664
+ "662": "ever",
665
+ "663": "ieve",
666
+ "664": "lish",
667
+ "665": "ash",
668
+ "666": "▁opp",
669
+ "667": "alth",
670
+ "668": "ger",
671
+ "669": "▁sk",
672
+ "670": "▁red",
673
+ "671": "peri",
674
+ "672": "▁det",
675
+ "673": "▁ext",
676
+ "674": "ner",
677
+ "675": "ah",
678
+ "676": "▁var",
679
+ "677": "▁loc",
680
+ "678": "gram",
681
+ "679": "ists",
682
+ "680": "ives",
683
+ "681": "▁es",
684
+ "682": "▁nor",
685
+ "683": "tro",
686
+ "684": "ale",
687
+ "685": "▁iss",
688
+ "686": "▁pri",
689
+ "687": "gin",
690
+ "688": "az",
691
+ "689": "oc",
692
+ "690": "▁pop",
693
+ "691": "ern",
694
+ "692": "▁sit",
695
+ "693": "ket",
696
+ "694": "▁pa",
697
+ "695": "▁law",
698
+ "696": "ages",
699
+ "697": "br",
700
+ "698": "▁cam",
701
+ "699": "▁mom",
702
+ "700": "osed",
703
+ "701": "▁bro",
704
+ "702": "ne",
705
+ "703": "bs",
706
+ "704": "▁cre",
707
+ "705": "erat",
708
+ "706": "▁sec",
709
+ "707": "▁cap",
710
+ "708": "▁vis",
711
+ "709": "▁pat",
712
+ "710": "ield",
713
+ "711": "iet",
714
+ "712": "▁tri",
715
+ "713": "up",
716
+ "714": "▁bra",
717
+ "715": "ts",
718
+ "716": "▁mot",
719
+ "717": "▁unt",
720
+ "718": "put",
721
+ "719": "bo",
722
+ "720": "ork",
723
+ "721": "mer",
724
+ "722": "ital",
725
+ "723": "▁air",
726
+ "724": "ined",
727
+ "725": "▁beh",
728
+ "726": "▁adv",
729
+ "727": "▁ret",
730
+ "728": "imes",
731
+ "729": "▁tea",
732
+ "730": "ural",
733
+ "731": "sid",
734
+ "732": "ters",
735
+ "733": "▁pur",
736
+ "734": "▁sci",
737
+ "735": "bers",
738
+ "736": "ient",
739
+ "737": "ier",
740
+ "738": "cc",
741
+ "739": "sw",
742
+ "740": "▁av",
743
+ "741": "reen",
744
+ "742": "ode",
745
+ "743": "ont",
746
+ "744": "▁dra",
747
+ "745": "ann",
748
+ "746": "nect",
749
+ "747": "▁x",
750
+ "748": "▁eu",
751
+ "749": "ton",
752
+ "750": "inat",
753
+ "751": "ene",
754
+ "752": "ared",
755
+ "753": "els",
756
+ "754": "▁mor",
757
+ "755": "▁rat",
758
+ "756": "cri",
759
+ "757": "▁men",
760
+ "758": "▁ah",
761
+ "759": "ames",
762
+ "760": "▁arm",
763
+ "761": "eak",
764
+ "762": "▁pay",
765
+ "763": "▁hal",
766
+ "764": "ins",
767
+ "765": "ilit",
768
+ "766": "stit",
769
+ "767": "▁ra",
770
+ "768": "▁leg",
771
+ "769": "cl",
772
+ "770": "pr",
773
+ "771": "▁wal",
774
+ "772": "▁bad",
775
+ "773": "▁ge",
776
+ "774": "roup",
777
+ "775": "▁mus",
778
+ "776": "man",
779
+ "777": "▁gi",
780
+ "778": "eds",
781
+ "779": "▁aw",
782
+ "780": "po",
783
+ "781": "ark",
784
+ "782": "row",
785
+ "783": "▁dep",
786
+ "784": "ully",
787
+ "785": "ral",
788
+ "786": "lect",
789
+ "787": "pend",
790
+ "788": "▁sev",
791
+ "789": "ime",
792
+ "790": "gest",
793
+ "791": "here",
794
+ "792": "▁yet",
795
+ "793": "ted",
796
+ "794": "▁rev",
797
+ "795": "ds",
798
+ "796": "▁ask",
799
+ "797": "less",
800
+ "798": "▁di",
801
+ "799": "ets",
802
+ "800": "line",
803
+ "801": "▁aff",
804
+ "802": "ired",
805
+ "803": "▁est",
806
+ "804": "ken",
807
+ "805": "vid",
808
+ "806": "most",
809
+ "807": "ivid",
810
+ "808": "unch",
811
+ "809": "par",
812
+ "810": "med",
813
+ "811": "rop",
814
+ "812": "ased",
815
+ "813": "eone",
816
+ "814": "▁ve",
817
+ "815": "▁abs",
818
+ "816": "ergy",
819
+ "817": "ret",
820
+ "818": "▁saw",
821
+ "819": "▁ey",
822
+ "820": "▁cal",
823
+ "821": "uat",
824
+ "822": "▁mid",
825
+ "823": "vat",
826
+ "824": "ream",
827
+ "825": "vice",
828
+ "826": "ians",
829
+ "827": "rent",
830
+ "828": "ctor",
831
+ "829": "err",
832
+ "830": "ush",
833
+ "831": "ases",
834
+ "832": "▁suc",
835
+ "833": "erms",
836
+ "834": "ave",
837
+ "835": "angu",
838
+ "836": "ries",
839
+ "837": "▁wo",
840
+ "838": "arts",
841
+ "839": "▁fil",
842
+ "840": "▁fat",
843
+ "841": "▁cho",
844
+ "842": "orts",
845
+ "843": "▁fre",
846
+ "844": "ee",
847
+ "845": "ught",
848
+ "846": "eng",
849
+ "847": "ump",
850
+ "848": "▁bar",
851
+ "849": "ying",
852
+ "850": "ane",
853
+ "851": "▁tem",
854
+ "852": "anks",
855
+ "853": "ury",
856
+ "854": "iat",
857
+ "855": "mit",
858
+ "856": "trol",
859
+ "857": "▁net",
860
+ "858": "▁maj",
861
+ "859": "▁cra",
862
+ "860": "ling",
863
+ "861": "▁fig",
864
+ "862": "orn",
865
+ "863": "icat",
866
+ "864": "pany",
867
+ "865": "▁occ",
868
+ "866": "ott",
869
+ "867": "ands",
870
+ "868": "▁exc",
871
+ "869": "▁mr",
872
+ "870": "ency",
873
+ "871": "rope",
874
+ "872": "itch",
875
+ "873": "▁lit",
876
+ "874": "abil",
877
+ "875": "not",
878
+ "876": "ma",
879
+ "877": "▁typ",
880
+ "878": "▁opt",
881
+ "879": "ob",
882
+ "880": "ser",
883
+ "881": "ety",
884
+ "882": "ms",
885
+ "883": "peci",
886
+ "884": "aces",
887
+ "885": "aut",
888
+ "886": "▁hon",
889
+ "887": "cuss",
890
+ "888": "▁sal",
891
+ "889": "▁sor",
892
+ "890": "att",
893
+ "891": "▁lab",
894
+ "892": "▁har",
895
+ "893": "urch",
896
+ "894": "nded",
897
+ "895": "uce",
898
+ "896": "ids",
899
+ "897": "▁hy",
900
+ "898": "▁fut",
901
+ "899": "▁ste",
902
+ "900": "ours",
903
+ "901": "ems",
904
+ "902": "utes",
905
+ "903": "ng",
906
+ "904": "ta",
907
+ "905": "▁won",
908
+ "906": "▁fa",
909
+ "907": "▁env",
910
+ "908": "ards",
911
+ "909": "▁job",
912
+ "910": "ium",
913
+ "911": "▁dot",
914
+ "912": "▁obv",
915
+ "913": "ina",
916
+ "914": "side",
917
+ "915": "elve",
918
+ "916": "cu",
919
+ "917": "▁jes",
920
+ "918": "▁pot",
921
+ "919": "▁pie",
922
+ "920": "▁tre",
923
+ "921": "▁hey",
924
+ "922": "▁mag",
925
+ "923": "ron",
926
+ "924": "▁key",
927
+ "925": "swer",
928
+ "926": "▁win",
929
+ "927": "ucat",
930
+ "928": "work",
931
+ "929": "ides",
932
+ "930": "▁low",
933
+ "931": "▁vol",
934
+ "932": "▁oth",
935
+ "933": "atic",
936
+ "934": "lf",
937
+ "935": "ads",
938
+ "936": "inds",
939
+ "937": "com",
940
+ "938": "ths",
941
+ "939": "▁ver",
942
+ "940": "ised",
943
+ "941": "lo",
944
+ "942": "▁squ",
945
+ "943": "▁cut",
946
+ "944": "oked",
947
+ "945": "irit",
948
+ "946": "ateg",
949
+ "947": "ppy",
950
+ "948": "mitt",
951
+ "949": "come",
952
+ "950": "hn",
953
+ "951": "igin",
954
+ "952": "mand",
955
+ "953": "▁dam",
956
+ "954": "ho",
957
+ "955": "▁da",
958
+ "956": "▁fur",
959
+ "957": "iron",
960
+ "958": "ilar",
961
+ "959": "▁fac",
962
+ "960": "▁neg",
963
+ "961": "▁ago",
964
+ "962": "ged",
965
+ "963": "miss",
966
+ "964": "enth",
967
+ "965": "▁dou",
968
+ "966": "▁hit",
969
+ "967": "▁guy",
970
+ "968": "▁bi",
971
+ "969": "ove",
972
+ "970": "fess",
973
+ "971": "ples",
974
+ "972": "owed",
975
+ "973": "ured",
976
+ "974": "▁ris",
977
+ "975": "ints",
978
+ "976": "rew",
979
+ "977": "▁sum",
980
+ "978": "▁hu",
981
+ "979": "ploy",
982
+ "980": "ude",
983
+ "981": "ried",
984
+ "982": "▁cir",
985
+ "983": "▁dev",
986
+ "984": "ear",
987
+ "985": "▁tot",
988
+ "986": "▁ann",
989
+ "987": "duc",
990
+ "988": "ik",
991
+ "989": "pon",
992
+ "990": "sted",
993
+ "991": "▁ide",
994
+ "992": "▁'",
995
+ "993": "ipp",
996
+ "994": "▁eat",
997
+ "995": "▁dom",
998
+ "996": "▁",
999
+ "997": "e",
1000
+ "998": "t",
1001
+ "999": "o",
1002
+ "1000": "a",
1003
+ "1001": "i",
1004
+ "1002": "n",
1005
+ "1003": "s",
1006
+ "1004": "r",
1007
+ "1005": "h",
1008
+ "1006": "l",
1009
+ "1007": "d",
1010
+ "1008": "u",
1011
+ "1009": "c",
1012
+ "1010": "m",
1013
+ "1011": "y",
1014
+ "1012": "g",
1015
+ "1013": "w",
1016
+ "1014": "f",
1017
+ "1015": "p",
1018
+ "1016": "b",
1019
+ "1017": "v",
1020
+ "1018": "k",
1021
+ "1019": "'",
1022
+ "1020": "j",
1023
+ "1021": "x",
1024
+ "1022": "q",
1025
+ "1023": "z",
1026
+ "1024": "<EOU>",
1027
+ "1025": "<EOB>"
1028
+ }
Conversion/convert_parakeet_eou.py ADDED
@@ -0,0 +1,722 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """CLI for exporting Parakeet Realtime EOU 120M components to CoreML.
3
+
4
+ This model is a cache-aware streaming FastConformer-RNNT model optimized for
5
+ low-latency speech recognition with end-of-utterance detection.
6
+
7
+ Key differences from Parakeet TDT v3:
8
+ - Smaller model (120M vs 600M params)
9
+ - No duration outputs (standard RNNT, not TDT)
10
+ - Cache-aware streaming encoder (17 layers, attention context [70,1])
11
+ - Special <EOU> token for end-of-utterance detection
12
+ - Optimized for 80-160ms latency
13
+
14
+ Reference: https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ from dataclasses import asdict
20
+ from pathlib import Path
21
+ from typing import Dict, Optional, Tuple
22
+
23
+ import coremltools as ct
24
+ import numpy as np
25
+ import soundfile as sf
26
+ import torch
27
+ import typer
28
+
29
+ import nemo.collections.asr as nemo_asr
30
+
31
+ from individual_components import (
32
+ DecoderWrapper,
33
+ EncoderWrapper,
34
+ ExportSettings,
35
+ JointWrapper,
36
+ JointDecisionWrapper,
37
+ JointDecisionSingleStep,
38
+ PreprocessorWrapper,
39
+ MelEncoderWrapper,
40
+ _coreml_convert,
41
+ )
42
+
43
+ DEFAULT_MODEL_ID = "nvidia/parakeet_realtime_eou_120m-v1"
44
+ AUTHOR = "Fluid Inference"
45
+
46
+
47
+ def _compute_length(seconds: float, sample_rate: int) -> int:
48
+ return int(round(seconds * sample_rate))
49
+
50
+
51
+ def _prepare_audio(
52
+ validation_audio: Optional[Path],
53
+ sample_rate: int,
54
+ max_samples: int,
55
+ seed: Optional[int],
56
+ ) -> torch.Tensor:
57
+ if validation_audio is None:
58
+ if seed is not None:
59
+ torch.manual_seed(seed)
60
+ audio = torch.randn(1, max_samples, dtype=torch.float32)
61
+ return audio
62
+
63
+ data, sr = sf.read(str(validation_audio), dtype="float32")
64
+ if sr != sample_rate:
65
+ raise typer.BadParameter(
66
+ f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
67
+ )
68
+
69
+ if data.ndim > 1:
70
+ data = data[:, 0]
71
+
72
+ if data.size == 0:
73
+ raise typer.BadParameter("Validation audio is empty")
74
+
75
+ if data.size < max_samples:
76
+ pad_width = max_samples - data.size
77
+ data = np.pad(data, (0, pad_width))
78
+ elif data.size > max_samples:
79
+ data = data[:max_samples]
80
+
81
+ audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
82
+ return audio
83
+
84
+
85
def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
    """Stamp metadata onto a CoreML model and persist it as an .mlpackage."""
    try:
        model.minimum_deployment_target = ct.target.iOS17
    except Exception:
        pass  # best effort: some coremltools builds reject this assignment
    model.short_description = description
    model.author = AUTHOR
    path.parent.mkdir(parents=True, exist_ok=True)
    model.save(str(path))
94
+
95
+
96
+ def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
97
+ return tuple(int(dim) for dim in tensor.shape)
98
+
99
+
100
def _parse_compute_units(name: str) -> ct.ComputeUnit:
    """Parse a human-friendly compute units string into ct.ComputeUnit."""
    table = {
        "ALL": ct.ComputeUnit.ALL,
        "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
        "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
        "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
        "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
    }
    key = str(name).strip().upper()
    unit = table.get(key)
    if unit is None:
        raise typer.BadParameter(
            f"Unknown compute units '{name}'. Choose from: " + ", ".join(table.keys())
        )
    return unit
115
+
116
+
117
+ def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
118
+ """Parse compute precision string into ct.precision or None."""
119
+ if name is None:
120
+ return None
121
+ normalized = str(name).strip().upper()
122
+ if normalized == "":
123
+ return None
124
+ mapping = {
125
+ "FLOAT32": ct.precision.FLOAT32,
126
+ "FLOAT16": ct.precision.FLOAT16,
127
+ }
128
+ if normalized not in mapping:
129
+ raise typer.BadParameter(
130
+ f"Unknown compute precision '{name}'. Choose from: "
131
+ + ", ".join(mapping.keys())
132
+ )
133
+ return mapping[normalized]
134
+
135
+
136
+ app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
137
+
138
+
139
@app.command()
def convert(
    nemo_path: Optional[Path] = typer.Option(
        None,
        "--nemo-path",
        exists=True,
        resolve_path=True,
        help="Path to parakeet_realtime_eou_120m-v1.nemo checkpoint (skip to auto-download)",
    ),
    model_id: str = typer.Option(
        DEFAULT_MODEL_ID,
        "--model-id",
        help="Model identifier to download when --nemo-path is omitted",
    ),
    output_dir: Path = typer.Option(
        Path("parakeet_eou_coreml"),
        help="Directory where mlpackages and metadata will be written",
    ),
    preprocessor_cu: str = typer.Option(
        "CPU_ONLY",
        "--preprocessor-cu",
        help="Compute units for preprocessor (default CPU_ONLY)",
    ),
    mel_encoder_cu: str = typer.Option(
        "CPU_ONLY",
        "--mel-encoder-cu",
        help="Compute units for fused mel+encoder (default CPU_ONLY)",
    ),
    compute_precision: Optional[str] = typer.Option(
        None,
        "--compute-precision",
        help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
    ),
    max_audio_seconds: float = typer.Option(
        15.0,
        "--max-audio-seconds",
        help="Maximum audio duration in seconds for the fixed window export",
    ),
    validation_audio: Optional[Path] = typer.Option(
        None,
        "--validation-audio",
        exists=True,
        resolve_path=True,
        help="Path to a 16kHz WAV file for tracing (uses random if not provided)",
    ),
) -> None:
    """Export all Parakeet Realtime EOU sub-modules to CoreML.

    This exports the cache-aware streaming FastConformer-RNNT model for
    low-latency speech recognition with end-of-utterance detection.
    Each sub-module (preprocessor, encoder, fused mel+encoder, decoder,
    joint, joint+decision heads) is traced with torch.jit and converted
    to its own .mlpackage; a metadata.json describing all I/O shapes is
    written alongside them.
    """
    export_settings = ExportSettings(
        output_dir=output_dir,
        compute_units=ct.ComputeUnit.CPU_ONLY,
        deployment_target=ct.target.iOS17,
        compute_precision=_parse_compute_precision(compute_precision),
        max_audio_seconds=max_audio_seconds,
        max_symbol_steps=1,
    )

    typer.echo("Export configuration:")
    typer.echo(asdict(export_settings))

    output_dir.mkdir(parents=True, exist_ok=True)
    pre_cu = _parse_compute_units(preprocessor_cu)
    melenc_cu = _parse_compute_units(mel_encoder_cu)

    if nemo_path is not None:
        typer.echo(f"Loading NeMo model from {nemo_path}…")
        # Try loading as generic ASRModel first, then specific class
        try:
            asr_model = nemo_asr.models.ASRModel.restore_from(
                str(nemo_path), map_location="cpu"
            )
        except Exception:
            # Fallback to EncDecRNNTBPEModel
            asr_model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
                str(nemo_path), map_location="cpu"
            )
        checkpoint_meta = {
            "type": "file",
            "path": str(nemo_path),
        }
    else:
        typer.echo(f"Downloading NeMo model via {model_id}…")
        # Use ASRModel.from_pretrained as recommended for this model
        try:
            asr_model = nemo_asr.models.ASRModel.from_pretrained(
                model_id, map_location="cpu"
            )
        except Exception:
            # Fallback to EncDecRNNTBPEModel
            asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
                model_id, map_location="cpu"
            )
        checkpoint_meta = {
            "type": "pretrained",
            "model_id": model_id,
        }
    asr_model.eval()

    # Print model info
    typer.echo(f"Model class: {type(asr_model).__name__}")
    typer.echo(f"Encoder class: {type(asr_model.encoder).__name__}")

    sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
    max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)

    # Prepare audio for tracing
    if validation_audio is not None:
        typer.echo(f"Using validation audio: {validation_audio}")
        audio_tensor = _prepare_audio(validation_audio, sample_rate, max_samples, seed=None)
    else:
        typer.echo("Using random audio for tracing (seed=42)")
        audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)

    audio_length = torch.tensor([max_samples], dtype=torch.int32)

    preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
    encoder = EncoderWrapper(asr_model.encoder.eval())
    decoder = DecoderWrapper(asr_model.decoder.eval())
    joint = JointWrapper(asr_model.joint.eval())

    # Enable RNNT export mode for tracing; the original flag is restored
    # in the finally block so the in-memory model is left untouched.
    decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
    asr_model.decoder._rnnt_export = True

    try:
        with torch.no_grad():
            mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
            mel_length_ref = mel_length_ref.to(dtype=torch.int32)
            encoder_ref, encoder_length_ref, frame_times_ref = encoder(
                mel_ref, mel_length_ref
            )
            encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)

        # Clone tensors to drop inference flags
        mel_ref = mel_ref.clone().detach()
        mel_length_ref = mel_length_ref.clone().detach()
        encoder_ref = encoder_ref.clone().detach()
        encoder_length_ref = encoder_length_ref.clone().detach()
        frame_times_ref = frame_times_ref.clone().detach()

        vocab_size = int(asr_model.tokenizer.vocab_size)
        decoder_hidden = int(asr_model.decoder.pred_hidden)
        decoder_layers = int(asr_model.decoder.pred_rnn_layers)

        # Check if model has extra outputs (TDT-style duration)
        num_extra = getattr(asr_model.joint, "num_extra_outputs", 0)
        typer.echo(f"Vocab size: {vocab_size}, num_extra_outputs: {num_extra}")

        targets = torch.full(
            (1, export_settings.max_symbol_steps),
            fill_value=asr_model.decoder.blank_idx,
            dtype=torch.int32,
        )
        target_lengths = torch.tensor(
            [export_settings.max_symbol_steps], dtype=torch.int32
        )
        zero_state = torch.zeros(
            decoder_layers,
            1,
            decoder_hidden,
            dtype=torch.float32,
        )

        with torch.no_grad():
            decoder_ref, h_ref, c_ref = decoder(
                targets, target_lengths, zero_state, zero_state
            )
            joint_ref = joint(encoder_ref, decoder_ref)

        decoder_ref = decoder_ref.clone()
        h_ref = h_ref.clone()
        c_ref = c_ref.clone()
        joint_ref = joint_ref.clone()

        typer.echo(f"Encoder output shape: {encoder_ref.shape}")
        typer.echo(f"Decoder output shape: {decoder_ref.shape}")
        typer.echo(f"Joint output shape: {joint_ref.shape}")

        # === Export Preprocessor ===
        typer.echo("Tracing and converting preprocessor…")
        preprocessor = preprocessor.cpu()
        audio_tensor = audio_tensor.cpu()
        audio_length = audio_length.cpu()
        traced_preprocessor = torch.jit.trace(
            preprocessor, (audio_tensor, audio_length), strict=False
        )
        traced_preprocessor.eval()
        preprocessor_inputs = [
            ct.TensorType(
                name="audio_signal",
                shape=(1, ct.RangeDim(1, max_samples)),
                dtype=np.float32,
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        preprocessor_outputs = [
            ct.TensorType(name="mel", dtype=np.float32),
            ct.TensorType(name="mel_length", dtype=np.int32),
        ]
        preprocessor_model = _coreml_convert(
            traced_preprocessor,
            preprocessor_inputs,
            preprocessor_outputs,
            export_settings,
            compute_units_override=pre_cu,
        )
        preprocessor_path = output_dir / "parakeet_eou_preprocessor.mlpackage"
        _save_mlpackage(
            preprocessor_model,
            preprocessor_path,
            f"Parakeet EOU preprocessor ({max_audio_seconds}s window)",
        )

        # === Export Encoder ===
        typer.echo("Tracing and converting encoder…")
        traced_encoder = torch.jit.trace(
            encoder, (mel_ref, mel_length_ref), strict=False
        )
        traced_encoder.eval()
        encoder_inputs = [
            ct.TensorType(
                name="mel", shape=_tensor_shape(mel_ref), dtype=np.float32
            ),
            ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
        ]
        encoder_outputs = [
            ct.TensorType(name="encoder", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
            ct.TensorType(name="frame_times", dtype=np.float32),
        ]
        encoder_model = _coreml_convert(
            traced_encoder,
            encoder_inputs,
            encoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        encoder_path = output_dir / "parakeet_eou_encoder.mlpackage"
        _save_mlpackage(
            encoder_model,
            encoder_path,
            f"Parakeet EOU encoder ({max_audio_seconds}s window)",
        )

        # === Export Fused Mel+Encoder ===
        typer.echo("Tracing and converting fused mel+encoder…")
        mel_encoder = MelEncoderWrapper(preprocessor, encoder)
        traced_mel_encoder = torch.jit.trace(
            mel_encoder, (audio_tensor, audio_length), strict=False
        )
        traced_mel_encoder.eval()
        mel_encoder_inputs = [
            ct.TensorType(
                name="audio_signal", shape=(1, max_samples), dtype=np.float32
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        mel_encoder_outputs = [
            ct.TensorType(name="encoder", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
            ct.TensorType(name="frame_times", dtype=np.float32),
        ]
        mel_encoder_model = _coreml_convert(
            traced_mel_encoder,
            mel_encoder_inputs,
            mel_encoder_outputs,
            export_settings,
            compute_units_override=melenc_cu,
        )
        mel_encoder_path = output_dir / "parakeet_eou_mel_encoder.mlpackage"
        _save_mlpackage(
            mel_encoder_model,
            mel_encoder_path,
            f"Parakeet EOU fused Mel+Encoder ({max_audio_seconds}s window)",
        )

        # === Export Decoder ===
        typer.echo("Tracing and converting decoder…")
        traced_decoder = torch.jit.trace(
            decoder,
            (targets, target_lengths, zero_state, zero_state),
            strict=False,
        )
        traced_decoder.eval()
        decoder_inputs = [
            ct.TensorType(
                name="targets", shape=_tensor_shape(targets), dtype=np.int32
            ),
            ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
            ct.TensorType(
                name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32
            ),
            ct.TensorType(
                name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32
            ),
        ]
        decoder_outputs = [
            ct.TensorType(name="decoder", dtype=np.float32),
            ct.TensorType(name="h_out", dtype=np.float32),
            ct.TensorType(name="c_out", dtype=np.float32),
        ]
        decoder_model = _coreml_convert(
            traced_decoder,
            decoder_inputs,
            decoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        decoder_path = output_dir / "parakeet_eou_decoder.mlpackage"
        _save_mlpackage(
            decoder_model,
            decoder_path,
            "Parakeet EOU decoder (RNNT prediction network)",
        )

        # === Export Joint ===
        typer.echo("Tracing and converting joint…")
        traced_joint = torch.jit.trace(
            joint,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint.eval()
        joint_inputs = [
            ct.TensorType(
                name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
            ),
            ct.TensorType(
                name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
            ),
        ]
        joint_outputs = [
            ct.TensorType(name="logits", dtype=np.float32),
        ]
        joint_model = _coreml_convert(
            traced_joint,
            joint_inputs,
            joint_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_path = output_dir / "parakeet_eou_joint.mlpackage"
        _save_mlpackage(
            joint_model,
            joint_path,
            "Parakeet EOU joint network (RNNT)",
        )

        # === Export Joint Decision Head ===
        typer.echo("Tracing and converting joint decision head…")
        joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size)
        traced_joint_decision = torch.jit.trace(
            joint_decision,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint_decision.eval()
        joint_decision_inputs = [
            ct.TensorType(
                name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32
            ),
            ct.TensorType(
                name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32
            ),
        ]
        joint_decision_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
        ]
        joint_decision_model = _coreml_convert(
            traced_joint_decision,
            joint_decision_inputs,
            joint_decision_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_decision_path = output_dir / "parakeet_eou_joint_decision.mlpackage"
        _save_mlpackage(
            joint_decision_model,
            joint_decision_path,
            "Parakeet EOU joint + decision head (softmax, argmax)",
        )

        # === Export Single-Step Joint Decision ===
        typer.echo("Tracing and converting single-step joint decision…")
        jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size)
        # Create single-step slices from refs
        enc_step = encoder_ref[:, :, :1].contiguous()
        dec_step = decoder_ref[:, :, :1].contiguous()
        traced_jd_single = torch.jit.trace(
            jd_single,
            (enc_step, dec_step),
            strict=False,
        )
        traced_jd_single.eval()
        jd_single_inputs = [
            ct.TensorType(
                name="encoder_step",
                shape=(1, enc_step.shape[1], 1),
                dtype=np.float32,
            ),
            ct.TensorType(
                name="decoder_step",
                shape=(1, dec_step.shape[1], 1),
                dtype=np.float32,
            ),
        ]
        jd_single_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
            ct.TensorType(name="top_k_ids", dtype=np.int32),
            ct.TensorType(name="top_k_logits", dtype=np.float32),
        ]
        jd_single_model = _coreml_convert(
            traced_jd_single,
            jd_single_inputs,
            jd_single_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        jd_single_path = output_dir / "parakeet_eou_joint_decision_single_step.mlpackage"
        _save_mlpackage(
            jd_single_model,
            jd_single_path,
            "Parakeet EOU single-step joint decision (current frame)",
        )

        # === Save Metadata ===
        metadata: Dict[str, object] = {
            "model_id": model_id,
            "model_name": "parakeet_realtime_eou_120m-v1",
            "model_class": type(asr_model).__name__,
            "encoder_class": type(asr_model.encoder).__name__,
            "sample_rate": sample_rate,
            "max_audio_seconds": export_settings.max_audio_seconds,
            "max_audio_samples": max_samples,
            "max_symbol_steps": export_settings.max_symbol_steps,
            "vocab_size": vocab_size,
            "vocab_with_blank": vocab_size + 1,
            "decoder_hidden": decoder_hidden,
            "decoder_layers": decoder_layers,
            "num_extra_outputs": num_extra,
            "has_eou_token": True,
            "checkpoint": checkpoint_meta,
            "coreml": {
                "compute_units": export_settings.compute_units.name,
                "compute_precision": (
                    export_settings.compute_precision.name
                    if export_settings.compute_precision is not None
                    else "FLOAT32"
                ),
            },
            "components": {
                "preprocessor": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "path": preprocessor_path.name,
                },
                "encoder": {
                    "inputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                        "frame_times": [1, _tensor_shape(encoder_ref)[2]],
                    },
                    "path": encoder_path.name,
                },
                "mel_encoder": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                        "frame_times": [1, _tensor_shape(encoder_ref)[2]],
                    },
                    "path": mel_encoder_path.name,
                },
                "decoder": {
                    "inputs": {
                        "targets": list(_tensor_shape(targets)),
                        "target_length": [1],
                        "h_in": list(_tensor_shape(zero_state)),
                        "c_in": list(_tensor_shape(zero_state)),
                    },
                    "outputs": {
                        "decoder": list(_tensor_shape(decoder_ref)),
                        "h_out": list(_tensor_shape(h_ref)),
                        "c_out": list(_tensor_shape(c_ref)),
                    },
                    "path": decoder_path.name,
                },
                "joint": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "logits": list(_tensor_shape(joint_ref)),
                    },
                    "path": joint_path.name,
                },
                "joint_decision": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "token_id": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                        "token_prob": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                    },
                    "path": joint_decision_path.name,
                },
                "joint_decision_single_step": {
                    "inputs": {
                        "encoder_step": [1, _tensor_shape(encoder_ref)[1], 1],
                        "decoder_step": [1, _tensor_shape(decoder_ref)[1], 1],
                    },
                    "outputs": {
                        "token_id": [1, 1, 1],
                        "token_prob": [1, 1, 1],
                        "top_k_ids": [1, 1, 1, 64],
                        "top_k_logits": [1, 1, 1, 64],
                    },
                    "path": jd_single_path.name,
                },
            },
        }

        # Export tokenizer vocab if available
        try:
            tokenizer = asr_model.tokenizer
            vocab = {
                "blank_id": int(asr_model.decoder.blank_idx),
                "vocab_size": vocab_size,
            }
            # Try to get special tokens
            if hasattr(tokenizer, "tokenizer"):
                inner_tokenizer = tokenizer.tokenizer
                if hasattr(inner_tokenizer, "get_vocab"):
                    full_vocab = inner_tokenizer.get_vocab()
                    # Find EOU token
                    eou_token = None
                    for token, idx in full_vocab.items():
                        if "<EOU>" in token.upper() or "eou" in token.lower():
                            eou_token = {"token": token, "id": idx}
                            break
                    if eou_token:
                        vocab["eou_token"] = eou_token
            metadata["tokenizer"] = vocab
        except Exception as e:
            typer.echo(f"Warning: Could not export tokenizer info: {e}")

        metadata_path = output_dir / "metadata.json"
        # Fix: write metadata with an explicit UTF-8 encoding instead of the
        # platform locale default, which can differ (e.g. cp1252 on Windows).
        metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
        typer.echo(f"\nExport complete. Metadata written to {metadata_path}")
        typer.echo(f"Output directory: {output_dir}")

    finally:
        # Restore the decoder's original export flag regardless of outcome.
        asr_model.decoder._rnnt_export = decoder_export_flag
719
+
720
+
721
+ if __name__ == "__main__":
722
+ app()
Conversion/convert_split_encoder.py ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Split encoder export for true streaming inference.
4
+
5
+ This script exports the encoder in separate components:
6
+ 1. PreEncode (ConvSubsampling) - with pre_encode cache for mel frame overlap
7
+ 2. ConformerStack - 17 conformer layers with attention/time caches
8
+
9
+ This allows proper streaming inference by:
10
+ - Processing fixed-size mel chunks through pre_encode
11
+ - Feeding pre_encode output through conformer layers with persistent caches
12
+ """
13
+
14
+ import json
15
+ from pathlib import Path
16
+ from typing import Tuple
17
+
18
+ import coremltools as ct
19
+ import numpy as np
20
+ import torch
21
+ import typer
22
+ from torch import nn
23
+
24
+ from convert_parakeet_eou import ExportSettings, _coreml_convert, _save_mlpackage
25
+ from individual_components import (
26
+ DecoderWrapper,
27
+ JointDecisionSingleStep,
28
+ JointWrapper,
29
+ PreprocessorWrapper,
30
+ )
31
+
32
+
33
class PreEncodeWrapper(nn.Module):
    """Streaming wrapper around pre_encode (ConvSubsampling) with a mel cache.

    The wrapped module performs 4x subsampling via two strided Conv2d layers
    followed by a linear projection. For chunk-by-chunk streaming, the last
    ``pre_cache_size`` mel frames of each chunk are carried over and prepended
    to the next one, so the convolutions see the correct overlap at chunk
    boundaries.
    """

    def __init__(self, pre_encode: nn.Module, mel_dim: int = 128, pre_cache_size: int = 9):
        super().__init__()
        self.pre_encode = pre_encode
        self.mel_dim = mel_dim
        self.pre_cache_size = pre_cache_size

    def forward(
        self,
        mel: torch.Tensor,
        mel_length: torch.Tensor,
        pre_cache: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run one streaming chunk through pre_encode.

        Args:
            mel: [B, mel_dim, T] new mel frames (channel-major from preprocessor).
            mel_length: [B] valid length of ``mel``.
            pre_cache: [B, pre_cache_size, mel_dim] frames carried over from the
                previous chunk.

        Returns:
            encoded: [B, T', hidden_dim] subsampled, projected features.
            encoded_length: [B] output lengths.
            new_cache: [B, pre_cache_size, mel_dim] cache for the next chunk.
        """
        # pre_encode consumes time-major input: [B, T, F].
        frames = mel.transpose(1, 2)

        use_cache = self.pre_cache_size > 0
        if use_cache:
            stacked = torch.cat([pre_cache, frames], dim=1)  # [B, cache+T, mel_dim]
            total_length = mel_length + self.pre_cache_size
        else:
            stacked = frames
            total_length = mel_length

        encoded, encoded_length = self.pre_encode(stacked, total_length)

        if use_cache:
            # Next chunk needs the tail of *this* chunk's raw (pre-subsampling)
            # mel frames, not the pre_encode output.
            new_cache = frames[:, -self.pre_cache_size:, :]
        else:
            new_cache = torch.zeros(frames.shape[0], 0, self.mel_dim, dtype=mel.dtype)

        return encoded, encoded_length, new_cache
+ return encoded, encoded_length, new_cache
92
+
93
+
94
class ConformerStackWrapper(nn.Module):
    """Cache-aware streaming wrapper over the conformer layer stack.

    Delegates to the encoder's ``cache_aware_stream_step`` and threads the two
    streaming caches through each call:
      - cache_last_channel: attention context cache [layers, B, cache_size, hidden]
      - cache_last_time: time convolution cache [layers, B, hidden, time_cache]
    """

    def __init__(
        self,
        encoder: nn.Module,
        num_layers: int = 17,
        hidden_dim: int = 512,
        cache_channel_size: int = 70,
        cache_time_size: int = 8,
    ):
        super().__init__()
        self.encoder = encoder
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.cache_channel_size = cache_channel_size
        self.cache_time_size = cache_time_size

        # Keep handles to positional encoding, layer stack and final norm
        # (when the encoder exposes them).
        self.pos_enc = getattr(encoder, 'pos_enc', None)
        self.layers = encoder.layers
        self.final_norm = getattr(encoder, 'norm', None)

    def forward(
        self,
        pre_encoded: torch.Tensor,
        pre_encoded_length: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """One streaming step through the conformer stack.

        Args:
            pre_encoded: [B, T', hidden_dim] output from pre_encode.
            pre_encoded_length: [B] sequence lengths.
            cache_last_channel: [layers, B, cache_size, hidden_dim] attention cache.
            cache_last_time: [layers, B, hidden_dim, time_cache] time conv cache.
            cache_last_channel_len: [B] current cache usage length.

        Returns:
            encoded, encoded_length, new_cache_channel, new_cache_time, new_cache_len.
        """
        # cache_aware_stream_step expects channel-first features: [B, hidden, T'].
        features = pre_encoded.transpose(1, 2)

        step_out = self.encoder.cache_aware_stream_step(
            processed_signal=features,
            processed_signal_length=pre_encoded_length,
            cache_last_channel=cache_last_channel,
            cache_last_time=cache_last_time,
            cache_last_channel_len=cache_last_channel_len,
        )

        # step_out: (encoded, encoded_len, new_cache_channel, new_cache_time, new_cache_len)
        encoded, encoded_length, new_cache_channel, new_cache_time, new_cache_len = step_out[:5]
        return encoded, encoded_length, new_cache_channel, new_cache_time, new_cache_len
+ return outputs[0], outputs[1], outputs[2], outputs[3], outputs[4]
167
+
168
+
169
class SimpleConformerWrapper(nn.Module):
    """Thin streaming adapter over the encoder's ``cache_aware_stream_step``.

    Instead of splitting out pre_encode, the whole chunk is delegated to the
    encoder, which manages the pre-encode cache internally.

    The mel input must be channel-first: [B, mel_dim, T].
    """

    def __init__(self, encoder: nn.Module):
        super().__init__()
        self.encoder = encoder

    def forward(
        self,
        mel: torch.Tensor,
        mel_length: torch.Tensor,
        cache_last_channel: torch.Tensor,
        cache_last_time: torch.Tensor,
        cache_last_channel_len: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run one cache-aware streaming step.

        Args:
            mel: [B, mel_dim, T] mel spectrogram chunk (channel-first).
            mel_length: [B] valid frame count.
            cache_last_channel: [layers, B, cache_size, hidden] attention cache.
            cache_last_time: [layers, B, hidden, time_cache] conv cache.
            cache_last_channel_len: [B] current cache fill.

        Returns:
            (encoded, encoded_length, new_cache_channel, new_cache_time,
            new_cache_len).
        """
        step = self.encoder.cache_aware_stream_step(
            processed_signal=mel,
            processed_signal_length=mel_length,
            cache_last_channel=cache_last_channel,
            cache_last_time=cache_last_time,
            cache_last_channel_len=cache_last_channel_len,
        )
        encoded, encoded_len, cache_ch, cache_t, cache_len = step[:5]
        return encoded, encoded_len, cache_ch, cache_t, cache_len
209
+
210
+
211
class FixedChunkPreEncodeWrapper(nn.Module):
    """Run the ConvSubsampling pre-encoder on a fixed-size mel chunk.

    Exists to sidestep dynamic-shape issues during CoreML conversion: callers
    feed chunks of one known, fixed length.

    Note: the input here is CHANNEL-FIRST ``[B, mel_dim, T]``; it is
    transposed to the time-major ``[B, T, mel_dim]`` layout that the wrapped
    ConvSubsampling module expects before the call.
    """

    def __init__(self, pre_encode: nn.Module, mel_dim: int = 128):
        super().__init__()
        self.pre_encode = pre_encode
        self.mel_dim = mel_dim

    def forward(
        self,
        mel: torch.Tensor,
        mel_length: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            mel: [B, mel_dim, T] - mel spectrogram (channel-first).
            mel_length: [B] - valid frame count.

        Returns:
            encoded: [B, T', hidden_dim] - subsampled features.
            encoded_length: [B] - output length.
        """
        # ConvSubsampling consumes [B, T, D]; swap the last two axes.
        time_major = mel.transpose(1, 2)
        return self.pre_encode(time_major, mel_length)
252
+
253
+
254
class ConformerBatchWrapper(nn.Module):
    """Run pre-encoded features through the conformer stack (batch mode)."""

    def __init__(self, encoder):
        super().__init__()
        # `pos_enc` / `norm` may be absent on some encoder variants.
        self.pos_enc = encoder.pos_enc if hasattr(encoder, 'pos_enc') else None
        self.layers = encoder.layers
        self.norm = encoder.norm if hasattr(encoder, 'norm') else None

    def forward(self, x: torch.Tensor, input_length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            x: [B, T, hidden_dim] - pre_encoded features.
            input_length: [B] - sequence lengths.

        Returns:
            out: [B, hidden_dim, T] - encoder output (transposed for the joint).
            output_length: [B] - output length (same values as input_length).
        """
        # Positional encoding: relative-position encodings return a
        # (features, pos_emb) pair, absolute ones return just the features.
        pos_emb = None
        if self.pos_enc is not None:
            encoded = self.pos_enc(x)
            if isinstance(encoded, tuple):
                x, pos_emb = encoded
            else:
                x = encoded

        # CRITICAL: no attention mask on purpose. Building the mask yields a
        # 5-D tensor that breaks CoreML conversion (perm rank 4 != input
        # rank 5). With fixed-length padded input of known length, passing
        # None is acceptable for this batch-mode export.
        for block in self.layers:
            x = block(x, att_mask=None, pos_emb=pos_emb)

        if self.norm is not None:
            x = self.norm(x)

        # The joint network expects channel-first [B, D, T].
        out = x.transpose(1, 2)

        # Multiply by 1 so the tracer records a distinct output tensor.
        return out, input_length * 1
308
+
309
+
310
def inspect_encoder_structure(encoder):
    """Print the encoder's internal structure for debugging."""
    print("\n=== Encoder Structure ===")
    print(f"Type: {type(encoder)}")

    # Two levels of children are enough to see the major submodules.
    for child_name, child in encoder.named_children():
        print(f"  {child_name}: {type(child).__name__}")
        if hasattr(child, 'named_children'):
            for grandchild_name, grandchild in child.named_children():
                print(f"    {grandchild_name}: {type(grandchild).__name__}")

    if hasattr(encoder, 'streaming_cfg'):
        cfg = encoder.streaming_cfg
        print(f"\nStreaming Config:")
        print(f"  chunk_size: {cfg.chunk_size}")
        print(f"  shift_size: {cfg.shift_size}")
        print(f"  pre_encode_cache_size: {cfg.pre_encode_cache_size}")
        print(f"  last_channel_cache_size: {cfg.last_channel_cache_size}")
        # Older configs may not carry a time-cache size.
        if hasattr(cfg, 'last_time_cache_size'):
            print(f"  last_time_cache_size: {cfg.last_time_cache_size}")

    print()
332
+
333
+
334
def test_pre_encode_shapes(encoder, mel_dim: int = 128):
    """Test what shapes pre_encode expects and produces.

    Feeds time-major random mel tensors of several lengths through
    ``encoder.pre_encode`` and prints the resulting output shapes (or the
    error raised), so the fixed chunk size for export can be chosen.

    Args:
        encoder: Module exposing a ``pre_encode`` submodule callable as
            ``pre_encode(mel, mel_len)``.
        mel_dim: Number of mel features per frame.
    """
    print("\n=== Testing Pre-Encode Shapes ===")

    pre_encode = encoder.pre_encode

    # Sweep a few chunk lengths; errors are reported, not raised, so the
    # sweep always completes.
    for T in [10, 20, 40, 80, 160]:
        mel = torch.randn(1, T, mel_dim)
        mel_len = torch.tensor([T], dtype=torch.long)
        try:
            out, out_len = pre_encode(mel, mel_len)
            print(f"  Input [1, {T}, {mel_dim}] -> Output {list(out.shape)}, len={out_len.item()}")
        except Exception as e:
            print(f"  Input [1, {T}, {mel_dim}] -> ERROR: {e}")
348
+
349
+
350
def main(
    output_dir: str = typer.Option("Models/ParakeetEOU/ShortBatch", help="Output directory"),
    model_id: str = typer.Option(
        "nvidia/parakeet_realtime_eou_120m-v1", help="Model ID"
    ),
    inspect_only: bool = typer.Option(False, help="Only inspect encoder structure"),
):
    """Export Parakeet EOU with split encoder for streaming.

    Loads the NeMo model, inspects its streaming configuration, then exports
    each component (preprocessor, pre-encode, batch conformer stack, decoder,
    joint+decision head) as a separate CoreML ``.mlpackage``, plus
    ``metadata.json`` and ``vocab.json``.

    Args:
        output_dir: Destination directory for the exported packages.
        model_id: Pretrained NeMo model identifier.
        inspect_only: When True, print structure/shape diagnostics and return
            before exporting anything.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Imported lazily: NeMo is heavy and only needed when actually exporting.
    import nemo.collections.asr as nemo_asr

    typer.echo(f"Loading model {model_id}...")
    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_id, map_location="cpu")
    asr_model.eval()

    encoder = asr_model.encoder
    preprocessor = asr_model.preprocessor

    # Inspect structure
    inspect_encoder_structure(encoder)

    # Get streaming config
    streaming_cfg = encoder.streaming_cfg
    mel_dim = int(asr_model.cfg.preprocessor.features)
    hidden_dim = int(encoder.d_model)
    num_layers = len(encoder.layers)

    # Cache sizes from streaming config; the config may store them either as
    # scalars or as per-context lists, so normalize to int.
    cache_channel_size = 70
    cache_time_size = 8
    if streaming_cfg:
        if streaming_cfg.last_channel_cache_size:
            lcc = streaming_cfg.last_channel_cache_size
            cache_channel_size = int(lcc[0]) if isinstance(lcc, (list, tuple)) else int(lcc)
        if hasattr(streaming_cfg, 'last_time_cache_size') and streaming_cfg.last_time_cache_size:
            ltc = streaming_cfg.last_time_cache_size
            cache_time_size = int(ltc[0]) if isinstance(ltc, (list, tuple)) else int(ltc)

    typer.echo(f"\nEncoder config:")
    typer.echo(f"  mel_dim: {mel_dim}")
    typer.echo(f"  hidden_dim: {hidden_dim}")
    typer.echo(f"  num_layers: {num_layers}")
    typer.echo(f"  cache_channel_size: {cache_channel_size}")
    typer.echo(f"  cache_time_size: {cache_time_size}")

    # Test pre_encode shapes
    test_pre_encode_shapes(encoder, mel_dim)

    if inspect_only:
        return

    # Get chunk size from streaming config
    chunk_size = 8  # Default
    if streaming_cfg and streaming_cfg.chunk_size:
        cs = streaming_cfg.chunk_size
        chunk_size = int(cs[0]) if isinstance(cs, (list, tuple)) else int(cs)

    typer.echo(f"  chunk_size: {chunk_size}")

    # Calculate mel frames needed for one chunk
    # The encoder expects mel in [B, mel_dim, T] format
    # chunk_size is in encoder frames (after 4x subsampling)
    # So we need ~chunk_size * 4 mel frames
    mel_frames_per_chunk = chunk_size * 4 + 9  # Add pre_encode cache size buffer

    typer.echo(f"  mel_frames_per_chunk: {mel_frames_per_chunk}")

    export_settings = ExportSettings(
        output_dir=output_path,
        compute_units=ct.ComputeUnit.CPU_ONLY,
        deployment_target=ct.target.iOS17,
        compute_precision=None,
        max_audio_seconds=30,
        max_symbol_steps=1,
    )

    # ========== Export Preprocessor ==========
    typer.echo("\n=== Exporting Preprocessor ===")

    prep_wrapper = PreprocessorWrapper(preprocessor)

    sample_rate = 16000
    test_audio = torch.randn(1, sample_rate * 2, dtype=torch.float32)
    test_length = torch.tensor([sample_rate * 2], dtype=torch.int32)

    traced_prep = torch.jit.trace(prep_wrapper, (test_audio, test_length), strict=False)
    traced_prep.eval()

    prep_inputs = [
        ct.TensorType(
            name="audio_signal",
            # Variable-length audio up to max_audio_seconds worth of samples.
            shape=(1, ct.RangeDim(1, sample_rate * 30)),
            dtype=np.float32,
        ),
        ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
    ]
    prep_outputs = [
        ct.TensorType(name="mel", dtype=np.float32),
        ct.TensorType(name="mel_length", dtype=np.int32),
    ]

    prep_model = _coreml_convert(
        traced_prep, prep_inputs, prep_outputs, export_settings,
        compute_units_override=ct.ComputeUnit.CPU_ONLY,
    )

    prep_path = output_path / "preprocessor.mlpackage"
    _save_mlpackage(prep_model, prep_path, "Preprocessor")
    typer.echo(f"Saved: {prep_path}")

    # ========== Export Pre-Encode (ConvSubsampling) ==========
    typer.echo("\n=== Exporting Pre-Encode ===")

    pre_encode = encoder.pre_encode
    # Use fixed chunk wrapper for diagnostic (single large chunk)
    pre_encode_wrapper = FixedChunkPreEncodeWrapper(pre_encode, mel_dim)

    # Chunk size for input (1.28s = 128 frames)
    chunk_size_in = 128

    # Test inputs
    # CRITICAL: Must match PreEncodeWrapper expectation [B, D, T]
    test_mel = torch.randn(1, mel_dim, chunk_size_in, dtype=torch.float32)
    test_mel_len = torch.tensor([chunk_size_in], dtype=torch.long)

    with torch.no_grad():
        test_out, test_out_len = pre_encode_wrapper(test_mel, test_mel_len)
        typer.echo(f"Pre-encode test: [{chunk_size_in}x{mel_dim}] -> {list(test_out.shape)}")

    traced_pre = torch.jit.trace(pre_encode_wrapper, (test_mel, test_mel_len), strict=False)
    traced_pre.eval()

    pre_inputs = [
        ct.TensorType(
            name="mel",
            shape=(1, 128, chunk_size_in),
            dtype=np.float32,
        ),
        ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
    ]
    pre_outputs = [
        ct.TensorType(name="pre_encoded", dtype=np.float32),
        ct.TensorType(name="pre_encoded_length", dtype=np.int32),
    ]

    # Pre-encode export may fail (shape issues); best-effort so the remaining
    # components still get exported.
    try:
        pre_model = _coreml_convert(
            traced_pre, pre_inputs, pre_outputs, export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
            compute_precision=ct.precision.FLOAT32,
        )

        pre_path = output_path / "pre_encode.mlpackage"
        _save_mlpackage(pre_model, pre_path, "PreEncode")
        typer.echo(f"Saved: {pre_path}")
    except Exception as e:
        typer.echo(f"Pre-encode export failed: {e}")
        typer.echo("Continuing with other components...")

    # ========== Export Conformer Layers (trying different approach) ==========
    typer.echo("\n=== Exporting Conformer Layers ===")

    # Instead of using cache_aware_stream_step directly, let's try layer-by-layer
    # The issue is that cache_aware_stream_step has complex control flow

    # For now, export a batch encoder that processes pre_encoded input through conformer layers
    # This is a simplified approach that won't have true streaming but will work

    conformer_wrapper = ConformerBatchWrapper(encoder)

    # Test input shape (output from pre_encode)
    with torch.no_grad():
        pre_out, pre_out_len = pre_encode_wrapper(test_mel, test_mel_len)

    test_conformer_in = pre_out  # [B, T', hidden_dim]
    test_conformer_len = pre_out_len.to(torch.long)

    typer.echo(f"Conformer input shape: {list(test_conformer_in.shape)}")

    try:
        with torch.no_grad():
            conf_out, conf_len = conformer_wrapper(test_conformer_in, test_conformer_len)
            typer.echo(f"Conformer output shape: {list(conf_out.shape)}")

        traced_conf = torch.jit.trace(
            conformer_wrapper, (test_conformer_in, test_conformer_len), strict=False
        )
        traced_conf.eval()

        # Use fixed shapes
        T_pre = test_conformer_in.shape[1]
        conf_inputs = [
            ct.TensorType(
                name="pre_encoded",
                shape=(1, T_pre, 512),
                dtype=np.float32,
            ),
            ct.TensorType(name="pre_encoded_length", shape=(1,), dtype=np.int32),
        ]
        conf_outputs = [
            ct.TensorType(name="encoder", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
        ]

        conf_model = _coreml_convert(
            traced_conf, conf_inputs, conf_outputs, export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )

        conf_path = output_path / "conformer_batch.mlpackage"
        _save_mlpackage(conf_model, conf_path, "ConformerBatch")
        typer.echo(f"Saved: {conf_path}")
    except Exception as e:
        typer.echo(f"Conformer export failed: {e}")
        import traceback
        traceback.print_exc()

    # ========== Export Decoder ==========
    typer.echo("\n=== Exporting Decoder ===")

    decoder = asr_model.decoder
    decoder_wrapper = DecoderWrapper(decoder)

    decoder_hidden = int(decoder.pred_hidden)
    decoder_layers = 1

    test_target = torch.tensor([[0]], dtype=torch.int32)
    test_target_len = torch.tensor([1], dtype=torch.int32)
    test_h = torch.zeros(decoder_layers, 1, decoder_hidden, dtype=torch.float32)
    test_c = torch.zeros(decoder_layers, 1, decoder_hidden, dtype=torch.float32)

    traced_decoder = torch.jit.trace(
        decoder_wrapper, (test_target, test_target_len, test_h, test_c), strict=False
    )
    traced_decoder.eval()

    decoder_inputs = [
        ct.TensorType(name="targets", shape=(1, 1), dtype=np.int32),
        ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
        ct.TensorType(name="h_in", shape=(decoder_layers, 1, decoder_hidden), dtype=np.float32),
        ct.TensorType(name="c_in", shape=(decoder_layers, 1, decoder_hidden), dtype=np.float32),
    ]
    decoder_outputs = [
        ct.TensorType(name="decoder", dtype=np.float32),
        ct.TensorType(name="h_out", dtype=np.float32),
        ct.TensorType(name="c_out", dtype=np.float32),
    ]

    decoder_model = _coreml_convert(
        traced_decoder, decoder_inputs, decoder_outputs, export_settings,
        compute_units_override=ct.ComputeUnit.CPU_ONLY,
    )

    decoder_path = output_path / "decoder.mlpackage"
    _save_mlpackage(decoder_model, decoder_path, "Decoder")
    typer.echo(f"Saved: {decoder_path}")

    # ========== Export Joint Decision ==========
    typer.echo("\n=== Exporting Joint Decision ===")

    joint = asr_model.joint
    joint_wrapper = JointWrapper(joint)
    vocab_size = int(asr_model.cfg.joint.num_classes)

    jd_single = JointDecisionSingleStep(joint_wrapper, vocab_size=vocab_size)

    # Get test encoder output
    with torch.no_grad():
        # Use pre_encode + conformer for encoder output
        pre_out, pre_len = pre_encode_wrapper(test_mel, test_mel_len)
        enc_out, enc_len = conformer_wrapper(pre_out, pre_len.to(torch.long))
        dec_out, _, _ = decoder_wrapper(test_target, test_target_len, test_h, test_c)

    # Single time step / single symbol step slices for tracing.
    enc_step = enc_out[:, :, :1].contiguous()
    dec_step = dec_out[:, :, :1].contiguous()

    traced_jd = torch.jit.trace(jd_single, (enc_step, dec_step), strict=False)
    traced_jd.eval()

    jd_inputs = [
        ct.TensorType(name="encoder_step", shape=(1, enc_step.shape[1], 1), dtype=np.float32),
        ct.TensorType(name="decoder_step", shape=(1, dec_step.shape[1], 1), dtype=np.float32),
    ]
    jd_outputs = [
        ct.TensorType(name="token_id", dtype=np.int32),
        ct.TensorType(name="token_prob", dtype=np.float32),
        ct.TensorType(name="top_k_ids", dtype=np.int32),
        ct.TensorType(name="top_k_logits", dtype=np.float32),
    ]

    jd_model = _coreml_convert(
        traced_jd, jd_inputs, jd_outputs, export_settings,
        compute_units_override=ct.ComputeUnit.CPU_ONLY,
    )

    jd_path = output_path / "joint_decision.mlpackage"
    _save_mlpackage(jd_model, jd_path, "JointDecision")
    typer.echo(f"Saved: {jd_path}")

    # ========== Save Metadata ==========
    typer.echo("\n=== Saving Metadata ===")

    metadata = {
        "model_id": model_id,
        "model_name": "parakeet_realtime_eou_120m-v1-split",
        "streaming_mode": "split_encoder",
        "sample_rate": sample_rate,
        "mel_dim": mel_dim,
        "hidden_dim": hidden_dim,
        "num_layers": num_layers,
        "mel_frames_per_chunk": mel_frames_per_chunk,
        "vocab_size": vocab_size,
        "blank_id": vocab_size,
        "decoder_hidden": decoder_hidden,
        "decoder_layers": decoder_layers,
        "cache_channel_size": cache_channel_size,
        "cache_time_size": cache_time_size,
        "components": {
            "preprocessor": "preprocessor.mlpackage",
            "pre_encode": "pre_encode.mlpackage",
            "conformer": "conformer_batch.mlpackage",
            "decoder": "decoder.mlpackage",
            "joint_decision": "joint_decision.mlpackage",
        },
    }

    with open(output_path / "metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)
    typer.echo(f"Saved: {output_path / 'metadata.json'}")

    # Copy vocabulary
    tokenizer = asr_model.tokenizer
    vocab = {}
    for i in range(tokenizer.vocab_size):
        vocab[str(i)] = tokenizer.ids_to_tokens([i])[0]

    with open(output_path / "vocab.json", "w") as f:
        json.dump(vocab, f, indent=2, ensure_ascii=False)
    typer.echo(f"Saved: {output_path / 'vocab.json'}")

    typer.echo("\n=== Export Complete ===")
    typer.echo(f"Output directory: {output_path}")
695
+
696
+
697
# CLI entry point: let Typer parse the command-line options and invoke main().
if __name__ == "__main__":
    typer.run(main)
Conversion/individual_components.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export Parakeet Realtime EOU 120M RNNT components into CoreML."""
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple
8
+
9
+ import coremltools as ct
10
+ import torch
11
+
12
+
13
@dataclass
class ExportSettings:
    """Shared CoreML export options passed to each component conversion."""

    # Destination directory for the generated .mlpackage files.
    output_dir: Path
    # Default compute units for converted models (overridable per conversion).
    compute_units: ct.ComputeUnit
    # Minimum deployment target (e.g. iOS17); None lets coremltools decide.
    deployment_target: Optional[ct.target]
    # Global conversion precision; None keeps the coremltools default.
    compute_precision: Optional[ct.precision]
    # Upper bound on supported audio duration, in seconds.
    max_audio_seconds: float
    # Max symbol expansion steps per encoder frame (RNNT decoding bound).
    max_symbol_steps: int
21
+
22
+
23
class PreprocessorWrapper(torch.nn.Module):
    """Wrapper for the audio preprocessor (mel spectrogram extraction).

    Adapts the NeMo keyword-argument interface to positional tensor
    arguments so the module can be traced for export.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, audio_signal: torch.Tensor, length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # The wrapped preprocessor requires keyword args and int64 lengths.
        outputs = self.module(
            input_signal=audio_signal, length=length.to(dtype=torch.long)
        )
        return outputs[0], outputs[1]
37
+
38
+
39
class EncoderWrapper(torch.nn.Module):
    """Wrapper for the cache-aware FastConformer encoder.

    Note: For the realtime EOU model, the encoder is cache-aware which means
    it can operate in a streaming fashion. For CoreML export, we export
    without cache state for simplicity (full-context mode).
    """

    def __init__(
        self, module: torch.nn.Module, frame_stride_sec: float = 0.08
    ) -> None:
        """
        Args:
            module: NeMo encoder invoked as ``module(audio_signal=..., length=...)``.
            frame_stride_sec: Seconds per encoder output frame, used to
                synthesize ``frame_times``. Defaults to 0.08 (the 80 ms
                stride previously hard-coded here), so existing callers are
                unaffected.
        """
        super().__init__()
        self.module = module
        self.frame_stride_sec = float(frame_stride_sec)

    def forward(
        self, features: torch.Tensor, length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Encode mel features.

        Args:
            features: Mel spectrogram batch (passed through unchanged).
            length: [B] valid lengths; cast to int64 for the encoder.

        Returns:
            encoded: encoder states, time on the last axis.
            encoded_lengths: [B] valid output lengths.
            frame_times: [T_enc] per-frame timestamps in seconds.
        """
        encoded, encoded_lengths = self.module(
            audio_signal=features, length=length.to(dtype=torch.long)
        )
        # Synthesize per-frame timestamps (seconds) using the encoder stride.
        # Shape: [T_enc]
        frame_times = (
            torch.arange(encoded.shape[-1], device=encoded.device, dtype=torch.float32)
            * self.frame_stride_sec
        )
        return encoded, encoded_lengths, frame_times
64
+
65
+
66
class DecoderWrapper(torch.nn.Module):
    """Wrapper for the RNNT prediction network (decoder).

    Flattens the LSTM state into explicit ``h``/``c`` tensors so the traced
    graph has a fixed, tensor-only signature.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        h_in: torch.Tensor,
        c_in: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # The wrapped decoder wants int64 ids/lengths and a [h, c] state list.
        decoder_output, _, updated_state = self.module(
            targets=targets.to(dtype=torch.long),
            target_length=target_lengths.to(dtype=torch.long),
            states=[h_in, c_in],
        )
        return decoder_output, updated_state[0], updated_state[1]
87
+
88
+
89
class JointWrapper(torch.nn.Module):
    """Wrapper for the RNNT joint network.

    Note: Unlike Parakeet TDT v3, the realtime EOU model does NOT have
    duration outputs (num_extra_outputs). The joint network outputs only
    token logits over the vocabulary + blank.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor
    ) -> torch.Tensor:
        """Combine encoder [B, D, T] and decoder [B, D, U] states into
        [B, T, U, vocab+blank] logits."""
        # Projection layers consume time-major tensors, so transpose first.
        enc_proj = self.module.enc(encoder_outputs.transpose(1, 2))   # [B, T, H]
        dec_proj = self.module.pred(decoder_outputs.transpose(1, 2))  # [B, U, H]

        # Broadcast-add over T and U explicitly so the converter sees an
        # unambiguous [B, T, U, H] combination.
        combined = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)
        activated = self.module.joint_net[0](combined)   # ReLU
        activated = self.module.joint_net[1](activated)  # Dropout (no-op in eval)
        return self.module.joint_net[2](activated)       # Linear -> logits
119
+
120
+
121
class MelEncoderWrapper(torch.nn.Module):
    """Fused wrapper: waveform -> mel -> encoder.

    Inputs:
        - audio_signal: [B, S]
        - audio_length: [B]

    Outputs:
        - encoder: [B, D, T_enc]
        - encoder_length: [B]
        - frame_times: [T_enc]
    """

    def __init__(
        self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper
    ) -> None:
        super().__init__()
        self.preprocessor = preprocessor
        self.encoder = encoder

    def forward(
        self, audio_signal: torch.Tensor, audio_length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        mel, mel_length = self.preprocessor(audio_signal, audio_length)
        # Encoder wrapper expects int32 lengths (it re-casts internally).
        return self.encoder(mel, mel_length.to(dtype=torch.int32))
147
+
148
+
149
class JointDecisionWrapper(torch.nn.Module):
    """Joint + greedy decision head: outputs token ids and probabilities.

    Unlike Parakeet TDT v3, this model does NOT have duration outputs.

    Inputs:
        - encoder_outputs: [B, D, T]
        - decoder_outputs: [B, D, U]

    Returns:
        - token_id: [B, T, U] int32
        - token_prob: [B, T, U] float32
    """

    def __init__(self, joint: JointWrapper, vocab_size: int) -> None:
        super().__init__()
        self.joint = joint
        self.vocab_with_blank = int(vocab_size) + 1

    def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
        logits = self.joint(encoder_outputs, decoder_outputs)

        # Greedy token per (t, u) cell.
        token_ids = logits.argmax(dim=-1).to(dtype=torch.int32)
        probs = torch.softmax(logits, dim=-1)
        # gather requires int64 indices; cast only for the lookup.
        token_prob = probs.gather(-1, token_ids.long().unsqueeze(-1)).squeeze(-1)

        return token_ids, token_prob
180
+
181
+
182
class JointDecisionSingleStep(torch.nn.Module):
    """Single-step variant for streaming: encoder_step -> token decision.

    Inputs:
        - encoder_step: [B=1, D, T=1]
        - decoder_step: [B=1, D, U=1]

    Returns:
        - token_id: [1, 1, 1] int32
        - token_prob: [1, 1, 1] float32
        - top_k_ids: [1, 1, 1, K] int32
        - top_k_logits: [1, 1, 1, K] float32
    """

    def __init__(self, joint: JointWrapper, vocab_size: int, top_k: int = 64) -> None:
        super().__init__()
        self.joint = joint
        self.vocab_with_blank = int(vocab_size) + 1
        self.top_k = int(top_k)

    def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
        # JointWrapper accepts channel-first [B, D, T] / [B, D, U].
        logits = self.joint(encoder_step, decoder_step)  # [1, 1, 1, V+blank]

        greedy_ids = logits.argmax(dim=-1).to(dtype=torch.int32)
        probs = torch.softmax(logits, dim=-1)
        # gather requires int64 indices; cast only for the lookup.
        greedy_prob = probs.gather(-1, greedy_ids.long().unsqueeze(-1)).squeeze(-1)

        # Top-K candidates for host-side processing; K is clamped to the
        # logit dimension so small vocabularies still work.
        k = min(self.top_k, logits.shape[-1])
        top_logits, top_ids = torch.topk(logits, k=k, dim=-1)
        return greedy_ids, greedy_prob, top_ids.to(dtype=torch.int32), top_logits
218
+
219
+
220
def _coreml_convert(
    traced: torch.jit.ScriptModule,
    inputs,
    outputs,
    settings: ExportSettings,
    compute_units_override: Optional[ct.ComputeUnit] = None,
    compute_precision: Optional[ct.precision] = None,
) -> ct.models.MLModel:
    """Convert a traced module into an mlprogram CoreML model.

    Explicit keyword arguments take precedence over the defaults carried in
    ``settings``.
    """
    selected_units = (
        settings.compute_units
        if compute_units_override is None
        else compute_units_override
    )
    kwargs = {
        "convert_to": "mlprogram",
        "inputs": inputs,
        "outputs": outputs,
        "compute_units": selected_units,
    }
    # Log before adding the optional keys, mirroring the historical output.
    print("Converting:", traced.__class__.__name__)
    print("Conversion kwargs:", kwargs)
    if settings.deployment_target is not None:
        kwargs["minimum_deployment_target"] = settings.deployment_target

    # Priority: explicit argument > settings
    selected_precision = (
        compute_precision
        if compute_precision is not None
        else settings.compute_precision
    )
    if selected_precision is not None:
        kwargs["compute_precision"] = selected_precision

    return ct.convert(traced, **kwargs)
Inference/debug_nemo_streaming.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import soundfile as sf
3
+ import numpy as np
4
+ import nemo.collections.asr as nemo_asr
5
+ from omegaconf import OmegaConf
6
+
7
+ print("DEBUG SCRIPT STARTED")
8
+
9
def debug_streaming_config(
    audio_path: str,
    model_id: str = "nvidia/parakeet_realtime_eou_120m-v1"
):
    """Probe NeMo's cache-aware streaming encoder with different chunk sizes.

    Loads the pretrained model, prints its streaming config, pushes a single
    1.28 s chunk through ``cache_aware_stream_step`` (Experiment 1), then
    streams the whole file in 160 ms chunks with a simple greedy RNNT decode
    (Experiment 3), printing partial and final transcripts.

    Args:
        audio_path: Path to a WAV file (multi-channel audio is downmixed).
        model_id: Pretrained NeMo ASR model identifier.
    """
    print(f"\n{'='*60}")
    print(f"Debugging NeMo Streaming Configuration")
    print(f"{'='*60}")

    # Load model
    print("Loading NeMo model...")
    asr_model = nemo_asr.models.ASRModel.from_pretrained(
        model_id, map_location="cpu"
    )
    asr_model.eval()

    encoder = asr_model.encoder

    # Print current streaming config
    print("\n--- Current Streaming Config ---")
    if hasattr(encoder, 'streaming_cfg'):
        print(encoder.streaming_cfg)
    else:
        print("No streaming_cfg found on encoder!")

    # Experiment 1: run one chunk through the default streaming config.
    print("\n--- Experiment 1: Setting Explicit Streaming Config ---")

    # NOTE(review): NeMo's chunk_size is documented in subsampled "steps";
    # subsampling appears to be 8x with a presumed 10 ms mel stride — confirm.
    audio, sr = sf.read(audio_path)
    if len(audio.shape) > 1:
        audio = audio.mean(axis=1)  # downmix to mono

    # Take a 1.28 s chunk
    chunk_samples = int(1.28 * sr)
    chunk = audio[:chunk_samples]
    chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).float()
    chunk_len = torch.tensor([len(chunk)], dtype=torch.int32)

    # Preprocess to mel features
    with torch.no_grad():
        mel, mel_len = asr_model.preprocessor(
            input_signal=chunk_tensor,
            length=chunk_len
        )

    print(f"\nMel shape: {mel.shape}")

    # Fresh encoder cache for a batch-of-1 streaming session
    cache_last_channel, cache_last_time, cache_len = encoder.get_initial_cache_state(1)

    # Run step
    try:
        with torch.no_grad():
            outputs = encoder.cache_aware_stream_step(
                processed_signal=mel,
                processed_signal_length=mel_len,
                cache_last_channel=cache_last_channel,
                cache_last_time=cache_last_time,
                cache_last_channel_len=cache_len,
            )
        enc_out = outputs[0]
        print(f"Default Config Output Shape: {enc_out.shape}")
        print(f"Default Config Output Mean: {enc_out.mean().item():.4f}")
        print(f"Default Config Output Std: {enc_out.std().item():.4f}")

        # Decode a single frame to sanity-check the joint network
        decoder = asr_model.decoder
        joint = asr_model.joint
        blank_id = int(decoder.blank_idx)
        vocab = asr_model.tokenizer.tokenizer.get_vocab()
        id_to_token = {v: k for k, v in vocab.items()}

        h = torch.zeros(int(decoder.pred_rnn_layers), 1, int(decoder.pred_hidden))
        c = torch.zeros(int(decoder.pred_rnn_layers), 1, int(decoder.pred_hidden))

        # Just check first frame
        enc_frame = enc_out[:, :, 0:1]
        targets = torch.tensor([[blank_id]], dtype=torch.int64)
        target_len = torch.tensor([1], dtype=torch.int64)
        with torch.no_grad():
            dec_out, _, _ = decoder(targets=targets, target_length=target_len, states=[h, c])
            joint_out = joint.joint(enc_frame.transpose(1, 2), dec_out[:, :, :1].transpose(1, 2))
            logits = joint_out.squeeze()
            token_id = logits.argmax().item()
            print(f"Predicted Token ID: {token_id} ({id_to_token.get(token_id, '???')})")

    except Exception as e:
        print(f"Default Config Failed: {e}")

    # Experiment 3: stream the whole file chunk-by-chunk with greedy decode.
    print("\n--- Experiment 3: Multi-chunk Streaming (128 frames / 1280ms) ---")

    audio, sr = sf.read(audio_path)
    if len(audio.shape) > 1:
        audio = audio.mean(axis=1)

    print(f"Audio loaded: {len(audio)} samples, SR: {sr}")

    # 1280 ms at 16 kHz = 20480 samples (printed for reference; overridden
    # below to test 160 ms chunks instead).
    chunk_samples = 20480
    print(f"Chunk samples: {chunk_samples}")

    # Re-initialize encoder cache and decoder state for a fresh stream
    cache_last_channel, cache_last_time, cache_len = encoder.get_initial_cache_state(1)

    decoder = asr_model.decoder
    joint = asr_model.joint
    blank_id = int(decoder.blank_idx)
    vocab = asr_model.tokenizer.tokenizer.get_vocab()
    id_to_token = {v: k for k, v in vocab.items()}

    h = torch.zeros(int(decoder.pred_rnn_layers), 1, int(decoder.pred_hidden))
    c = torch.zeros(int(decoder.pred_rnn_layers), 1, int(decoder.pred_hidden))

    # 160 ms * 16 kHz = 2560 samples
    chunk_samples = 2560
    print(f"Testing with 160ms chunks ({chunk_samples} samples)")

    num_chunks = (len(audio) + chunk_samples - 1) // chunk_samples
    all_tokens = []

    for i in range(num_chunks):
        start = i * chunk_samples
        end = min(start + chunk_samples, len(audio))
        chunk = audio[start:end]

        # Zero-pad the trailing chunk to a full window
        if len(chunk) < chunk_samples:
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)))

        chunk_tensor = torch.from_numpy(chunk).unsqueeze(0).float()
        chunk_len = torch.tensor([len(chunk)], dtype=torch.int32)

        # Preprocess
        with torch.no_grad():
            mel, mel_len = asr_model.preprocessor(
                input_signal=chunk_tensor,
                length=chunk_len
            )

        # Run encoder
        with torch.no_grad():
            outputs = encoder.cache_aware_stream_step(
                processed_signal=mel,
                processed_signal_length=mel_len,
                cache_last_channel=cache_last_channel,
                cache_last_time=cache_last_time,
                cache_last_channel_len=cache_len,
            )

        enc_out = outputs[0]
        cache_last_channel = outputs[2]
        cache_last_time = outputs[3]
        cache_len = outputs[4]

        # Greedy RNNT decode: at most 5 symbols per encoder frame
        chunk_tokens = []
        time_steps = enc_out.shape[2]

        for t in range(time_steps):
            enc_frame = enc_out[:, :, t:t+1]
            current_token = blank_id if not chunk_tokens else chunk_tokens[-1]

            # Max symbols per step
            for _ in range(5):
                targets = torch.tensor([[current_token]], dtype=torch.int64)
                target_len = torch.tensor([1], dtype=torch.int64)
                with torch.no_grad():
                    dec_out, _, (h, c) = decoder(targets=targets, target_length=target_len, states=[h, c])
                    joint_out = joint.joint(enc_frame.transpose(1, 2), dec_out[:, :, :1].transpose(1, 2))
                    logits = joint_out.squeeze()
                    token_id = logits.argmax().item()

                if token_id == blank_id:
                    break

                chunk_tokens.append(token_id)
                current_token = token_id

        all_tokens.extend(chunk_tokens)

        # BUGFIX: SentencePiece marks word starts with '▁' (U+2581), not a
        # plain space, so the old startswith(' ') check never matched and
        # words were glued together (the sibling streaming test already
        # checks for '▁').
        chunk_text = ""
        for tid in chunk_tokens:
            tstr = id_to_token.get(tid, '')
            if tstr.startswith('▁'):
                chunk_text += " " + tstr[1:]
            else:
                chunk_text += tstr
        print(f"Chunk {i+1}: '{chunk_text}' (Mel: {mel.shape}, Enc: {enc_out.shape})")

    # Final text
    final_text = ""
    for tid in all_tokens:
        tstr = id_to_token.get(tid, '')
        if tstr.startswith('▁'):
            final_text += " " + tstr[1:]
        else:
            final_text += tstr
    print(f"\nFinal Text: '{final_text}'")
Inference/print_config.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import nemo.collections.asr as nemo_asr
import torch

# Quick inspection script: dump the streaming-related configuration of the
# Parakeet Realtime EOU checkpoint (streaming_cfg, subsampling factor, and
# the preprocessor block) so the CoreML export can be matched against it.

model_id = "nvidia/parakeet_realtime_eou_120m-v1"
print(f"Loading {model_id}...")
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_id, map_location="cpu")

print("\n=== Model Config ===")

# Use a sentinel so that an attribute explicitly set to None still counts
# as "present" (mirrors hasattr semantics).
_missing = object()

streaming_cfg = getattr(asr_model.encoder, 'streaming_cfg', _missing)
if streaming_cfg is not _missing:
    print(f"Streaming Config: {streaming_cfg}")
else:
    print("No streaming_cfg found on encoder")

subsampling = getattr(asr_model.encoder, 'subsampling_factor', _missing)
if subsampling is not _missing:
    print(f"Subsampling Factor: {subsampling}")
else:
    print("No subsampling_factor found on encoder")

print(f"\nPreprocessor Config:")
print(asr_model.cfg.preprocessor)
Inference/test_full_pytorch_streaming.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Full PyTorch streaming inference with decoder/joint to compare with CoreML."""
3
+
4
+ import torch
5
+ import soundfile as sf
6
+ import numpy as np
7
+ from pathlib import Path
8
+
9
+ import nemo.collections.asr as nemo_asr
10
+
11
+
12
def greedy_decode_streaming(
    encoder_output,
    encoder_length,
    decoder_model,
    joint_model,
    decoder_state,
    blank_id,
    eos_id=None,
    max_symbols_per_step=10
):
    """Greedy RNNT decoding over one chunk of encoder output.

    Walks the encoder frames one at a time; for each frame, repeatedly runs
    the prediction network + joint until blank is emitted (or the per-frame
    symbol cap is hit), carrying the LSTM state across frames and chunks.

    Args:
        encoder_output: Encoder features, shape [B, D, T] (B assumed 1).
        encoder_length: Number of valid frames in ``encoder_output``; frames
            beyond it (e.g. padding) are skipped.
        decoder_model: RNNT prediction network, called with ``targets``,
            ``target_length`` and ``states`` and expected to return
            ``(dec_out, _, (h, c))``.
        joint_model: Object exposing ``.joint(enc, dec)`` returning logits
            of shape [B, 1, 1, vocab].
        decoder_state: ``(h, c)`` LSTM state from the previous chunk.
        blank_id: RNNT blank token id.
        eos_id: Optional end-of-utterance token id; ends symbol emission for
            the current frame when produced.
        max_symbols_per_step: Cap on non-blank emissions per encoder frame.

    Returns:
        ``(tokens, (h, c))``: the emitted token ids and updated LSTM state.
    """
    # BUGFIX: honor encoder_length (previously accepted but ignored) so that
    # padded/invalid trailing frames are not decoded.
    time_steps = min(encoder_output.shape[2], int(encoder_length))

    tokens = []

    # Decoder hidden state (h, c) from previous chunk or zeros
    h, c = decoder_state

    for t in range(time_steps):
        # Get encoder frame [B, D, 1]
        enc_frame = encoder_output[:, :, t:t+1]

        # Prime the predictor with blank, or the last emitted token
        current_token = blank_id if not tokens else tokens[-1]

        symbols_this_frame = 0
        while symbols_this_frame < max_symbols_per_step:
            # Run decoder (prediction network)
            targets = torch.tensor([[current_token]], dtype=torch.int64)
            target_len = torch.tensor([1], dtype=torch.int64)

            with torch.no_grad():
                dec_out, _, (h, c) = decoder_model(
                    targets=targets,
                    target_length=target_len,
                    states=[h, c]
                )

            # Run joint: [B, 1, D_enc] x [B, 1, D_dec] -> logits
            with torch.no_grad():
                joint_out = joint_model.joint(
                    enc_frame.transpose(1, 2),          # [B, 1, D_enc]
                    dec_out[:, :, :1].transpose(1, 2),  # [B, 1, D_dec]
                )

            # [B, 1, 1, vocab] -> [vocab]
            logits = joint_out.squeeze(0).squeeze(0).squeeze(0)
            token_id = logits.argmax().item()

            if token_id == blank_id:
                break  # advance to the next encoder frame

            tokens.append(token_id)
            current_token = token_id
            symbols_this_frame += 1

            # NOTE(review): this only ends the current frame's inner loop;
            # subsequent frames are still decoded after EOS — confirm intended.
            if eos_id is not None and token_id == eos_id:
                break

    return tokens, (h, c)
76
+
77
+
78
def test_full_streaming_inference(
    audio_path: str,
    chunk_ms: int = 320,
    model_id: str = "nvidia/parakeet_realtime_eou_120m-v1"
):
    """Run complete streaming inference including decoder and joint.

    Streams the audio through the cache-aware encoder chunk-by-chunk (with a
    rolling 4 s audio buffer fed to the preprocessor), greedily decodes each
    chunk via ``greedy_decode_streaming``, and prints per-chunk and final
    transcripts. Returns ``(all_tokens, final_text)``.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Full PyTorch Streaming Inference (with Decoder/Joint)")
    print(f"{banner}")
    print(f"Audio: {audio_path}")
    print(f"Chunk size: {chunk_ms}ms\n")

    # Load the pretrained NeMo model on CPU
    print("Loading NeMo model...")
    asr_model = nemo_asr.models.ASRModel.from_pretrained(
        model_id, map_location="cpu"
    )
    asr_model.eval()

    encoder = asr_model.encoder
    decoder = asr_model.decoder
    joint = asr_model.joint

    # Enable RNNT export mode
    decoder._rnnt_export = True

    # Derived configuration
    sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
    chunk_samples = int(chunk_ms / 1000 * sample_rate)
    blank_id = int(decoder.blank_idx)
    vocab = asr_model.tokenizer.tokenizer.get_vocab()
    id_to_token = {tok_id: tok for tok, tok_id in vocab.items()}

    # EOU token may or may not exist in the vocab
    eou_id = vocab.get('<EOU>', None)

    print(f"Vocab size: {len(vocab)}, Blank ID: {blank_id}, EOU ID: {eou_id}")

    # Load audio (must already be at the model's sample rate)
    audio, sr = sf.read(audio_path)
    if sr != sample_rate:
        raise ValueError(f"Audio sample rate {sr} != model rate {sample_rate}")
    if len(audio.shape) > 1:
        audio = audio.mean(axis=1)

    print(f"Audio: {len(audio)} samples ({len(audio)/sample_rate:.2f}s)")

    # Fresh encoder cache for a batch-of-1 stream
    cache_last_channel, cache_last_time, cache_len = encoder.get_initial_cache_state(1)

    # Zero-initialized decoder LSTM state, carried across chunks
    h = torch.zeros(int(decoder.pred_rnn_layers), 1, int(decoder.pred_hidden))
    c = torch.zeros(int(decoder.pred_rnn_layers), 1, int(decoder.pred_hidden))
    decoder_state = (h, c)

    # Rolling 4 s audio buffer so the preprocessor always sees full context
    audio_buffer = np.zeros(int(4.0 * sample_rate), dtype=np.float32)

    num_chunks = (len(audio) + chunk_samples - 1) // chunk_samples
    print(f"Processing {num_chunks} chunks with buffering...\n")

    def _tokens_to_text(ids):
        # SentencePiece pieces starting with '▁' begin a new word.
        parts = []
        for tid in ids:
            piece = id_to_token.get(tid, f"<{tid}>")
            parts.append(" " + piece[1:] if piece.startswith('▁') else piece)
        return "".join(parts)

    all_tokens = []

    for chunk_idx in range(num_chunks):
        lo = chunk_idx * chunk_samples
        hi = min(lo + chunk_samples, len(audio))
        chunk = audio[lo:hi]

        # Zero-pad the trailing chunk to a full window
        if len(chunk) < chunk_samples:
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)))

        # Slide the new audio into the rolling buffer
        audio_buffer = np.roll(audio_buffer, -len(chunk))
        audio_buffer[-len(chunk):] = chunk

        buffer_tensor = torch.from_numpy(audio_buffer).unsqueeze(0).float()
        buffer_len = torch.tensor([len(audio_buffer)], dtype=torch.int32)

        # Preprocess the whole buffer, then slice out the new chunk's frames
        with torch.no_grad():
            mel, mel_len = asr_model.preprocessor(
                input_signal=buffer_tensor,
                length=buffer_len
            )

        # Mel hop assumed to be 10 ms; frames per chunk = chunk_ms / 10.
        stride_ms = 10
        extract_frames = int(chunk_ms / stride_ms)
        if chunk_ms == 1280:
            # Special case: the exported CoreML model expects 129 frames.
            extract_frames = 129

        total_frames = mel.shape[2]
        if total_frames >= extract_frames:
            mel_chunk = mel[:, :, -extract_frames:]
            mel_chunk_len = torch.tensor([extract_frames], dtype=torch.int32)
        else:
            mel_chunk = mel
            mel_chunk_len = torch.tensor([total_frames], dtype=torch.int32)

        # Cache-aware streaming encoder step
        with torch.no_grad():
            outputs = encoder.cache_aware_stream_step(
                processed_signal=mel_chunk,
                processed_signal_length=mel_chunk_len,
                cache_last_channel=cache_last_channel,
                cache_last_time=cache_last_time,
                cache_last_channel_len=cache_len,
            )

        enc_out, enc_len = outputs[0], outputs[1]  # [B, hidden, T], lengths
        cache_last_channel, cache_last_time, cache_len = outputs[2], outputs[3], outputs[4]

        # Greedy decode this chunk, carrying decoder state forward
        chunk_tokens, decoder_state = greedy_decode_streaming(
            enc_out, enc_len.item(),
            decoder, joint, decoder_state,
            blank_id, eou_id
        )
        all_tokens.extend(chunk_tokens)

        chunk_text = _tokens_to_text(chunk_tokens)
        print(f"Chunk {chunk_idx+1}/{num_chunks}: "
              f"enc_frames={enc_len.item()}, "
              f"tokens={len(chunk_tokens)}, "
              f"text=\"{chunk_text.strip()}\"")

    final_text = _tokens_to_text(all_tokens)

    print(f"\n{banner}")
    print(f"Final Result:")
    print(f"{banner}")
    print(f"Text: \"{final_text.strip()}\"")
    print(f"Tokens: {len(all_tokens)}")
    print(f"Token IDs: {all_tokens[:20]}{'...' if len(all_tokens) > 20 else ''}")

    return all_tokens, final_text.strip()
253
+
254
+
255
if __name__ == "__main__":
    audio_path = "she_sells_seashells_16k.wav"

    if not Path(audio_path).exists():
        print(f"ERROR: Audio file not found: {audio_path}")
        raise SystemExit(1)

    # Test with 1280ms chunks (matching Swift implementation)
    print("Testing with 1280ms chunks (matching Swift implementation):")
    tokens, text = test_full_streaming_inference(audio_path, chunk_ms=1280)

    # BUGFIX: this run was labelled 2500ms but passed chunk_ms=1280,
    # silently duplicating the previous test; pass 2500 to match the label.
    print("\n\nTesting with full audio (2500ms):")
    tokens3, text3 = test_full_streaming_inference(audio_path, chunk_ms=2500)

    # Test with 160ms chunks (NVIDIA recommendation)
    print("\n\nTesting with 160ms chunks (NVIDIA recommendation):")
    tokens4, text4 = test_full_streaming_inference(audio_path, chunk_ms=160)

    # Test with 720ms chunks (Possible config value)
    print("\n\nTesting with 720ms chunks (Possible config value):")
    tokens5, text5 = test_full_streaming_inference(audio_path, chunk_ms=720)
README.md CHANGED
@@ -1,128 +1,69 @@
1
- # Parakeet Realtime EOU 120M - CoreML
2
-
3
- CoreML conversion of NVIDIA's [Parakeet Realtime EOU 120M](https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1) streaming speech recognition model for Apple Silicon.
4
-
5
- ## Model Overview
6
-
7
- The Parakeet Realtime EOU 120M is a streaming speech recognition model optimized for:
8
- - **Low latency**: 80-160ms streaming latency
9
- - **End-of-utterance detection**: Emits `<EOU>` token when utterance ends
10
- - **Real-time processing**: Cache-aware streaming FastConformer architecture
11
-
12
- ### Architecture
13
- - **Encoder**: Cache-aware streaming FastConformer with 17 layers
14
- - **Decoder**: RNNT (Recurrent Neural Transducer)
15
- - **Parameters**: 120M
16
- - **Input**: 16kHz mono audio
17
- - **Output**: Text with optional `<EOU>` token
18
- - **Vocab Size**: 1026 tokens (1024 + blank + EOU)
19
-
20
- ## Models
21
-
22
- ### Batch Models (`batch_models/`)
23
-
24
- Split-encoder architecture optimized for batch/offline processing:
25
-
26
- | Component | Format | Description |
27
- |-----------|--------|-------------|
28
- | `preprocessor.mlpackage` | mlpackage | Audio -> Mel spectrogram |
29
- | `pre_encode.mlpackage` | mlpackage | Mel -> Pre-encoded features |
30
- | `conformer_batch.mlpackage` | mlpackage | Conformer encoder (batch mode) |
31
- | `decoder.mlpackage` | mlpackage | RNNT prediction network |
32
- | `joint_decision.mlpackage` | mlpackage | Joint + argmax decision |
33
-
34
- Pre-compiled versions (`.mlmodelc`) are also included for faster loading.
35
-
36
- **Configuration:**
37
- - `mel_dim`: 128
38
- - `hidden_dim`: 512
39
- - `num_layers`: 17
40
- - `mel_frames_per_chunk`: 45
41
-
42
- ### Streaming Models (`streaming_models/`)
43
-
44
- True streaming architecture with cache management:
45
-
46
- | Component | Description |
47
- |-----------|-------------|
48
- | `parakeet_eou_streaming_preprocessor.mlpackage` | Audio -> Mel (streaming) |
49
- | `parakeet_eou_streaming_encoder.mlpackage` | Conformer encoder with cache |
50
- | `parakeet_eou_streaming_decoder.mlpackage` | RNNT decoder |
51
- | `parakeet_eou_streaming_joint_decision.mlpackage` | Joint + argmax |
52
-
53
- **Configuration:**
54
- - `chunk_ms`: 160ms per chunk
55
- - `chunk_samples`: 2560 samples
56
- - Cache shapes: 17 layers x [1, 70, 512] channel cache, [1, 512, 8] time cache
57
-
58
- ## Scripts
59
-
60
- ### Conversion (`scripts/conversion/`)
61
-
62
- - `convert_parakeet_eou.py` - Original conversion script
63
- - `convert_split_encoder.py` - Split encoder conversion for batch models
64
- - `convert_streaming.py` - Streaming model conversion
65
-
66
- ### Inference (`scripts/inference/`)
67
-
68
- - `debug_nemo_streaming.py` - NeMo streaming inference reference
69
- - `test_full_pytorch_streaming.py` - PyTorch streaming test
70
- - `verify_coreml_values.py` - CoreML value verification
71
 
72
  ## Usage
73
 
74
- ### Setup
75
-
76
- ```bash
77
- python3 -m venv .venv
78
- source .venv/bin/activate
79
- pip install torch coremltools numpy soundfile
80
- pip install nemo-toolkit[asr]
81
- ```
82
-
83
- ### Converting from Source
84
-
85
- ```bash
86
- # Batch models
87
- python scripts/conversion/convert_split_encoder.py
88
-
89
- # Streaming models
90
- python scripts/conversion/convert_streaming.py
91
- ```
92
-
93
- ### Loading in Python
94
-
95
- ```python
96
- import coremltools as ct
97
-
98
- # Load batch model
99
- preprocessor = ct.models.MLModel("batch_models/preprocessor.mlpackage")
100
- encoder = ct.models.MLModel("batch_models/conformer_batch.mlpackage")
101
- decoder = ct.models.MLModel("batch_models/decoder.mlpackage")
102
- joint = ct.models.MLModel("batch_models/joint_decision.mlpackage")
103
- ```
104
-
105
- ### Loading in Swift
106
 
107
  ```swift
108
- import CoreML
109
-
110
- let preprocessor = try MLModel(contentsOf: URL(fileURLWithPath: "batch_models/preprocessor.mlpackage"))
111
- let encoder = try MLModel(contentsOf: URL(fileURLWithPath: "batch_models/conformer_batch.mlpackage"))
112
- // ... etc
113
  ```
114
 
115
- ## Platform Requirements
116
-
117
- - **macOS**: 14.0+
118
- - **iOS**: 17.0+
119
- - **Hardware**: Apple Silicon (M1/M2/M3/M4, A14+) recommended for ANE acceleration
 
 
 
 
120
 
121
- ## License
 
122
 
123
- Please refer to NVIDIA's original model license at [nvidia/parakeet_realtime_eou_120m-v1](https://huggingface.co/nvidia/parakeet_realtime_eou_120m-v1).
124
-
125
- ## Acknowledgments
126
-
127
- - Original model by NVIDIA NeMo team
128
- - CoreML conversion by FluidAudio
 
1
+ # Parakeet EOU Integration & Findings
2
+
3
+ This directory contains the scripts and documentation for integrating the NVIDIA Parakeet Realtime EOU 120M model into FluidAudio.
4
+
5
+ ## Executive Summary
6
+
7
+ * **Goal:** Enable low-latency, streaming speech recognition with End-of-Utterance (EOU) detection on Apple Silicon.
8
+ * **Result:** The "Authentic Streaming" mode of the `nvidia/parakeet-realtime-eou-120m-v1` model is **fundamentally broken** (produces garbage output).
9
+ * **Solution:** We implemented a **"Short Batch" strategy**. We use the model's **Batch Encoder** (which works perfectly) with small, fixed-size input chunks (1.28s). This provides usable accuracy (~40% WER) with streaming-like latency (~1.3s).
10
+
11
+ ## Directory Structure
12
+
13
+ * `Conversion/`: Scripts to export the PyTorch model to CoreML.
14
+ * `convert_split_encoder.py`: **(Primary)** Exports the "Short Batch" model (1.28s chunks).
15
+ * `convert_parakeet_eou.py`: Original export script.
16
+ * `individual_components.py`: Shared model definitions.
17
+ * `Inference/`: Scripts to test and verify the model in Python.
18
+ * `test_full_pytorch_streaming.py`: **(Proof)** Demonstrates that the original PyTorch model fails in streaming mode.
19
+ * `debug_nemo_streaming.py`: Debug script for streaming logic.
20
+
21
+ ## The Journey & Findings
22
+
23
+ ### 1. The Streaming Failure
24
+ We initially attempted to use the model's native streaming encoder (`CacheAwareStreamingConfig`).
25
+ * **Observation:** The model produced garbage output (e.g., "z", "znions", "arsith") regardless of the input audio.
26
+ * **Investigation:**
27
+ * We verified the CoreML export numerically against PyTorch (it matched).
28
+ * We implemented audio buffering (NeMo-style) to fix edge artifacts.
29
+ * We tested various chunk sizes (160ms, 320ms, 640ms, 1280ms).
30
+ * **Root Cause:** We ran `test_full_pytorch_streaming.py` using the *original* NeMo library and model. It *also* produced garbage. This confirmed that the **model weights themselves** are likely untrained or incompatible with the streaming configuration exposed in the checkpoint.
31
+
32
+ ### 2. The "Short Batch" Solution
33
+ Since the **Batch Encoder** (FastConformer) works correctly (WER ~3-4% on clean audio), we pivoted to using it for pseudo-streaming.
34
+ * **Method:** We re-exported the Batch Encoder to accept a fixed input size of **128 Mel frames (1.28 seconds)**.
35
+ * **Implementation:** `BatchEouAsrManager.swift` accumulates audio, feeds 1.28s chunks to the encoder, and preserves the RNNT Decoder's state (LSTM hidden/cell states) between chunks to maintain context.
36
+ * **Results:**
37
+ * **Accuracy:** ~40% WER on `test-clean` (100 files). Much better than Streaming (76% WER), though accuracy remains below full-context Batch due to chunking.
38
+ * **Latency:** ~1.3s (chunk size) + processing time.
39
+ * **Performance:** ~23x Real-Time Factor (RTFx) on M2.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  ## Usage
42
 
43
+ ### Swift (Production)
44
+ Use `BatchEouAsrManager` for all transcription.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  ```swift
47
+ let manager = BatchEouAsrManager()
48
+ await manager.initialize()
49
+ let result = try await manager.transcribe(audioSamples)
 
 
50
  ```
51
 
52
+ ### Benchmarking
53
+ * **Short Batch (Working):**
54
+ ```bash
55
+ swift run -c release fluidaudio batch-eou-benchmark --subset test-clean --max-files 100
56
+ ```
57
+ * **Authentic Streaming (Broken - for demo only):**
58
+ ```bash
59
+ swift run -c release fluidaudio eou-benchmark --streaming --chunk-duration 160
60
+ ```
61
 
62
+ ## Model Export
63
+ To re-export the Short Batch model:
64
 
65
+ ```bash
66
+ python3 Scripts/ParakeetEOU/Conversion/convert_split_encoder.py \
67
+ --output-dir Models/ParakeetEOU/ShortBatch \
68
+ --model-id nvidia/parakeet-realtime-eou-120m-v1
69
+ ```
 
StreamingModelConvert/.DS_Store ADDED
Binary file (8.2 kB). View file
 
StreamingModelConvert/metadata.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "nvidia/parakeet_realtime_eou_120m-v1",
3
+ "model_name": "parakeet_realtime_eou_120m-v1-streaming",
4
+ "streaming_mode": true,
5
+ "sample_rate": 16000,
6
+ "chunk_ms": 160,
7
+ "chunk_samples": 2560,
8
+ "max_chunks": 100,
9
+ "vocab_size": 1026,
10
+ "blank_id": 1026,
11
+ "decoder_hidden": 640,
12
+ "decoder_layers": 1,
13
+ "mel_dim": 128,
14
+ "pre_encode_cache_size": 0,
15
+ "cache_shapes": {
16
+ "cache_last_channel": [
17
+ 17,
18
+ 1,
19
+ 70,
20
+ 512
21
+ ],
22
+ "cache_last_time": [
23
+ 17,
24
+ 1,
25
+ 512,
26
+ 8
27
+ ]
28
+ },
29
+ "components": {
30
+ "preprocessor": "parakeet_eou_streaming_preprocessor.mlpackage",
31
+ "encoder": "parakeet_eou_streaming_encoder.mlpackage",
32
+ "decoder": "parakeet_eou_streaming_decoder.mlpackage",
33
+ "joint_decision": "parakeet_eou_streaming_joint_decision.mlpackage"
34
+ }
35
+ }
StreamingModelConvert/parakeet_eou_streaming_decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de2449c825902d5b3beda31501f91d6c6af356f0c2fdfcac570bdf8ad04093bf
3
+ size 6738
StreamingModelConvert/parakeet_eou_streaming_decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b4cacecdcd9df79ab1e56de67230baf5a8664d2afe0bb8f3408eefa972cb2f4
3
+ size 7873600
StreamingModelConvert/parakeet_eou_streaming_decoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "2265DA56-AEEA-4347-9AF4-0F9A3394043D": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "E18CC67E-153B-449A-80AD-EF00FADACC68": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "E18CC67E-153B-449A-80AD-EF00FADACC68"
18
+ }
StreamingModelConvert/parakeet_eou_streaming_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad9917fe7ae6f41a1075adc3f50672226f4addf7b8f6667876c6961ed287b6d4
3
+ size 506025
StreamingModelConvert/parakeet_eou_streaming_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:448df7bd99102f54384094bdbadea537b919b5ed4faa4e5450df53d49fab0a27
3
+ size 213109568
StreamingModelConvert/parakeet_eou_streaming_encoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "BAE06F5C-C2A9-4E4B-976B-384BFB7D720B": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "D77E0309-5E2A-4D96-9879-FB41433AD5CB": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "D77E0309-5E2A-4D96-9879-FB41433AD5CB"
18
+ }
StreamingModelConvert/parakeet_eou_streaming_joint_decision.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9d78f143362b55e1ab343655d4586e0f0bcaa4ee2c5ed0ccbc378015dfd6d15
3
+ size 8697
StreamingModelConvert/parakeet_eou_streaming_joint_decision.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039b2010a269153f5a96edf28637f921a86ef8822f248f2d6712f7a6bce84b4
3
+ size 2794182
StreamingModelConvert/parakeet_eou_streaming_joint_decision.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "540596F0-C7C9-41C4-806F-3AF5CDC03FD1": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "F32E3C88-50AD-471C-9A18-8F982E65CD96": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "F32E3C88-50AD-471C-9A18-8F982E65CD96"
18
+ }
StreamingModelConvert/parakeet_eou_streaming_preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0c000416ecc121df2acae8553ec1efba526a721ac60c5a79052ca0c0666e11c
3
+ size 13785
StreamingModelConvert/parakeet_eou_streaming_preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f257ad1ac11575d73a6ffda555319b2c96b0a224f0dc03ddd8c62950e9b18e53
3
+ size 592384
StreamingModelConvert/parakeet_eou_streaming_preprocessor.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "301A322A-CE26-40B9-85F8-8004DB0A2ABD": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "A2E94F4A-AC34-4A51-9D97-8F00BC6BF3B5": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "A2E94F4A-AC34-4A51-9D97-8F00BC6BF3B5"
18
+ }
StreamingModelConvert/vocab.json ADDED
@@ -0,0 +1,1028 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1": "<unk>",
3
+ "2": "▁t",
4
+ "3": "▁th",
5
+ "4": "▁a",
6
+ "5": "▁i",
7
+ "6": "▁the",
8
+ "7": "▁s",
9
+ "8": "re",
10
+ "9": "▁w",
11
+ "10": "▁o",
12
+ "11": "in",
13
+ "12": "at",
14
+ "13": "er",
15
+ "14": "nd",
16
+ "15": "ou",
17
+ "16": "▁c",
18
+ "17": "▁b",
19
+ "18": "▁h",
20
+ "19": "en",
21
+ "20": "on",
22
+ "21": "▁m",
23
+ "22": "▁f",
24
+ "23": "ing",
25
+ "24": "▁p",
26
+ "25": "▁to",
27
+ "26": "▁and",
28
+ "27": "▁d",
29
+ "28": "an",
30
+ "29": "or",
31
+ "30": "es",
32
+ "31": "▁y",
33
+ "32": "▁l",
34
+ "33": "▁of",
35
+ "34": "ll",
36
+ "35": "▁in",
37
+ "36": "ed",
38
+ "37": "it",
39
+ "38": "▁g",
40
+ "39": "is",
41
+ "40": "▁you",
42
+ "41": "▁n",
43
+ "42": "ar",
44
+ "43": "om",
45
+ "44": "as",
46
+ "45": "ve",
47
+ "46": "▁e",
48
+ "47": "ic",
49
+ "48": "▁it",
50
+ "49": "al",
51
+ "50": "us",
52
+ "51": "▁wh",
53
+ "52": "▁we",
54
+ "53": "▁be",
55
+ "54": "ion",
56
+ "55": "ow",
57
+ "56": "le",
58
+ "57": "▁is",
59
+ "58": "et",
60
+ "59": "ent",
61
+ "60": "ot",
62
+ "61": "ut",
63
+ "62": "▁re",
64
+ "63": "▁on",
65
+ "64": "ay",
66
+ "65": "▁ha",
67
+ "66": "ig",
68
+ "67": "▁so",
69
+ "68": "ct",
70
+ "69": "▁he",
71
+ "70": "▁for",
72
+ "71": "ver",
73
+ "72": "ke",
74
+ "73": "ro",
75
+ "74": "▁st",
76
+ "75": "id",
77
+ "76": "▁go",
78
+ "77": "all",
79
+ "78": "se",
80
+ "79": "ly",
81
+ "80": "▁u",
82
+ "81": "ch",
83
+ "82": "st",
84
+ "83": "ld",
85
+ "84": "▁k",
86
+ "85": "ce",
87
+ "86": "ur",
88
+ "87": "▁li",
89
+ "88": "am",
90
+ "89": "▁r",
91
+ "90": "ht",
92
+ "91": "▁j",
93
+ "92": "ith",
94
+ "93": "▁se",
95
+ "94": "ir",
96
+ "95": "▁as",
97
+ "96": "▁an",
98
+ "97": "im",
99
+ "98": "▁do",
100
+ "99": "ad",
101
+ "100": "▁was",
102
+ "101": "ight",
103
+ "102": "th",
104
+ "103": "▁are",
105
+ "104": "▁but",
106
+ "105": "▁sh",
107
+ "106": "ust",
108
+ "107": "ally",
109
+ "108": "▁not",
110
+ "109": "▁or",
111
+ "110": "▁com",
112
+ "111": "▁can",
113
+ "112": "▁me",
114
+ "113": "op",
115
+ "114": "▁mo",
116
+ "115": "▁at",
117
+ "116": "ill",
118
+ "117": "▁ch",
119
+ "118": "▁ne",
120
+ "119": "ant",
121
+ "120": "▁de",
122
+ "121": "▁kn",
123
+ "122": "▁one",
124
+ "123": "il",
125
+ "124": "ol",
126
+ "125": "▁con",
127
+ "126": "ter",
128
+ "127": "▁ab",
129
+ "128": "▁fr",
130
+ "129": "ere",
131
+ "130": "ck",
132
+ "131": "▁al",
133
+ "132": "▁all",
134
+ "133": "qu",
135
+ "134": "▁pro",
136
+ "135": "▁som",
137
+ "136": "ould",
138
+ "137": "▁tw",
139
+ "138": "ul",
140
+ "139": "ra",
141
+ "140": "od",
142
+ "141": "ers",
143
+ "142": "▁su",
144
+ "143": "ive",
145
+ "144": "▁v",
146
+ "145": "use",
147
+ "146": "ate",
148
+ "147": "ge",
149
+ "148": "if",
150
+ "149": "▁ex",
151
+ "150": "ess",
152
+ "151": "pp",
153
+ "152": "▁lo",
154
+ "153": "out",
155
+ "154": "▁if",
156
+ "155": "est",
157
+ "156": "ain",
158
+ "157": "ist",
159
+ "158": "and",
160
+ "159": "ea",
161
+ "160": "very",
162
+ "161": "art",
163
+ "162": "▁wor",
164
+ "163": "▁my",
165
+ "164": "ab",
166
+ "165": "ment",
167
+ "166": "▁bec",
168
+ "167": "un",
169
+ "168": "ity",
170
+ "169": "ri",
171
+ "170": "pe",
172
+ "171": "ions",
173
+ "172": "▁by",
174
+ "173": "ok",
175
+ "174": "our",
176
+ "175": "ort",
177
+ "176": "ind",
178
+ "177": "ink",
179
+ "178": "nt",
180
+ "179": "▁up",
181
+ "180": "um",
182
+ "181": "▁don",
183
+ "182": "▁get",
184
+ "183": "red",
185
+ "184": "▁out",
186
+ "185": "el",
187
+ "186": "ause",
188
+ "187": "res",
189
+ "188": "▁ma",
190
+ "189": "ich",
191
+ "190": "▁us",
192
+ "191": "rou",
193
+ "192": "▁int",
194
+ "193": "em",
195
+ "194": "os",
196
+ "195": "ies",
197
+ "196": "ie",
198
+ "197": "▁pl",
199
+ "198": "▁tr",
200
+ "199": "ven",
201
+ "200": "ous",
202
+ "201": "▁le",
203
+ "202": "▁two",
204
+ "203": "ard",
205
+ "204": "ine",
206
+ "205": "▁co",
207
+ "206": "een",
208
+ "207": "▁now",
209
+ "208": "ty",
210
+ "209": "her",
211
+ "210": "ack",
212
+ "211": "▁pe",
213
+ "212": "ame",
214
+ "213": "▁how",
215
+ "214": "▁who",
216
+ "215": "▁see",
217
+ "216": "▁tim",
218
+ "217": "ect",
219
+ "218": "ast",
220
+ "219": "▁our",
221
+ "220": "ci",
222
+ "221": "ree",
223
+ "222": "ople",
224
+ "223": "gh",
225
+ "224": "▁no",
226
+ "225": "▁had",
227
+ "226": "▁man",
228
+ "227": "▁qu",
229
+ "228": "▁en",
230
+ "229": "ide",
231
+ "230": "ure",
232
+ "231": "ud",
233
+ "232": "so",
234
+ "233": "▁his",
235
+ "234": "▁sa",
236
+ "235": "▁sp",
237
+ "236": "▁say",
238
+ "237": "ose",
239
+ "238": "ther",
240
+ "239": "▁act",
241
+ "240": "▁ta",
242
+ "241": "▁cl",
243
+ "242": "ings",
244
+ "243": "pt",
245
+ "244": "king",
246
+ "245": "▁any",
247
+ "246": "▁has",
248
+ "247": "▁un",
249
+ "248": "iv",
250
+ "249": "▁im",
251
+ "250": "▁ag",
252
+ "251": "▁te",
253
+ "252": "▁fe",
254
+ "253": "one",
255
+ "254": "per",
256
+ "255": "ong",
257
+ "256": "▁po",
258
+ "257": "▁ad",
259
+ "258": "ff",
260
+ "259": "ore",
261
+ "260": "itt",
262
+ "261": "ans",
263
+ "262": "iz",
264
+ "263": "eah",
265
+ "264": "reat",
266
+ "265": "act",
267
+ "266": "own",
268
+ "267": "hing",
269
+ "268": "enty",
270
+ "269": "age",
271
+ "270": "ber",
272
+ "271": "ice",
273
+ "272": "▁am",
274
+ "273": "ple",
275
+ "274": "are",
276
+ "275": "▁per",
277
+ "276": "und",
278
+ "277": "ite",
279
+ "278": "ix",
280
+ "279": "pl",
281
+ "280": "▁way",
282
+ "281": "▁did",
283
+ "282": "▁pr",
284
+ "283": "▁got",
285
+ "284": "ars",
286
+ "285": "▁she",
287
+ "286": "▁let",
288
+ "287": "ag",
289
+ "288": "▁ac",
290
+ "289": "int",
291
+ "290": "▁ar",
292
+ "291": "ry",
293
+ "292": "ign",
294
+ "293": "ish",
295
+ "294": "��fir",
296
+ "295": "ace",
297
+ "296": "ble",
298
+ "297": "og",
299
+ "298": "ue",
300
+ "299": "▁ye",
301
+ "300": "ap",
302
+ "301": "iff",
303
+ "302": "▁ro",
304
+ "303": "▁her",
305
+ "304": "nder",
306
+ "305": "▁ok",
307
+ "306": "▁res",
308
+ "307": "▁gu",
309
+ "308": "ence",
310
+ "309": "▁may",
311
+ "310": "ated",
312
+ "311": "ip",
313
+ "312": "▁bo",
314
+ "313": "▁him",
315
+ "314": "way",
316
+ "315": "ac",
317
+ "316": "ical",
318
+ "317": "ass",
319
+ "318": "ase",
320
+ "319": "▁dis",
321
+ "320": "able",
322
+ "321": "ick",
323
+ "322": "▁app",
324
+ "323": "ance",
325
+ "324": "▁pre",
326
+ "325": "▁six",
327
+ "326": "▁off",
328
+ "327": "▁new",
329
+ "328": "ia",
330
+ "329": "orm",
331
+ "330": "ank",
332
+ "331": "▁lot",
333
+ "332": "ach",
334
+ "333": "▁fo",
335
+ "334": "inet",
336
+ "335": "ire",
337
+ "336": "ary",
338
+ "337": "ult",
339
+ "338": "▁tal",
340
+ "339": "▁mu",
341
+ "340": "▁bl",
342
+ "341": "ount",
343
+ "342": "sel",
344
+ "343": "vel",
345
+ "344": "▁br",
346
+ "345": "▁imp",
347
+ "346": "ep",
348
+ "347": "cess",
349
+ "348": "ord",
350
+ "349": "▁sc",
351
+ "350": "▁inc",
352
+ "351": "ound",
353
+ "352": "ang",
354
+ "353": "be",
355
+ "354": "ress",
356
+ "355": "uct",
357
+ "356": "▁ind",
358
+ "357": "▁af",
359
+ "358": "ving",
360
+ "359": "▁oh",
361
+ "360": "▁bet",
362
+ "361": "▁use",
363
+ "362": "ome",
364
+ "363": "ens",
365
+ "364": "ys",
366
+ "365": "▁bu",
367
+ "366": "co",
368
+ "367": "ory",
369
+ "368": "ater",
370
+ "369": "ild",
371
+ "370": "ght",
372
+ "371": "ial",
373
+ "372": "▁day",
374
+ "373": "ning",
375
+ "374": "na",
376
+ "375": "ile",
377
+ "376": "▁spe",
378
+ "377": "▁mar",
379
+ "378": "ody",
380
+ "379": "ough",
381
+ "380": "ade",
382
+ "381": "vers",
383
+ "382": "xt",
384
+ "383": "▁fl",
385
+ "384": "▁ke",
386
+ "385": "ian",
387
+ "386": "▁sy",
388
+ "387": "▁put",
389
+ "388": "fore",
390
+ "389": "ub",
391
+ "390": "▁ph",
392
+ "391": "fe",
393
+ "392": "▁em",
394
+ "393": "▁ser",
395
+ "394": "form",
396
+ "395": "ting",
397
+ "396": "te",
398
+ "397": "av",
399
+ "398": "ious",
400
+ "399": "▁rec",
401
+ "400": "ks",
402
+ "401": "▁gr",
403
+ "402": "ces",
404
+ "403": "wn",
405
+ "404": "ors",
406
+ "405": "▁jo",
407
+ "406": "ents",
408
+ "407": "▁des",
409
+ "408": "▁try",
410
+ "409": "▁equ",
411
+ "410": "▁z",
412
+ "411": "▁rem",
413
+ "412": "▁str",
414
+ "413": "self",
415
+ "414": "▁bit",
416
+ "415": "ph",
417
+ "416": "ved",
418
+ "417": "▁why",
419
+ "418": "▁bas",
420
+ "419": "▁hel",
421
+ "420": "▁rel",
422
+ "421": "ath",
423
+ "422": "ject",
424
+ "423": "ail",
425
+ "424": "▁la",
426
+ "425": "ual",
427
+ "426": "▁god",
428
+ "427": "▁nat",
429
+ "428": "erm",
430
+ "429": "day",
431
+ "430": "▁id",
432
+ "431": "ft",
433
+ "432": "▁wr",
434
+ "433": "▁min",
435
+ "434": "ates",
436
+ "435": "▁gen",
437
+ "436": "tain",
438
+ "437": "▁ob",
439
+ "438": "ull",
440
+ "439": "ict",
441
+ "440": "▁tra",
442
+ "441": "▁end",
443
+ "442": "▁hig",
444
+ "443": "▁fif",
445
+ "444": "oth",
446
+ "445": "tern",
447
+ "446": "▁its",
448
+ "447": "vent",
449
+ "448": "▁sm",
450
+ "449": "ons",
451
+ "450": "▁add",
452
+ "451": "iss",
453
+ "452": "▁bel",
454
+ "453": "ful",
455
+ "454": "get",
456
+ "455": "▁ele",
457
+ "456": "▁rep",
458
+ "457": "ak",
459
+ "458": "▁ho",
460
+ "459": "▁pos",
461
+ "460": "▁num",
462
+ "461": "ange",
463
+ "462": "ves",
464
+ "463": "ific",
465
+ "464": "urn",
466
+ "465": "ise",
467
+ "466": "▁cr",
468
+ "467": "▁um",
469
+ "468": "ward",
470
+ "469": "▁reg",
471
+ "470": "ady",
472
+ "471": "ower",
473
+ "472": "uc",
474
+ "473": "▁dec",
475
+ "474": "lic",
476
+ "475": "▁set",
477
+ "476": "▁gon",
478
+ "477": "▁op",
479
+ "478": "▁ear",
480
+ "479": "▁sub",
481
+ "480": "▁sl",
482
+ "481": "les",
483
+ "482": "stem",
484
+ "483": "cial",
485
+ "484": "olog",
486
+ "485": "atch",
487
+ "486": "ily",
488
+ "487": "body",
489
+ "488": "nds",
490
+ "489": "ular",
491
+ "490": "ren",
492
+ "491": "▁own",
493
+ "492": "▁too",
494
+ "493": "cent",
495
+ "494": "ible",
496
+ "495": "pect",
497
+ "496": "ered",
498
+ "497": "ways",
499
+ "498": "teen",
500
+ "499": "▁uh",
501
+ "500": "▁big",
502
+ "501": "▁mod",
503
+ "502": "▁att",
504
+ "503": "▁car",
505
+ "504": "gr",
506
+ "505": "▁acc",
507
+ "506": "ied",
508
+ "507": "mun",
509
+ "508": "ib",
510
+ "509": "▁mon",
511
+ "510": "▁sch",
512
+ "511": "▁pol",
513
+ "512": "▁dat",
514
+ "513": "▁fin",
515
+ "514": "▁sim",
516
+ "515": "▁inv",
517
+ "516": "▁def",
518
+ "517": "ked",
519
+ "518": "▁ent",
520
+ "519": "▁yes",
521
+ "520": "ows",
522
+ "521": "ics",
523
+ "522": "ited",
524
+ "523": "ute",
525
+ "524": "ism",
526
+ "525": "ps",
527
+ "526": "▁ed",
528
+ "527": "▁el",
529
+ "528": "ably",
530
+ "529": "ppen",
531
+ "530": "als",
532
+ "531": "▁ten",
533
+ "532": "ract",
534
+ "533": "ss",
535
+ "534": "▁ass",
536
+ "535": "▁met",
537
+ "536": "gan",
538
+ "537": "▁eng",
539
+ "538": "▁stu",
540
+ "539": "ween",
541
+ "540": "arch",
542
+ "541": "▁gl",
543
+ "542": "▁cor",
544
+ "543": "▁dr",
545
+ "544": "vern",
546
+ "545": "▁ty",
547
+ "546": "▁run",
548
+ "547": "hip",
549
+ "548": "cus",
550
+ "549": "cond",
551
+ "550": "▁ins",
552
+ "551": "irty",
553
+ "552": "▁pub",
554
+ "553": "lud",
555
+ "554": "llow",
556
+ "555": "▁cou",
557
+ "556": "ew",
558
+ "557": "iew",
559
+ "558": "▁sur",
560
+ "559": "ero",
561
+ "560": "ood",
562
+ "561": "ness",
563
+ "562": "▁fun",
564
+ "563": "▁eff",
565
+ "564": "cept",
566
+ "565": "▁ca",
567
+ "566": "▁exp",
568
+ "567": "duct",
569
+ "568": "▁sw",
570
+ "569": "ize",
571
+ "570": "ope",
572
+ "571": "▁par",
573
+ "572": "kes",
574
+ "573": "cy",
575
+ "574": "▁ev",
576
+ "575": "▁ref",
577
+ "576": "ell",
578
+ "577": "▁bus",
579
+ "578": "ug",
580
+ "579": "rib",
581
+ "580": "▁cur",
582
+ "581": "mo",
583
+ "582": "ock",
584
+ "583": "ures",
585
+ "584": "air",
586
+ "585": "▁war",
587
+ "586": "str",
588
+ "587": "▁med",
589
+ "588": "▁wa",
590
+ "589": "▁val",
591
+ "590": "▁sin",
592
+ "591": "blem",
593
+ "592": "▁fam",
594
+ "593": "li",
595
+ "594": "▁far",
596
+ "595": "▁cle",
597
+ "596": "▁col",
598
+ "597": "mon",
599
+ "598": "▁gra",
600
+ "599": "led",
601
+ "600": "ense",
602
+ "601": "tin",
603
+ "602": "ues",
604
+ "603": "its",
605
+ "604": "▁mem",
606
+ "605": "▁inf",
607
+ "606": "▁eas",
608
+ "607": "ideo",
609
+ "608": "▁top",
610
+ "609": "io",
611
+ "610": "pan",
612
+ "611": "▁hum",
613
+ "612": "▁old",
614
+ "613": "ead",
615
+ "614": "▁ord",
616
+ "615": "ric",
617
+ "616": "ants",
618
+ "617": "oy",
619
+ "618": "esn",
620
+ "619": "uck",
621
+ "620": "ason",
622
+ "621": "ced",
623
+ "622": "ool",
624
+ "623": "rat",
625
+ "624": "ouse",
626
+ "625": "▁lar",
627
+ "626": "▁art",
628
+ "627": "▁wee",
629
+ "628": "▁cer",
630
+ "629": "ized",
631
+ "630": "▁mat",
632
+ "631": "con",
633
+ "632": "erg",
634
+ "633": "land",
635
+ "634": "ines",
636
+ "635": "▁chr",
637
+ "636": "▁aut",
638
+ "637": "▁lea",
639
+ "638": "▁sou",
640
+ "639": "oney",
641
+ "640": "tty",
642
+ "641": "▁ple",
643
+ "642": "ulat",
644
+ "643": "oks",
645
+ "644": "▁few",
646
+ "645": "▁sol",
647
+ "646": "▁che",
648
+ "647": "chn",
649
+ "648": "ird",
650
+ "649": "▁bre",
651
+ "650": "▁dur",
652
+ "651": "▁wom",
653
+ "652": "me",
654
+ "653": "izat",
655
+ "654": "eric",
656
+ "655": "ote",
657
+ "656": "▁uni",
658
+ "657": "eren",
659
+ "658": "arn",
660
+ "659": "ross",
661
+ "660": "ices",
662
+ "661": "ten",
663
+ "662": "eral",
664
+ "663": "ever",
665
+ "664": "ieve",
666
+ "665": "lish",
667
+ "666": "ash",
668
+ "667": "▁opp",
669
+ "668": "alth",
670
+ "669": "ger",
671
+ "670": "▁sk",
672
+ "671": "▁red",
673
+ "672": "peri",
674
+ "673": "▁det",
675
+ "674": "▁ext",
676
+ "675": "ner",
677
+ "676": "ah",
678
+ "677": "▁var",
679
+ "678": "▁loc",
680
+ "679": "gram",
681
+ "680": "ists",
682
+ "681": "ives",
683
+ "682": "▁es",
684
+ "683": "▁nor",
685
+ "684": "tro",
686
+ "685": "ale",
687
+ "686": "▁iss",
688
+ "687": "▁pri",
689
+ "688": "gin",
690
+ "689": "az",
691
+ "690": "oc",
692
+ "691": "▁pop",
693
+ "692": "ern",
694
+ "693": "▁sit",
695
+ "694": "ket",
696
+ "695": "▁pa",
697
+ "696": "▁law",
698
+ "697": "ages",
699
+ "698": "br",
700
+ "699": "▁cam",
701
+ "700": "▁mom",
702
+ "701": "osed",
703
+ "702": "▁bro",
704
+ "703": "ne",
705
+ "704": "bs",
706
+ "705": "▁cre",
707
+ "706": "erat",
708
+ "707": "▁sec",
709
+ "708": "▁cap",
710
+ "709": "▁vis",
711
+ "710": "▁pat",
712
+ "711": "ield",
713
+ "712": "iet",
714
+ "713": "▁tri",
715
+ "714": "up",
716
+ "715": "▁bra",
717
+ "716": "ts",
718
+ "717": "▁mot",
719
+ "718": "▁unt",
720
+ "719": "put",
721
+ "720": "bo",
722
+ "721": "ork",
723
+ "722": "mer",
724
+ "723": "ital",
725
+ "724": "▁air",
726
+ "725": "ined",
727
+ "726": "▁beh",
728
+ "727": "▁adv",
729
+ "728": "▁ret",
730
+ "729": "imes",
731
+ "730": "▁tea",
732
+ "731": "ural",
733
+ "732": "sid",
734
+ "733": "ters",
735
+ "734": "▁pur",
736
+ "735": "▁sci",
737
+ "736": "bers",
738
+ "737": "ient",
739
+ "738": "ier",
740
+ "739": "cc",
741
+ "740": "sw",
742
+ "741": "▁av",
743
+ "742": "reen",
744
+ "743": "ode",
745
+ "744": "ont",
746
+ "745": "▁dra",
747
+ "746": "ann",
748
+ "747": "nect",
749
+ "748": "▁x",
750
+ "749": "▁eu",
751
+ "750": "ton",
752
+ "751": "inat",
753
+ "752": "ene",
754
+ "753": "ared",
755
+ "754": "els",
756
+ "755": "▁mor",
757
+ "756": "▁rat",
758
+ "757": "cri",
759
+ "758": "▁men",
760
+ "759": "▁ah",
761
+ "760": "ames",
762
+ "761": "▁arm",
763
+ "762": "eak",
764
+ "763": "▁pay",
765
+ "764": "▁hal",
766
+ "765": "ins",
767
+ "766": "ilit",
768
+ "767": "stit",
769
+ "768": "▁ra",
770
+ "769": "▁leg",
771
+ "770": "cl",
772
+ "771": "pr",
773
+ "772": "▁wal",
774
+ "773": "▁bad",
775
+ "774": "▁ge",
776
+ "775": "roup",
777
+ "776": "▁mus",
778
+ "777": "man",
779
+ "778": "▁gi",
780
+ "779": "eds",
781
+ "780": "▁aw",
782
+ "781": "po",
783
+ "782": "ark",
784
+ "783": "row",
785
+ "784": "▁dep",
786
+ "785": "ully",
787
+ "786": "ral",
788
+ "787": "lect",
789
+ "788": "pend",
790
+ "789": "▁sev",
791
+ "790": "ime",
792
+ "791": "gest",
793
+ "792": "here",
794
+ "793": "▁yet",
795
+ "794": "ted",
796
+ "795": "▁rev",
797
+ "796": "ds",
798
+ "797": "▁ask",
799
+ "798": "less",
800
+ "799": "▁di",
801
+ "800": "ets",
802
+ "801": "line",
803
+ "802": "▁aff",
804
+ "803": "ired",
805
+ "804": "▁est",
806
+ "805": "ken",
807
+ "806": "vid",
808
+ "807": "most",
809
+ "808": "ivid",
810
+ "809": "unch",
811
+ "810": "par",
812
+ "811": "med",
813
+ "812": "rop",
814
+ "813": "ased",
815
+ "814": "eone",
816
+ "815": "▁ve",
817
+ "816": "▁abs",
818
+ "817": "ergy",
819
+ "818": "ret",
820
+ "819": "▁saw",
821
+ "820": "▁ey",
822
+ "821": "▁cal",
823
+ "822": "uat",
824
+ "823": "▁mid",
825
+ "824": "vat",
826
+ "825": "ream",
827
+ "826": "vice",
828
+ "827": "ians",
829
+ "828": "rent",
830
+ "829": "ctor",
831
+ "830": "err",
832
+ "831": "ush",
833
+ "832": "ases",
834
+ "833": "▁suc",
835
+ "834": "erms",
836
+ "835": "ave",
837
+ "836": "angu",
838
+ "837": "ries",
839
+ "838": "▁wo",
840
+ "839": "arts",
841
+ "840": "▁fil",
842
+ "841": "▁fat",
843
+ "842": "▁cho",
844
+ "843": "orts",
845
+ "844": "▁fre",
846
+ "845": "ee",
847
+ "846": "ught",
848
+ "847": "eng",
849
+ "848": "ump",
850
+ "849": "▁bar",
851
+ "850": "ying",
852
+ "851": "ane",
853
+ "852": "▁tem",
854
+ "853": "anks",
855
+ "854": "ury",
856
+ "855": "iat",
857
+ "856": "mit",
858
+ "857": "trol",
859
+ "858": "▁net",
860
+ "859": "▁maj",
861
+ "860": "▁cra",
862
+ "861": "ling",
863
+ "862": "▁fig",
864
+ "863": "orn",
865
+ "864": "icat",
866
+ "865": "pany",
867
+ "866": "▁occ",
868
+ "867": "ott",
869
+ "868": "ands",
870
+ "869": "▁exc",
871
+ "870": "▁mr",
872
+ "871": "ency",
873
+ "872": "rope",
874
+ "873": "itch",
875
+ "874": "▁lit",
876
+ "875": "abil",
877
+ "876": "not",
878
+ "877": "ma",
879
+ "878": "▁typ",
880
+ "879": "▁opt",
881
+ "880": "ob",
882
+ "881": "ser",
883
+ "882": "ety",
884
+ "883": "ms",
885
+ "884": "peci",
886
+ "885": "aces",
887
+ "886": "aut",
888
+ "887": "▁hon",
889
+ "888": "cuss",
890
+ "889": "▁sal",
891
+ "890": "▁sor",
892
+ "891": "att",
893
+ "892": "▁lab",
894
+ "893": "▁har",
895
+ "894": "urch",
896
+ "895": "nded",
897
+ "896": "uce",
898
+ "897": "ids",
899
+ "898": "▁hy",
900
+ "899": "▁fut",
901
+ "900": "▁ste",
902
+ "901": "ours",
903
+ "902": "ems",
904
+ "903": "utes",
905
+ "904": "ng",
906
+ "905": "ta",
907
+ "906": "▁won",
908
+ "907": "▁fa",
909
+ "908": "▁env",
910
+ "909": "ards",
911
+ "910": "▁job",
912
+ "911": "ium",
913
+ "912": "▁dot",
914
+ "913": "▁obv",
915
+ "914": "ina",
916
+ "915": "side",
917
+ "916": "elve",
918
+ "917": "cu",
919
+ "918": "▁jes",
920
+ "919": "▁pot",
921
+ "920": "▁pie",
922
+ "921": "▁tre",
923
+ "922": "▁hey",
924
+ "923": "▁mag",
925
+ "924": "ron",
926
+ "925": "▁key",
927
+ "926": "swer",
928
+ "927": "▁win",
929
+ "928": "ucat",
930
+ "929": "work",
931
+ "930": "ides",
932
+ "931": "▁low",
933
+ "932": "▁vol",
934
+ "933": "▁oth",
935
+ "934": "atic",
936
+ "935": "lf",
937
+ "936": "ads",
938
+ "937": "inds",
939
+ "938": "com",
940
+ "939": "ths",
941
+ "940": "▁ver",
942
+ "941": "ised",
943
+ "942": "lo",
944
+ "943": "▁squ",
945
+ "944": "▁cut",
946
+ "945": "oked",
947
+ "946": "irit",
948
+ "947": "ateg",
949
+ "948": "ppy",
950
+ "949": "mitt",
951
+ "950": "come",
952
+ "951": "hn",
953
+ "952": "igin",
954
+ "953": "mand",
955
+ "954": "▁dam",
956
+ "955": "ho",
957
+ "956": "▁da",
958
+ "957": "▁fur",
959
+ "958": "iron",
960
+ "959": "ilar",
961
+ "960": "▁fac",
962
+ "961": "▁neg",
963
+ "962": "▁ago",
964
+ "963": "ged",
965
+ "964": "miss",
966
+ "965": "enth",
967
+ "966": "▁dou",
968
+ "967": "▁hit",
969
+ "968": "▁guy",
970
+ "969": "▁bi",
971
+ "970": "ove",
972
+ "971": "fess",
973
+ "972": "ples",
974
+ "973": "owed",
975
+ "974": "ured",
976
+ "975": "▁ris",
977
+ "976": "ints",
978
+ "977": "rew",
979
+ "978": "▁sum",
980
+ "979": "▁hu",
981
+ "980": "ploy",
982
+ "981": "ude",
983
+ "982": "ried",
984
+ "983": "▁cir",
985
+ "984": "▁dev",
986
+ "985": "ear",
987
+ "986": "▁tot",
988
+ "987": "▁ann",
989
+ "988": "duc",
990
+ "989": "ik",
991
+ "990": "pon",
992
+ "991": "sted",
993
+ "992": "▁ide",
994
+ "993": "▁'",
995
+ "994": "ipp",
996
+ "995": "▁eat",
997
+ "996": "▁dom",
998
+ "997": "▁",
999
+ "998": "e",
1000
+ "999": "t",
1001
+ "1000": "o",
1002
+ "1001": "a",
1003
+ "1002": "i",
1004
+ "1003": "n",
1005
+ "1004": "s",
1006
+ "1005": "r",
1007
+ "1006": "h",
1008
+ "1007": "l",
1009
+ "1008": "d",
1010
+ "1009": "u",
1011
+ "1010": "c",
1012
+ "1011": "m",
1013
+ "1012": "y",
1014
+ "1013": "g",
1015
+ "1014": "w",
1016
+ "1015": "f",
1017
+ "1016": "p",
1018
+ "1017": "b",
1019
+ "1018": "v",
1020
+ "1019": "k",
1021
+ "1020": "'",
1022
+ "1021": "j",
1023
+ "1022": "x",
1024
+ "1023": "q",
1025
+ "1024": "z",
1026
+ "1025": "<EOU>",
1027
+ "1026": "<EOB>"
1028
+ }