alexwengg commited on
Commit
fe9b550
·
verified ·
1 Parent(s): 31bfda7

Upload 23 files

Browse files
CTCHead.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bb619400f0ca4a5873c5f2bf7bf78a645944d3f4acd544bed689a7a420f4634
3
+ size 2048
CTCHead.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
3
+ size 1051842
CTCHead.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "174E9828-F0D9-496B-B767-165878007DCB": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "ABA1F560-2FDF-40A2-BB0D-DE27A2824BED": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "ABA1F560-2FDF-40A2-BB0D-DE27A2824BED"
18
+ }
Decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95c442cea0d8d78e3de7c45e6a0502a7284b783915971f161a0e58a4e1fa7153
3
+ size 8544
Decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d5c4adf473c11e8c86daae6da87dbf4a0bf1c8b716fdd4a9378906208b41381
3
+ size 7872384
Decoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "7BB82BEE-DB48-4BA0-8F0E-AC39162FD7F3": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "E80560F7-8F68-462F-8B00-ADCB6B6F88F7": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "E80560F7-8F68-462F-8B00-ADCB6B6F88F7"
18
+ }
Encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:987981947a1239c7d9d1936168534058ddeb39e0da2bb0b36f91381f00183b1e
3
+ size 492504
Encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
3
+ size 215143424
Encoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "020BD32E-EB4A-4192-B46F-8CFA4932627D": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "242DF9D4-730A-4735-97CD-5C4C16E79595": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "242DF9D4-730A-4735-97CD-5C4C16E79595"
18
+ }
JointDecision.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7f0f6c8cb9481b7d303268c34ddd7fb9e69e1cfda9880c1dce06a64539cb389
3
+ size 8788
JointDecision.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
3
+ size 2798028
JointDecision.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "82C059F0-F2A7-4566-B14C-7BC1F1E136E2": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "D30A2839-E7D8-40D0-AF8B-72C2EF998325": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "82C059F0-F2A7-4566-B14C-7BC1F1E136E2"
18
+ }
Preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:432a9ebe9eb0bee221560ed7bfef5278fb907652e4e0f20ba03b997c394a9335
3
+ size 19924
Preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1609930989479ea65e0608b2cd6c54fef7f1623cc240cd6d993e24e2491133ac
3
+ size 807968
Preprocessor.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "C99314D3-C9C5-4FAD-8419-E34671E6E467": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "E684B627-55A9-4175-92F1-FA535236EE66": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "E684B627-55A9-4175-92F1-FA535236EE66"
18
+ }
README.md ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Parakeet-TDT-CTC-110M CoreML
2
+
3
+ NVIDIA's Parakeet-TDT-CTC-110M model converted to CoreML format for efficient inference on Apple Silicon.
4
+
5
+ ## Model Description
6
+
7
+ This is a hybrid ASR model with a shared Conformer encoder and two decoder heads:
8
+ - **CTC Head**: Fast greedy decoding, ideal for keyword spotting
9
+ - **TDT Head**: Token-Duration Transducer for high-quality transcription
10
+
11
+ ### Architecture
12
+
13
+ | Component | Description | Size |
14
+ |-----------|-------------|------|
15
+ | Preprocessor | Mel spectrogram extraction | ~1 MB |
16
+ | Encoder | Conformer encoder (shared) | ~400 MB |
17
+ | CTCHead | CTC output projection | ~4 MB |
18
+ | Decoder | TDT prediction network (LSTM) | ~25 MB |
19
+ | JointDecision | TDT joint network | ~6 MB |
20
+
21
+ **Total size**: ~436 MB
22
+
23
+ ### Performance
24
+
25
+ Benchmarked on Earnings22 dataset (772 audio files):
26
+
27
+ | Metric | Value |
28
+ |--------|-------|
29
+ | Keyword Recall | 100% (1309/1309) |
30
+ | WER | 17.97% |
31
+ | RTFx (M4 Pro) | 358x real-time |
32
+
33
+ ## Requirements
34
+
35
+ - macOS 13+ (Ventura or later)
36
+ - Apple Silicon (M1/M2/M3/M4)
37
+ - Python 3.10+
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ # Using uv (recommended)
43
+ uv sync
44
+
45
+ # Or using pip
46
+ pip install -e .
47
+
48
+ # For audio file support (WAV, MP3, etc.)
49
+ pip install -e ".[audio]"
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ ### Python Inference
55
+
56
+ ```python
57
+ from scripts.inference import ParakeetCoreML
58
+
59
+ # Load model
60
+ model = ParakeetCoreML("./model")
61
+
62
+ # Transcribe with TDT (higher quality)
63
+ text = model.transcribe("audio.wav", mode="tdt")
64
+ print(text)
65
+
66
+ # Or use CTC for faster keyword spotting
67
+ text = model.transcribe("audio.wav", mode="ctc")
68
+ print(text)
69
+ ```
70
+
71
+ ### Command Line
72
+
73
+ ```bash
74
+ # TDT decoding (default, higher quality)
75
+ uv run scripts/inference.py --audio audio.wav --model-dir ./model
76
+
77
+ # CTC decoding (faster, good for keyword spotting)
78
+ uv run scripts/inference.py --audio audio.wav --model-dir ./model --mode ctc
79
+ ```
80
+
81
+ ## Model Conversion
82
+
83
+ To convert from the original NeMo model:
84
+
85
+ ```bash
86
+ # Install conversion dependencies
87
+ uv sync --extra convert
88
+
89
+ # Run conversion
90
+ uv run scripts/convert_nemo_to_coreml.py --output-dir ./model
91
+ ```
92
+
93
+ This will:
94
+ 1. Download the original model from NVIDIA (`nvidia/parakeet-tdt_ctc-110m`)
95
+ 2. Convert each component to CoreML format
96
+ 3. Extract vocabulary and create metadata
97
+
98
+ ## File Structure
99
+
100
+ ```
101
+ model/
102
+ ├── Preprocessor.mlpackage # Audio → Mel spectrogram
103
+ ├── Encoder.mlpackage # Mel → Encoder features
104
+ ├── CTCHead.mlpackage # Encoder → CTC log probs
105
+ ├── Decoder.mlpackage # TDT prediction network
106
+ ├── JointDecision.mlpackage # TDT joint network
107
+ ├── vocab.json # Token vocabulary (1024 tokens)
108
+ └── metadata.json # Model configuration
109
+ ```
110
+
111
+ ## Decoding Modes
112
+
113
+ ### TDT Mode (Recommended for Transcription)
114
+ - Uses Token-Duration Transducer decoding
115
+ - Higher accuracy (17.97% WER)
116
+ - Predicts both tokens and durations
117
+ - Best for full transcription tasks
118
+
119
+ ### CTC Mode (Recommended for Keyword Spotting)
120
+ - Greedy CTC decoding
121
+ - Faster inference
122
+ - 100% keyword recall on Earnings22
123
+ - Best for detecting specific words/phrases
124
+
125
+ ## Custom Vocabulary / Keyword Spotting
126
+
127
+ For keyword spotting, CTC mode with custom vocabulary boosting achieves 100% recall:
128
+
129
+ ```python
130
+ # Load custom vocabulary with token IDs
131
+ with open("custom_vocab.json") as f:
132
+ keywords = json.load(f) # {"keyword": [token_ids], ...}
133
+
134
+ # Run CTC decoding
135
+ tokens = model.decode_ctc(encoder_output)
136
+
137
+ # Check for keyword matches
138
+ for keyword, expected_ids in keywords.items():
139
+ if is_subsequence(expected_ids, tokens):
140
+ print(f"Found keyword: {keyword}")
141
+ ```
142
+
143
+ ## License
144
+
145
+ This model conversion is released under the Apache 2.0 License, same as the original NVIDIA model.
146
+
147
+ ## Citation
148
+
149
+ If you use this model, please cite the original NVIDIA work:
150
+
151
+ ```bibtex
152
+ @misc{nvidia_parakeet_tdt_ctc,
153
+ title={Parakeet-TDT-CTC-110M},
154
+ author={NVIDIA},
155
+ year={2024},
156
+ publisher={Hugging Face},
157
+ url={https://huggingface.co/nvidia/parakeet-tdt_ctc-110m}
158
+ }
159
+ ```
160
+
161
+ ## Acknowledgments
162
+
163
+ - Original model by [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)
164
+ - CoreML conversion by [FluidInference](https://github.com/FluidInference)
__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
"""Parakeet-TDT-CTC-110M CoreML scripts."""

from .inference import ParakeetCoreML

# Public API re-exported at package level (used by `from <pkg> import *`).
__all__ = ["ParakeetCoreML"]
convert_nemo_to_coreml.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert NVIDIA Parakeet-TDT-CTC-110M model from NeMo to CoreML format.
4
+
5
+ This script downloads the original NeMo model from NVIDIA and converts it to
6
+ CoreML format with separate components for efficient Apple Silicon inference.
7
+
8
+ Components:
9
+ - Preprocessor: Audio preprocessing (mel spectrogram)
10
+ - Encoder: Conformer encoder (shared between TDT and CTC)
11
+ - CTCHead: CTC output head for keyword spotting
12
+ - Decoder: TDT decoder LSTM
13
+ - JointDecision: TDT joint network for token + duration prediction
14
+
15
+ Usage:
16
+ uv run scripts/convert_nemo_to_coreml.py --output-dir ./model
17
+
18
+ Requirements:
19
+ - Python 3.10+
20
+ - PyTorch
21
+ - NeMo toolkit
22
+ - coremltools
23
+ """
24
+
25
+ import argparse
26
+ import json
27
+ import os
28
+ import shutil
29
+ from pathlib import Path
30
+
31
+ import coremltools as ct
32
+ import numpy as np
33
+ import torch
34
+ import torch.nn as nn
35
+ from nemo.collections.asr.models import EncDecHybridRNNTCTCModel
36
+
37
+
38
def load_nemo_model(model_name: str = "nvidia/parakeet-tdt_ctc-110m") -> EncDecHybridRNNTCTCModel:
    """Download (or load from cache) the pretrained NeMo hybrid model.

    Args:
        model_name: NGC / Hugging Face identifier, or a local .nemo path.

    Returns:
        The hybrid RNNT+CTC model in eval mode, ready for tracing.
    """
    print(f"Loading NeMo model: {model_name}")
    nemo_model = EncDecHybridRNNTCTCModel.from_pretrained(model_name)
    # eval() returns self; disables dropout/batch-norm updates for tracing.
    return nemo_model.eval()
44
+
45
+
46
class PreprocessorWrapper(nn.Module):
    """Thin tracing shim around the NeMo mel-spectrogram preprocessor.

    Adapts NeMo's keyword-argument interface to the positional (audio,
    length) signature torch.jit.trace expects.
    """

    def __init__(self, preprocessor):
        super().__init__()
        self.preprocessor = preprocessor

    def forward(self, audio_signal: torch.Tensor, audio_length: torch.Tensor):
        """Return (mel features, feature lengths) for a batch of raw audio."""
        mel, mel_length = self.preprocessor(
            input_signal=audio_signal, length=audio_length
        )
        return mel, mel_length
59
+
60
+
61
class EncoderWrapper(nn.Module):
    """Thin tracing shim around the shared Conformer encoder.

    Adapts NeMo's keyword-argument interface to a positional signature
    suitable for torch.jit.trace.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, mel: torch.Tensor, mel_length: torch.Tensor):
        """Return (encoded features, encoded lengths) for mel inputs."""
        features, feature_lengths = self.encoder(
            audio_signal=mel, length=mel_length
        )
        return features, feature_lengths
74
+
75
+
76
class CTCHeadWrapper(nn.Module):
    """Tracing shim that turns encoder features into CTC log-probabilities."""

    def __init__(self, ctc_decoder):
        super().__init__()
        self.ctc_decoder = ctc_decoder

    def forward(self, encoder_output: torch.Tensor):
        """Project encoder features to vocab logits, then log-softmax them."""
        vocab_logits = self.ctc_decoder.decoder_layers(encoder_output)
        return torch.log_softmax(vocab_logits, dim=-1)
88
+
89
+
90
class DecoderWrapper(nn.Module):
    """Tracing shim for one step of the TDT prediction (decoder) network."""

    def __init__(self, decoder):
        super().__init__()
        self.decoder = decoder
        # Exposed so callers can size the (num_layers, batch, hidden) LSTM
        # state tensors without reaching into the NeMo decoder.
        self.hidden_size = decoder.pred_hidden

    def forward(self, targets: torch.Tensor, target_length: torch.Tensor,
                h_in: torch.Tensor, c_in: torch.Tensor):
        """Embed targets, advance the LSTM state, and project the output.

        Note: `target_length` is not used by this step function; it is kept
        for interface parity with the NeMo decoder signature.
        """
        prediction_net = self.decoder.prediction_network
        embedded = prediction_net.embed(targets)
        lstm_out, (h_out, c_out) = prediction_net.lstm(embedded, (h_in, c_in))
        projected = prediction_net.dec_out(lstm_out)
        return projected, h_out, c_out
108
+
109
+
110
class JointDecisionWrapper(nn.Module):
    """Tracing shim for the TDT joint: emits token id, confidence, duration."""

    def __init__(self, joint):
        super().__init__()
        self.joint = joint

    def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
        """Fuse one encoder frame with one decoder step; take greedy picks."""
        fused = torch.cat([encoder_step, decoder_step], dim=-1)
        hidden = self.joint.joint_net(fused)

        # Greedy token choice plus its softmax confidence.
        token_logits = self.joint.joint(hidden)
        token_id = torch.argmax(token_logits, dim=-1)
        token_prob = torch.softmax(token_logits, dim=-1).max(dim=-1).values

        # Greedy duration-bin choice from the TDT duration head.
        duration_bin = torch.argmax(self.joint.tdt_joint(hidden), dim=-1)

        return token_id, token_prob, duration_bin
133
+
134
+
135
def convert_preprocessor(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the mel-spectrogram preprocessor and export Preprocessor.mlpackage.

    The exported model accepts 1-15 s of 16 kHz audio (16k-240k samples)
    and returns mel features plus their lengths.
    """
    print("Converting Preprocessor...")

    wrapped = PreprocessorWrapper(model.preprocessor)
    wrapped.eval()

    # Representative tracing input: 15 seconds at 16 kHz.
    sample_audio = torch.randn(1, 240000)
    sample_length = torch.tensor([240000])
    traced = torch.jit.trace(wrapped, (sample_audio, sample_length))

    mlpackage = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="audio_signal", shape=(1, ct.RangeDim(16000, 240000))),
            ct.TensorType(name="audio_length", shape=(1,)),
        ],
        outputs=[
            ct.TensorType(name="mel"),
            ct.TensorType(name="mel_length"),
        ],
        minimum_deployment_target=ct.target.iOS16,
    )

    mlpackage.save(output_dir / "Preprocessor.mlpackage")
    print(" Saved Preprocessor.mlpackage")
163
+
164
+
165
def convert_encoder(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the shared Conformer encoder and export Encoder.mlpackage.

    The exported model takes (1, 80, T) mel features with T in [100, 1500]
    (~1-15 s of audio) and returns encoder features plus their lengths.
    """
    print("Converting Encoder...")

    wrapped = EncoderWrapper(model.encoder)
    wrapped.eval()

    # Representative tracing input: 80 mel bins, ~15 seconds of frames.
    sample_mel = torch.randn(1, 80, 1500)
    sample_len = torch.tensor([1500])
    traced = torch.jit.trace(wrapped, (sample_mel, sample_len))

    mlpackage = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="mel", shape=(1, 80, ct.RangeDim(100, 1500))),
            ct.TensorType(name="mel_length", shape=(1,)),
        ],
        outputs=[
            ct.TensorType(name="encoder"),
            ct.TensorType(name="encoder_length"),
        ],
        minimum_deployment_target=ct.target.iOS16,
    )

    mlpackage.save(output_dir / "Encoder.mlpackage")
    print(" Saved Encoder.mlpackage")
193
+
194
+
195
def convert_ctc_head(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the CTC projection head and export CTCHead.mlpackage.

    Accepts (1, T, 512) encoder features with T in [10, 300] and returns
    per-frame log-probabilities over the vocabulary (incl. blank).
    """
    print("Converting CTCHead...")

    wrapped = CTCHeadWrapper(model.ctc_decoder)
    wrapped.eval()

    # (batch, time, hidden) sample encoder output for tracing.
    sample_features = torch.randn(1, 188, 512)
    traced = torch.jit.trace(wrapped, sample_features)

    mlpackage = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="encoder_output", shape=(1, ct.RangeDim(10, 300), 512)),
        ],
        outputs=[
            ct.TensorType(name="ctc_log_probs"),
        ],
        minimum_deployment_target=ct.target.iOS16,
    )

    mlpackage.save(output_dir / "CTCHead.mlpackage")
    print(" Saved CTCHead.mlpackage")
220
+
221
+
222
def convert_decoder(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the TDT prediction network and export Decoder.mlpackage.

    The exported model performs a single decoder step: it takes the previous
    token plus the LSTM (h, c) state and returns the projected decoder
    output with the updated state.
    """
    print("Converting Decoder...")

    wrapped = DecoderWrapper(model.decoder)
    wrapped.eval()

    state_dim = wrapped.hidden_size
    layer_count = model.decoder.pred_num_layers

    # Single-token step with a zeroed LSTM state for tracing.
    sample_targets = torch.zeros(1, 1, dtype=torch.long)
    sample_target_len = torch.tensor([1])
    zero_h = torch.zeros(layer_count, 1, state_dim)
    zero_c = torch.zeros(layer_count, 1, state_dim)
    traced = torch.jit.trace(wrapped, (sample_targets, sample_target_len, zero_h, zero_c))

    # NOTE(review): "targets" carries no dtype here, so coremltools defaults
    # to float32 — confirm the converted embedding lookup accepts that.
    mlpackage = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="targets", shape=(1, 1)),
            ct.TensorType(name="target_length", shape=(1,)),
            ct.TensorType(name="h_in", shape=(layer_count, 1, state_dim)),
            ct.TensorType(name="c_in", shape=(layer_count, 1, state_dim)),
        ],
        outputs=[
            ct.TensorType(name="decoder"),
            ct.TensorType(name="h_out"),
            ct.TensorType(name="c_out"),
        ],
        minimum_deployment_target=ct.target.iOS16,
    )

    mlpackage.save(output_dir / "Decoder.mlpackage")
    print(" Saved Decoder.mlpackage")
258
+
259
+
260
def convert_joint(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the TDT joint network and export JointDecision.mlpackage.

    Consumes one encoder frame (1, 512, 1) and one decoder step (1, 640, 1)
    and emits the greedy token id, its probability, and a duration bin.
    """
    print("Converting JointDecision...")

    wrapped = JointDecisionWrapper(model.joint)
    wrapped.eval()

    # Single-frame sample inputs for tracing.
    sample_enc = torch.randn(1, 512, 1)
    sample_dec = torch.randn(1, 640, 1)
    traced = torch.jit.trace(wrapped, (sample_enc, sample_dec))

    mlpackage = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="encoder_step", shape=(1, 512, 1)),
            ct.TensorType(name="decoder_step", shape=(1, 640, 1)),
        ],
        outputs=[
            ct.TensorType(name="token_id"),
            ct.TensorType(name="token_prob"),
            ct.TensorType(name="duration_bin"),
        ],
        minimum_deployment_target=ct.target.iOS16,
    )

    mlpackage.save(output_dir / "JointDecision.mlpackage")
    print(" Saved JointDecision.mlpackage")
289
+
290
+
291
def extract_vocabulary(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Dump the token vocabulary to vocab.json and return its size.

    Written as {index: piece}; note JSON serialization turns the integer
    keys into strings ("0", "1", ...), which the inference side re-parses.
    """
    print("Extracting vocabulary...")

    tokens = model.decoding.decoding.vocabulary
    index_to_token = dict(enumerate(tokens))

    with open(output_dir / "vocab.json", "w") as f:
        json.dump(index_to_token, f, indent=2, ensure_ascii=False)

    print(f" Saved vocab.json ({len(index_to_token)} tokens)")
    return len(index_to_token)
303
+
304
+
305
def create_metadata(model: EncDecHybridRNNTCTCModel, output_dir: Path, vocab_size: int):
    """Write metadata.json describing the exported CoreML bundle.

    Records audio limits, vocab sizes, decoder dimensions, and the file
    name of each exported component so the inference wrapper can load them.
    """
    print("Creating metadata...")

    metadata = {
        "model_id": "nvidia/parakeet-tdt_ctc-110m",
        "sample_rate": 16000,
        "max_audio_seconds": 15.0,
        "max_audio_samples": 240000,
        "vocab_size": vocab_size,
        "vocab_with_blank": vocab_size + 1,
        # Number of TDT duration bins produced by the joint network.
        "num_extra": 5,
        "decoder_hidden_dim": model.decoder.pred_hidden,
        "decoder_num_layers": model.decoder.pred_num_layers,
        "components": {
            "preprocessor": "Preprocessor.mlpackage",
            "encoder": "Encoder.mlpackage",
            "ctc_head": "CTCHead.mlpackage",
            "decoder": "Decoder.mlpackage",
            "joint_decision": "JointDecision.mlpackage",
        },
    }

    with open(output_dir / "metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    print(" Saved metadata.json")
332
+
333
+
334
def main():
    """CLI entry point: convert every component and write vocab/metadata."""
    parser = argparse.ArgumentParser(description="Convert Parakeet-TDT-CTC-110M to CoreML")
    parser.add_argument("--output-dir", type=str, default="./model",
                        help="Output directory for CoreML models")
    parser.add_argument("--model-name", type=str, default="nvidia/parakeet-tdt_ctc-110m",
                        help="NeMo model name or path")
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    model = load_nemo_model(args.model_name)

    # Export each CoreML component.
    for converter in (convert_preprocessor, convert_encoder, convert_ctc_head,
                      convert_decoder, convert_joint):
        converter(model, output_dir)

    # Vocabulary first, since metadata records its size.
    vocab_size = extract_vocabulary(model, output_dir)
    create_metadata(model, output_dir, vocab_size)

    print(f"\nConversion complete! Models saved to: {output_dir}")
    print("\nTo compile models for Apple Silicon:")
    print(" xcrun coremlcompiler compile Encoder.mlpackage Encoder.mlmodelc")


if __name__ == "__main__":
    main()
inference.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Inference script for Parakeet-TDT-CTC-110M CoreML model.
4
+
5
+ This script demonstrates how to run inference using the converted CoreML models
6
+ on Apple Silicon. It supports both TDT (Token-Duration Transducer) decoding for
7
+ full transcription and CTC decoding for keyword spotting.
8
+
9
+ Usage:
10
+ uv run scripts/inference.py --audio audio.wav --mode tdt
11
+ uv run scripts/inference.py --audio audio.wav --mode ctc
12
+
13
+ Requirements:
14
+ - macOS 13+ with Apple Silicon
15
+ - Python 3.10+
16
+ - coremltools
17
+ """
18
+
19
+ import argparse
20
+ import json
21
+ from pathlib import Path
22
+
23
+ import coremltools as ct
24
+ import numpy as np
25
+
26
+
27
class ParakeetCoreML:
    """CoreML inference wrapper for Parakeet-TDT-CTC-110M.

    Loads the five exported .mlpackage components plus vocab.json and
    metadata.json from a model directory and exposes two decoding modes
    over 16 kHz mono audio: TDT (transcription quality) and greedy CTC
    (keyword spotting).
    """

    def __init__(self, model_dir: str):
        """Load CoreML models from directory.

        Args:
            model_dir: Path to directory containing .mlpackage files
        """
        self.model_dir = Path(model_dir)

        # Model limits and decoder dimensions written by the converter.
        with open(self.model_dir / "metadata.json") as f:
            self.metadata = json.load(f)

        # vocab.json maps token id -> SentencePiece piece; JSON stringifies
        # the integer keys, so re-parse them here.
        with open(self.model_dir / "vocab.json") as f:
            vocab_dict = json.load(f)
        self.vocab = {int(k): v for k, v in vocab_dict.items()}

        self.blank_id = len(self.vocab)  # Blank token is last

        # Load models
        print("Loading CoreML models...")
        self.preprocessor = ct.models.MLModel(
            str(self.model_dir / "Preprocessor.mlpackage")
        )
        self.encoder = ct.models.MLModel(
            str(self.model_dir / "Encoder.mlpackage")
        )
        self.ctc_head = ct.models.MLModel(
            str(self.model_dir / "CTCHead.mlpackage")
        )
        self.decoder = ct.models.MLModel(
            str(self.model_dir / "Decoder.mlpackage")
        )
        self.joint = ct.models.MLModel(
            str(self.model_dir / "JointDecision.mlpackage")
        )
        print("Models loaded successfully.")

    def load_audio(self, audio_path: str) -> np.ndarray:
        """Load audio file and convert to 16kHz mono float32 in [-1, 1].

        Prefers librosa (WAV, MP3, ...); falls back to scipy for WAV files.

        Args:
            audio_path: Path to audio file (WAV, MP3, etc.)

        Returns:
            Audio samples as float32 numpy array
        """
        try:
            import librosa
            audio, sr = librosa.load(audio_path, sr=16000, mono=True)
            return audio.astype(np.float32)
        except ImportError:
            # Fallback to scipy for WAV files
            from scipy.io import wavfile
            sr, audio = wavfile.read(audio_path)

            # Remember the on-disk PCM dtype before any float-promoting ops.
            src_dtype = audio.dtype

            # Convert to mono if stereo (mean() promotes to float64)
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

            # Normalize integer PCM to [-1, 1] BEFORE resampling.
            # Bug fix: scipy.signal.resample returns float64, so the old
            # dtype check placed after resampling never fired for non-16kHz
            # integer input, leaving samples at full integer scale.
            if src_dtype == np.int16:
                audio = audio.astype(np.float32) / 32768.0
            elif src_dtype == np.int32:
                audio = audio.astype(np.float32) / 2147483648.0

            # Resample if needed
            if sr != 16000:
                from scipy import signal
                num_samples = int(len(audio) * 16000 / sr)
                audio = signal.resample(audio, num_samples)

            return audio.astype(np.float32)

    def preprocess(self, audio: np.ndarray) -> tuple[np.ndarray, int]:
        """Convert audio to mel spectrogram.

        Args:
            audio: Audio samples as float32 array

        Returns:
            Tuple of (mel spectrogram, mel length)
        """
        audio_signal = audio.reshape(1, -1).astype(np.float32)
        audio_length = np.array([len(audio)], dtype=np.int32)

        result = self.preprocessor.predict({
            "audio_signal": audio_signal,
            "audio_length": audio_length
        })

        return result["mel"], int(result["mel_length"][0])

    def encode(self, mel: np.ndarray, mel_length: int) -> tuple[np.ndarray, int]:
        """Run encoder on mel spectrogram.

        Args:
            mel: Mel spectrogram from preprocessor
            mel_length: Length of mel spectrogram

        Returns:
            Tuple of (encoder output, encoder length)
        """
        result = self.encoder.predict({
            "mel": mel,
            "mel_length": np.array([mel_length], dtype=np.int32)
        })

        return result["encoder"], int(result["encoder_length"][0])

    def decode_ctc(self, encoder_output: np.ndarray) -> list[int]:
        """CTC greedy decoding.

        Args:
            encoder_output: Output from encoder

        Returns:
            List of token IDs (with duplicates and blanks removed)
        """
        result = self.ctc_head.predict({"encoder_output": encoder_output})
        log_probs = result["ctc_log_probs"]

        # Greedy decoding: take argmax at each timestep
        predictions = np.argmax(log_probs[0], axis=-1)

        # Standard CTC collapse: drop repeats, then drop blanks.
        tokens = []
        prev_token = self.blank_id
        for token in predictions:
            if token != self.blank_id and token != prev_token:
                tokens.append(int(token))
            prev_token = token

        return tokens

    def decode_tdt(self, encoder_output: np.ndarray, encoder_length: int) -> list[int]:
        """TDT (Token-Duration Transducer) greedy decoding.

        Args:
            encoder_output: Output from encoder
            encoder_length: Length of encoder output

        Returns:
            List of token IDs
        """
        hidden_size = self.metadata["decoder_hidden_dim"]
        num_layers = self.metadata["decoder_num_layers"]

        # Zeroed LSTM state to start decoding.
        h = np.zeros((num_layers, 1, hidden_size), dtype=np.float32)
        c = np.zeros((num_layers, 1, hidden_size), dtype=np.float32)

        # Start with blank token
        targets = np.zeros((1, 1), dtype=np.int32)
        target_length = np.array([1], dtype=np.int32)

        tokens = []
        frame = 0
        max_tokens = 1000  # Safety limit against degenerate loops

        while frame < encoder_length and len(tokens) < max_tokens:
            # One prediction-network step conditioned on the last token.
            decoder_result = self.decoder.predict({
                "targets": targets,
                "target_length": target_length,
                "h_in": h,
                "c_in": c
            })

            decoder_output = decoder_result["decoder"]
            h = decoder_result["h_out"]
            c = decoder_result["c_out"]

            # Current encoder frame, shaped (1, features, 1) for the joint.
            encoder_step = encoder_output[0, frame:frame+1, :].T.reshape(1, -1, 1)
            decoder_step = decoder_output.T.reshape(1, -1, 1)

            # Joint prediction: greedy token + duration bin.
            joint_result = self.joint.predict({
                "encoder_step": encoder_step.astype(np.float32),
                "decoder_step": decoder_step.astype(np.float32)
            })

            token_id = int(joint_result["token_id"])
            duration_bin = int(joint_result["duration_bin"])

            # Duration bins map to frame advances 0..4 (bin 4 clamped).
            durations = [0, 1, 2, 3, 4]
            duration = durations[min(duration_bin, 4)]

            if token_id != self.blank_id:
                tokens.append(token_id)
                # Condition the next decoder step on the emitted token.
                targets = np.array([[token_id]], dtype=np.int32)

            # Advance at least one frame to guarantee termination.
            frame += max(1, duration)

        return tokens

    def tokens_to_text(self, tokens: list[int]) -> str:
        """Convert token IDs to text.

        Args:
            tokens: List of token IDs

        Returns:
            Decoded text string
        """
        pieces = [self.vocab.get(t, "") for t in tokens]
        # SentencePiece uses "▁" (U+2581) as the word-boundary marker.
        text = "".join(pieces).replace("▁", " ").strip()
        return text

    def transcribe(self, audio_path: str, mode: str = "tdt") -> str:
        """Transcribe audio file.

        Args:
            audio_path: Path to audio file
            mode: Decoding mode - "tdt" for full transcription, "ctc" for keyword spotting

        Returns:
            Transcribed text
        """
        # Load and preprocess audio
        audio = self.load_audio(audio_path)
        mel, mel_length = self.preprocess(audio)

        # Encode
        encoder_output, encoder_length = self.encode(mel, mel_length)

        # Decode
        if mode == "ctc":
            tokens = self.decode_ctc(encoder_output)
        else:
            tokens = self.decode_tdt(encoder_output, encoder_length)

        # Convert to text
        text = self.tokens_to_text(tokens)

        return text
271
+
272
+
273
def main():
    """CLI entry point: transcribe one audio file with the chosen decoder."""
    parser = argparse.ArgumentParser(
        description="Run inference with Parakeet-TDT-CTC-110M CoreML model"
    )
    parser.add_argument(
        "--audio", type=str, required=True,
        help="Path to audio file (WAV, MP3, etc.)"
    )
    parser.add_argument(
        "--model-dir", type=str, default="./model",
        help="Directory containing CoreML model files"
    )
    parser.add_argument(
        "--mode", type=str, choices=["tdt", "ctc"], default="tdt",
        help="Decoding mode: 'tdt' for transcription, 'ctc' for keyword spotting"
    )
    args = parser.parse_args()

    transcriber = ParakeetCoreML(args.model_dir)

    print(f"\nTranscribing: {args.audio}")
    print(f"Mode: {args.mode.upper()}")
    print("-" * 40)

    transcript = transcriber.transcribe(args.audio, mode=args.mode)
    print(f"Result: {transcript}")


if __name__ == "__main__":
    main()
metadata.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "nvidia/parakeet-tdt_ctc-110m",
3
+ "sample_rate": 16000,
4
+ "max_audio_seconds": 15.0,
5
+ "max_audio_samples": 240000,
6
+ "vocab_size": 1024,
7
+ "vocab_with_blank": 1025,
8
+ "num_extra": 5,
9
+ "decoder_hidden_dim": 640,
10
+ "decoder_num_layers": 1,
11
+ "components": {
12
+ "preprocessor": "Preprocessor.mlpackage",
13
+ "encoder": "Encoder.mlpackage",
14
+ "ctc_head": "CTCHead.mlpackage",
15
+ "decoder": "Decoder.mlpackage",
16
+ "joint_decision": "JointDecision.mlpackage"
17
+ }
18
+ }
pyproject.toml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "parakeet-tdt-ctc-110m-coreml"
3
+ version = "1.0.0"
4
+ description = "NVIDIA Parakeet-TDT-CTC-110M converted to CoreML format for Apple Silicon"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = { text = "Apache-2.0" }
8
+ authors = [
9
+ { name = "FluidInference" }
10
+ ]
11
+ keywords = ["asr", "speech-recognition", "coreml", "apple-silicon", "nvidia", "parakeet"]
12
+ classifiers = [
13
+ "Development Status :: 4 - Beta",
14
+ "Intended Audience :: Developers",
15
+ "License :: OSI Approved :: Apache Software License",
16
+ "Operating System :: MacOS",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
23
+ ]
24
+
25
+ dependencies = [
26
+ "coremltools>=7.0",
27
+ "numpy>=1.24.0",
28
+ "scipy>=1.10.0",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ convert = [
33
+ "torch>=2.0.0",
34
+ "nemo-toolkit[asr]>=1.20.0",
35
+ ]
36
+ audio = [
37
+ "librosa>=0.10.0",
38
+ ]
39
+ dev = [
40
+ "pytest>=7.0.0",
41
+ "ruff>=0.1.0",
42
+ ]
43
+
44
+ [project.urls]
45
+ Homepage = "https://huggingface.co/FluidInference/parakeet-tdt-ctc-110m-coreml"
46
+ Repository = "https://huggingface.co/FluidInference/parakeet-tdt-ctc-110m-coreml"
47
+ Issues = "https://github.com/FluidInference/fluidaudio/issues"
48
+
49
+ [project.scripts]
50
+ parakeet-inference = "scripts.inference:main"
51
+
52
+ [build-system]
53
+ requires = ["hatchling"]
54
+ build-backend = "hatchling.build"
55
+
56
+ [tool.hatch.build.targets.wheel]
57
+ packages = ["scripts"]
58
+
59
+ [tool.ruff]
60
+ line-length = 100
61
+ target-version = "py310"
62
+
63
+ [tool.ruff.lint]
64
+ select = ["E", "F", "I", "N", "W"]
65
+ ignore = ["E501"]
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json CHANGED
@@ -1 +1 @@
1
- {"0": "<unk>", "1": " t", "2": " th", "3": " a", "4": "in", "5": "re", "6": " the", "7": " w", "8": " s", "9": " o", "10": "er", "11": "ou", "12": "at", "13": "nd", "14": "it", "15": " h", "16": " c", "17": " b", "18": "is", "19": "en", "20": "on", "21": "ing", "22": " f", "23": " to", "24": " m", "25": "es", "26": " p", "27": "or", "28": "an", "29": " d", "30": "ll", "31": " I", "32": "ed", "33": " and", "34": " l", "35": " of", "36": " in", "37": " y", "38": "ar", "39": " g", "40": " you", "41": "as", "42": "om", "43": " n", "44": "ve", "45": " that", "46": "le", "47": "ic", "48": "us", "49": "ow", "50": "et", "51": "al", "52": " e", "53": "ut", "54": " it", "55": "ot", "56": " be", "57": " T", "58": "ion", "59": " is", "60": " wh", "61": " re", "62": " on", "63": " we", "64": "ent", "65": " A", "66": "ay", "67": " ha", "68": " Th", "69": "id", "70": " S", "71": "ac", "72": "gh", "73": "ver", "74": "ke", "75": " for", "76": "im", "77": "ly", "78": "ur", "79": "ld", "80": " he", "81": " st", "82": "all", "83": "ro", "84": "st", "85": "se", "86": "ct", "87": "ith", "88": "ir", "89": "am", "90": " this", "91": "if", "92": " W", "93": "oo", "94": "ri", "95": " was", "96": "ght", "97": " u", "98": " with", "99": "ad", "100": "ch", "101": " se", "102": " k", "103": " an", "104": " The", "105": " li", "106": " do", "107": " B", "108": " have", "109": " as", "110": "th", "111": " are", "112": " sh", "113": "ust", "114": "ce", "115": "ally", "116": "ill", "117": " H", "118": " j", "119": "ter", "120": " go", "121": " And", "122": "ation", "123": " C", "124": " so", "125": "ome", "126": " not", "127": "op", "128": "il", "129": "ore", "130": " ne", "131": " can", "132": " me", "133": " at", "134": "ould", "135": "ant", "136": " M", "137": " like", "138": "ere", "139": " they", "140": "ra", "141": "ers", "142": " ab", "143": " de", "144": " kn", "145": "ge", "146": " Y", "147": " ch", "148": "ul", "149": "pp", "150": " or", "151": " al", "152": " con", "153": " com", 
"154": "ess", "155": " su", "156": "out", "157": " your", "158": " So", "159": "ate", "160": " one", "161": " all", "162": " ex", "163": "est", "164": " fr", "165": " just", "166": " pro", "167": " know", "168": " O", "169": "ain", "170": " but", "171": "ol", "172": "ive", "173": " v", "174": "use", "175": "very", "176": "art", "177": "qu", "178": " my", "179": "el", "180": " N", "181": "nt", "182": " It", "183": " what", "184": "ab", "185": " P", "186": " wor", "187": " out", "188": " there", "189": " up", "190": "um", "191": " from", "192": "pe", "193": " tw", "194": " r", "195": "and", "196": "ight", "197": "ort", "198": "un", "199": " L", "200": "ist", "201": " about", "202": "ide", "203": "ig", "204": "ake", "205": " D", "206": "em", "207": "os", "208": "king", "209": "rou", "210": "ind", "211": "our", "212": "res", "213": " We", "214": " get", "215": " E", "216": " G", "217": "ack", "218": " le", "219": "ity", "220": "od", "221": " F", "222": "ard", "223": " pl", "224": " our", "225": " int", "226": "ment", "227": " will", "228": "ies", "229": " by", "230": "ink", "231": "ca", "232": " if", "233": "red", "234": "her", "235": "ie", "236": " us", "237": " some", "238": " don", "239": "ven", "240": "ood", "241": "ast", "242": " R", "243": " his", "244": " tim", "245": " tr", "246": " more", "247": "ich", "248": "ous", "249": "ame", "250": " going", "251": " had", "252": " them", "253": "ook", "254": " pe", "255": " Wh", "256": " You", "257": " But", "258": "ine", "259": " here", "260": " would", "261": "cause", "262": "right", "263": "so", "264": "ost", "265": "ure", "266": " has", "267": "ect", "268": " think", "269": " fe", "270": "ong", "271": " see", "272": " when", "273": " who", "274": " were", "275": " really", "276": " their", "277": " want", "278": "one", "279": "ople", "280": " then", "281": " time", "282": " sa", "283": "ap", "284": " te", "285": " He", "286": " ye", "287": "ck", "288": " her", "289": " thing", "290": " right", "291": " which", "292": 
"itt", "293": "ice", "294": "act", "295": " people", "296": "ty", "297": " two", "298": " J", "299": " im", "300": "ther", "301": "ci", "302": "ose", "303": " cl", "304": " qu", "305": " man", "306": " also", "307": "ree", "308": " en", "309": "ud", "310": " how", "311": "reat", "312": "ak", "313": "hing", "314": "ag", "315": " any", "316": "ff", "317": "ace", "318": "per", "319": " because", "320": " very", "321": "own", "322": " ad", "323": " act", "324": " been", "325": " now", "326": " ag", "327": " into", "328": " comp", "329": "ars", "330": "ions", "331": "are", "332": "ite", "333": "iv", "334": " these", "335": "ays", "336": "ep", "337": " This", "338": " she", "339": "ans", "340": "ah", "341": "een", "342": " over", "343": "ry", "344": " lo", "345": "age", "346": " pr", "347": " sp", "348": "ue", "349": " co", "350": "ick", "351": "ber", "352": " did", "353": "ip", "354": "ach", "355": " back", "356": " no", "357": " cont", "358": " other", "359": " every", "360": "pt", "361": " need", "362": " him", "363": " U", "364": " In", "365": " work", "366": "irst", "367": " part", "368": " look", "369": "ittle", "370": "ble", "371": "iz", "372": " un", "373": " make", "374": "omet", "375": "nder", "376": "ish", "377": "na", "378": " little", "379": " off", "380": " than", "381": " got", "382": "ually", "383": " per", "384": " good", "385": " way", "386": " could", "387": " ac", "388": " imp", "389": "able", "390": " where", "391": "iff", "392": " That", "393": " res", "394": "ount", "395": "pl", "396": "ance", "397": " first", "398": " ro", "399": " pre", "400": "ass", "401": " say", "402": "int", "403": "ated", "404": "ire", "405": "uch", "406": "ase", "407": " somet", "408": "ound", "409": " down", "410": " diff", "411": "sel", "412": " gu", "413": " am", "414": "ress", "415": " lot", "416": "ence", "417": " dis", "418": "orm", "419": "ix", "420": " po", "421": "ving", "422": "enty", "423": " K", "424": " spe", "425": "und", "426": "he", "427": " much", "428": " 
ar", "429": "round", "430": " app", "431": "co", "432": "ark", "433": " new", "434": "ater", "435": "ult", "436": "end", "437": " even", "438": " start", "439": "ations", "440": "rough", "441": "ile", "442": "fter", "443": " well", "444": "be", "445": " They", "446": " three", "447": "ign", "448": "ild", "449": " said", "450": "ough", "451": "ang", "452": " too", "453": "ade", "454": " bl", "455": "ens", "456": " inc", "457": "ia", "458": " those", "459": " mo", "460": " take", "461": " through", "462": " fl", "463": " kind", "464": " things", "465": " bet", "466": " only", "467": " St", "468": " let", "469": "cess", "470": " Ch", "471": "ary", "472": "vel", "473": " If", "474": "xt", "475": "other", "476": "av", "477": "ical", "478": "ord", "479": " again", "480": " something", "481": "onna", "482": "fore", "483": " may", "484": "ting", "485": " bu", "486": " differe", "487": "urn", "488": " gonna", "489": " does", "490": "uct", "491": "og", "492": " twenty", "493": " gr", "494": " Ye", "495": "wn", "496": " should", "497": " comm", "498": "ition", "499": " under", "500": " hel", "501": "ory", "502": " fo", "503": " use", "504": "igh", "505": "ife", "506": " actually", "507": " tal", "508": " call", "509": "ents", "510": "ious", "511": "ull", "512": " There", "513": " Yeah", "514": " most", "515": " ke", "516": "ors", "517": "ved", "518": "ys", "519": " sc", "520": " happ", "521": "ope", "522": " help", "523": "atch", "524": " What", "525": " rem", "526": "ple", "527": " Now", "528": " br", "529": "ool", "530": "oth", "531": " four", "532": "self", "533": " str", "534": "ne", "535": "thing", "536": " put", "537": "ial", "538": " great", "539": "ail", "540": "ub", "541": "ning", "542": " sm", "543": " feel", "544": " five", "545": "ody", "546": "undred", "547": "iss", "548": "ank", "549": "get", "550": "aking", "551": " many", "552": " hundred", "553": " years", "554": " being", "555": " come", "556": " mean", "557": "ily", "558": " different", "559": " after", 
"560": " ser", "561": " show", "562": "form", "563": "ful", "564": "oy", "565": " six", "566": " vide", "567": " V", "568": " its", "569": " point", "570": " day", "571": " des", "572": "ons", "573": " bit", "574": " bel", "575": " before", "576": " aw", "577": " end", "578": " Oh", "579": " still", "580": "ath", "581": " long", "582": " '", "583": "ise", "584": "ob", "585": "day", "586": " add", "587": "ft", "588": "ves", "589": "ces", "590": "ady", "591": " cr", "592": " around", "593": " try", "594": "les", "595": "vers", "596": "kay", "597": "ian", "598": "ates", "599": " find", "600": "ward", "601": " As", "602": " eight", "603": "lic", "604": " same", "605": " pos", "606": " em", "607": " made", "608": " supp", "609": " life", "610": " Be", "611": "pect", "612": " dec", "613": " play", "614": "ange", "615": " att", "616": " pers", "617": "ways", "618": " high", "619": " hand", "620": " next", "621": " cons", "622": " own", "623": " inv", "624": "ower", "625": " ind", "626": "ert", "627": "ng", "628": "ave", "629": " year", "630": " big", "631": "ating", "632": " world", "633": " rel", "634": " sure", "635": " tra", "636": "ew", "637": "ered", "638": " fin", "639": " Well", "640": " sl", "641": " doing", "642": "bs", "643": " set", "644": " rec", "645": "ual", "646": "cial", "647": " ph", "648": "erm", "649": " love", "650": "ph", "651": " real", "652": " last", "653": "ict", "654": " bo", "655": " ra", "656": "ible", "657": " wr", "658": "mer", "659": " count", "660": "ities", "661": " always", "662": "inet", "663": "ments", "664": "uc", "665": " might", "666": " inter", "667": " video", "668": "gin", "669": " tell", "670": " never", "671": "vent", "672": " import", "673": "ied", "674": " sy", "675": " How", "676": "ically", "677": "ought", "678": " thir", "679": " rep", "680": "ks", "681": "ib", "682": " fam", "683": "ject", "684": " bas", "685": " She", "686": " give", "687": "akes", "688": " ninet", "689": " reg", "690": " min", "691": " op", "692": " 
def", "693": " didn", "694": "te", "695": " cour", "696": " why", "697": " ent", "698": " place", "699": " ins", "700": " car", "701": "ather", "702": " person", "703": "ular", "704": " inst", "705": " prod", "706": "lect", "707": " Al", "708": " today", "709": " bec", "710": " sur", "711": " All", "712": " another", "713": " bus", "714": " keep", "715": "ell", "716": "ese", "717": "riend", "718": " quest", "719": " talk", "720": "als", "721": "ings", "722": " mon", "723": "cond", "724": "old", "725": " acc", "726": " la", "727": " num", "728": "ident", "729": " che", "730": "iness", "731": " turn", "732": " ear", "733": " No", "734": "ousand", "735": " better", "736": "ific", "737": " loo", "738": " gl", "739": "oc", "740": " important", "741": "ited", "742": " An", "743": " thousand", "744": "ility", "745": "llow", "746": " used", "747": " gen", "748": " sim", "749": "li", "750": " happen", "751": " Un", "752": " Let", "753": "air", "754": "ock", "755": "ably", "756": "gg", "757": " watch", "758": " For", "759": " sw", "760": "ren", "761": "ute", "762": "ever", "763": " pol", "764": " sch", "765": " When", "766": " such", "767": " fif", "768": " home", "769": " cle", "770": " contin", "771": "ouse", "772": " friend", "773": "uring", "774": " Okay", "775": "gr", "776": " able", "777": " stud", "778": " eff", "779": "hip", "780": "body", "781": " top", "782": "ness", "783": " exper", "784": " pret", "785": " both", "786": " done", "787": "cri", "788": " mark", "789": " while", "790": " old", "791": "ros", "792": "ont", "793": " second", "794": "ative", "795": " thought", "796": " best", "797": " found", "798": "iew", "799": " belie", "800": " each", "801": "erest", "802": " tri", "803": " eas", "804": " ca", "805": " fact", "806": " care", "807": " fun", "808": "atter", "809": "ures", "810": " head", "811": " lear", "812": " water", "813": " hard", "814": " few", "815": " side", "816": "ween", "817": " exp", "818": " away", "819": "its", "820": " ext", "821": 
"lud", "822": " run", "823": " trans", "824": "ince", "825": " sk", "826": " open", "827": "cus", "828": " between", "829": " called", "830": " wee", "831": " pretty", "832": "ason", "833": " far", "834": "ember", "835": "omm", "836": " interest", "837": "any", "838": "ner", "839": "uff", "840": " pres", "841": " cur", "842": " child", "843": "ee", "844": " toget", "845": " together", "846": "olog", "847": " God", "848": "ond", "849": " char", "850": " looking", "851": "stem", "852": "az", "853": "cent", "854": " ob", "855": " ass", "856": "land", "857": " doesn", "858": " business", "859": " course", "860": " ten", "861": "ps", "862": "arch", "863": "ced", "864": "ms", "865": "ize", "866": "nce", "867": " ref", "868": " name", "869": "ross", "870": " grow", "871": "oney", "872": " went", "873": "ics", "874": "teen", "875": " cou", "876": " prob", "877": " ret", "878": " guys", "879": " came", "880": "ash", "881": "led", "882": " Eur", "883": "ues", "884": " ide", "885": "gan", "886": " everything", "887": " getting", "888": " ask", "889": " cor", "890": " build", "891": " sign", "892": " small", "893": "uck", "894": " el", "895": " col", "896": " Is", "897": "ational", "898": "stand", "899": "cy", "900": " conf", "901": "der", "902": " bre", "903": " cap", "904": " mod", "905": "ets", "906": "ike", "907": " number", "908": " comple", "909": "ertain", "910": " ever", "911": " coll", "912": " hum", "913": " Europe", "914": " cre", "915": " met", "916": " exam", "917": " move", "918": " pass", "919": " left", "920": " system", "921": " includ", "922": " Thank", "923": "cept", "924": " wom", "925": " product", "926": "ten", "927": " rest", "928": " probably", "929": " dri", "930": " Do", "931": " gener", "932": " anything", "933": " lar", "934": " My", "935": " school", "936": " lead", "937": " sub", "938": " ty", "939": " plan", "940": " seem", "941": " whole", "942": "irect", "943": " light", "944": " must", "945": " mom", "946": " opp", "947": " support", "948": " 
family", "949": "ices", "950": "amp", "951": " proble", "952": " dr", "953": "ready", "954": " using", "955": "ense", "956": " prov", "957": "ush", "958": "ax", "959": " power", "960": " Re", "961": "alth", "962": " ev", "963": " stand", "964": " war", "965": "ts", "966": " ", "967": "e", "968": "t", "969": "o", "970": "a", "971": "n", "972": "i", "973": "s", "974": "r", "975": "h", "976": "l", "977": "d", "978": "u", "979": "c", "980": "m", "981": "y", "982": "g", "983": "w", "984": "f", "985": "p", "986": ".", "987": "b", "988": ",", "989": "v", "990": "k", "991": "'", "992": "I", "993": "T", "994": "A", "995": "S", "996": "x", "997": "W", "998": "j", "999": "B", "1000": "C", "1001": "H", "1002": "?", "1003": "M", "1004": "O", "1005": "Y", "1006": "N", "1007": "P", "1008": "E", "1009": "q", "1010": "L", "1011": "D", "1012": "z", "1013": "G", "1014": "F", "1015": "R", "1016": "!", "1017": "J", "1018": "U", "1019": "K", "1020": "V", "1021": "Q", "1022": "Z", "1023": "X"}
 
1
+ {"0": "<unk>", "1": "\u2581t", "2": "\u2581th", "3": "\u2581a", "4": "in", "5": "re", "6": "\u2581the", "7": "\u2581w", "8": "\u2581s", "9": "\u2581o", "10": "er", "11": "ou", "12": "at", "13": "nd", "14": "it", "15": "\u2581h", "16": "\u2581c", "17": "\u2581b", "18": "is", "19": "en", "20": "on", "21": "ing", "22": "\u2581f", "23": "\u2581to", "24": "\u2581m", "25": "es", "26": "\u2581p", "27": "or", "28": "an", "29": "\u2581d", "30": "ll", "31": "\u2581I", "32": "ed", "33": "\u2581and", "34": "\u2581l", "35": "\u2581of", "36": "\u2581in", "37": "\u2581y", "38": "ar", "39": "\u2581g", "40": "\u2581you", "41": "as", "42": "om", "43": "\u2581n", "44": "ve", "45": "\u2581that", "46": "le", "47": "ic", "48": "us", "49": "ow", "50": "et", "51": "al", "52": "\u2581e", "53": "ut", "54": "\u2581it", "55": "ot", "56": "\u2581be", "57": "\u2581T", "58": "ion", "59": "\u2581is", "60": "\u2581wh", "61": "\u2581re", "62": "\u2581on", "63": "\u2581we", "64": "ent", "65": "\u2581A", "66": "ay", "67": "\u2581ha", "68": "\u2581Th", "69": "id", "70": "\u2581S", "71": "ac", "72": "gh", "73": "ver", "74": "ke", "75": "\u2581for", "76": "im", "77": "ly", "78": "ur", "79": "ld", "80": "\u2581he", "81": "\u2581st", "82": "all", "83": "ro", "84": "st", "85": "se", "86": "ct", "87": "ith", "88": "ir", "89": "am", "90": "\u2581this", "91": "if", "92": "\u2581W", "93": "oo", "94": "ri", "95": "\u2581was", "96": "ght", "97": "\u2581u", "98": "\u2581with", "99": "ad", "100": "ch", "101": "\u2581se", "102": "\u2581k", "103": "\u2581an", "104": "\u2581The", "105": "\u2581li", "106": "\u2581do", "107": "\u2581B", "108": "\u2581have", "109": "\u2581as", "110": "th", "111": "\u2581are", "112": "\u2581sh", "113": "ust", "114": "ce", "115": "ally", "116": "ill", "117": "\u2581H", "118": "\u2581j", "119": "ter", "120": "\u2581go", "121": "\u2581And", "122": "ation", "123": "\u2581C", "124": "\u2581so", "125": "ome", "126": "\u2581not", "127": "op", "128": "il", "129": "ore", "130": "\u2581ne", 
"131": "\u2581can", "132": "\u2581me", "133": "\u2581at", "134": "ould", "135": "ant", "136": "\u2581M", "137": "\u2581like", "138": "ere", "139": "\u2581they", "140": "ra", "141": "ers", "142": "\u2581ab", "143": "\u2581de", "144": "\u2581kn", "145": "ge", "146": "\u2581Y", "147": "\u2581ch", "148": "ul", "149": "pp", "150": "\u2581or", "151": "\u2581al", "152": "\u2581con", "153": "\u2581com", "154": "ess", "155": "\u2581su", "156": "out", "157": "\u2581your", "158": "\u2581So", "159": "ate", "160": "\u2581one", "161": "\u2581all", "162": "\u2581ex", "163": "est", "164": "\u2581fr", "165": "\u2581just", "166": "\u2581pro", "167": "\u2581know", "168": "\u2581O", "169": "ain", "170": "\u2581but", "171": "ol", "172": "ive", "173": "\u2581v", "174": "use", "175": "very", "176": "art", "177": "qu", "178": "\u2581my", "179": "el", "180": "\u2581N", "181": "nt", "182": "\u2581It", "183": "\u2581what", "184": "ab", "185": "\u2581P", "186": "\u2581wor", "187": "\u2581out", "188": "\u2581there", "189": "\u2581up", "190": "um", "191": "\u2581from", "192": "pe", "193": "\u2581tw", "194": "\u2581r", "195": "and", "196": "ight", "197": "ort", "198": "un", "199": "\u2581L", "200": "ist", "201": "\u2581about", "202": "ide", "203": "ig", "204": "ake", "205": "\u2581D", "206": "em", "207": "os", "208": "king", "209": "rou", "210": "ind", "211": "our", "212": "res", "213": "\u2581We", "214": "\u2581get", "215": "\u2581E", "216": "\u2581G", "217": "ack", "218": "\u2581le", "219": "ity", "220": "od", "221": "\u2581F", "222": "ard", "223": "\u2581pl", "224": "\u2581our", "225": "\u2581int", "226": "ment", "227": "\u2581will", "228": "ies", "229": "\u2581by", "230": "ink", "231": "ca", "232": "\u2581if", "233": "red", "234": "her", "235": "ie", "236": "\u2581us", "237": "\u2581some", "238": "\u2581don", "239": "ven", "240": "ood", "241": "ast", "242": "\u2581R", "243": "\u2581his", "244": "\u2581tim", "245": "\u2581tr", "246": "\u2581more", "247": "ich", "248": "ous", "249": "ame", 
"250": "\u2581going", "251": "\u2581had", "252": "\u2581them", "253": "ook", "254": "\u2581pe", "255": "\u2581Wh", "256": "\u2581You", "257": "\u2581But", "258": "ine", "259": "\u2581here", "260": "\u2581would", "261": "cause", "262": "right", "263": "so", "264": "ost", "265": "ure", "266": "\u2581has", "267": "ect", "268": "\u2581think", "269": "\u2581fe", "270": "ong", "271": "\u2581see", "272": "\u2581when", "273": "\u2581who", "274": "\u2581were", "275": "\u2581really", "276": "\u2581their", "277": "\u2581want", "278": "one", "279": "ople", "280": "\u2581then", "281": "\u2581time", "282": "\u2581sa", "283": "ap", "284": "\u2581te", "285": "\u2581He", "286": "\u2581ye", "287": "ck", "288": "\u2581her", "289": "\u2581thing", "290": "\u2581right", "291": "\u2581which", "292": "itt", "293": "ice", "294": "act", "295": "\u2581people", "296": "ty", "297": "\u2581two", "298": "\u2581J", "299": "\u2581im", "300": "ther", "301": "ci", "302": "ose", "303": "\u2581cl", "304": "\u2581qu", "305": "\u2581man", "306": "\u2581also", "307": "ree", "308": "\u2581en", "309": "ud", "310": "\u2581how", "311": "reat", "312": "ak", "313": "hing", "314": "ag", "315": "\u2581any", "316": "ff", "317": "ace", "318": "per", "319": "\u2581because", "320": "\u2581very", "321": "own", "322": "\u2581ad", "323": "\u2581act", "324": "\u2581been", "325": "\u2581now", "326": "\u2581ag", "327": "\u2581into", "328": "\u2581comp", "329": "ars", "330": "ions", "331": "are", "332": "ite", "333": "iv", "334": "\u2581these", "335": "ays", "336": "ep", "337": "\u2581This", "338": "\u2581she", "339": "ans", "340": "ah", "341": "een", "342": "\u2581over", "343": "ry", "344": "\u2581lo", "345": "age", "346": "\u2581pr", "347": "\u2581sp", "348": "ue", "349": "\u2581co", "350": "ick", "351": "ber", "352": "\u2581did", "353": "ip", "354": "ach", "355": "\u2581back", "356": "\u2581no", "357": "\u2581cont", "358": "\u2581other", "359": "\u2581every", "360": "pt", "361": "\u2581need", "362": "\u2581him", "363": 
"\u2581U", "364": "\u2581In", "365": "\u2581work", "366": "irst", "367": "\u2581part", "368": "\u2581look", "369": "ittle", "370": "ble", "371": "iz", "372": "\u2581un", "373": "\u2581make", "374": "omet", "375": "nder", "376": "ish", "377": "na", "378": "\u2581little", "379": "\u2581off", "380": "\u2581than", "381": "\u2581got", "382": "ually", "383": "\u2581per", "384": "\u2581good", "385": "\u2581way", "386": "\u2581could", "387": "\u2581ac", "388": "\u2581imp", "389": "able", "390": "\u2581where", "391": "iff", "392": "\u2581That", "393": "\u2581res", "394": "ount", "395": "pl", "396": "ance", "397": "\u2581first", "398": "\u2581ro", "399": "\u2581pre", "400": "ass", "401": "\u2581say", "402": "int", "403": "ated", "404": "ire", "405": "uch", "406": "ase", "407": "\u2581somet", "408": "ound", "409": "\u2581down", "410": "\u2581diff", "411": "sel", "412": "\u2581gu", "413": "\u2581am", "414": "ress", "415": "\u2581lot", "416": "ence", "417": "\u2581dis", "418": "orm", "419": "ix", "420": "\u2581po", "421": "ving", "422": "enty", "423": "\u2581K", "424": "\u2581spe", "425": "und", "426": "he", "427": "\u2581much", "428": "\u2581ar", "429": "round", "430": "\u2581app", "431": "co", "432": "ark", "433": "\u2581new", "434": "ater", "435": "ult", "436": "end", "437": "\u2581even", "438": "\u2581start", "439": "ations", "440": "rough", "441": "ile", "442": "fter", "443": "\u2581well", "444": "be", "445": "\u2581They", "446": "\u2581three", "447": "ign", "448": "ild", "449": "\u2581said", "450": "ough", "451": "ang", "452": "\u2581too", "453": "ade", "454": "\u2581bl", "455": "ens", "456": "\u2581inc", "457": "ia", "458": "\u2581those", "459": "\u2581mo", "460": "\u2581take", "461": "\u2581through", "462": "\u2581fl", "463": "\u2581kind", "464": "\u2581things", "465": "\u2581bet", "466": "\u2581only", "467": "\u2581St", "468": "\u2581let", "469": "cess", "470": "\u2581Ch", "471": "ary", "472": "vel", "473": "\u2581If", "474": "xt", "475": "other", "476": "av", "477": 
"ical", "478": "ord", "479": "\u2581again", "480": "\u2581something", "481": "onna", "482": "fore", "483": "\u2581may", "484": "ting", "485": "\u2581bu", "486": "\u2581differe", "487": "urn", "488": "\u2581gonna", "489": "\u2581does", "490": "uct", "491": "og", "492": "\u2581twenty", "493": "\u2581gr", "494": "\u2581Ye", "495": "wn", "496": "\u2581should", "497": "\u2581comm", "498": "ition", "499": "\u2581under", "500": "\u2581hel", "501": "ory", "502": "\u2581fo", "503": "\u2581use", "504": "igh", "505": "ife", "506": "\u2581actually", "507": "\u2581tal", "508": "\u2581call", "509": "ents", "510": "ious", "511": "ull", "512": "\u2581There", "513": "\u2581Yeah", "514": "\u2581most", "515": "\u2581ke", "516": "ors", "517": "ved", "518": "ys", "519": "\u2581sc", "520": "\u2581happ", "521": "ope", "522": "\u2581help", "523": "atch", "524": "\u2581What", "525": "\u2581rem", "526": "ple", "527": "\u2581Now", "528": "\u2581br", "529": "ool", "530": "oth", "531": "\u2581four", "532": "self", "533": "\u2581str", "534": "ne", "535": "thing", "536": "\u2581put", "537": "ial", "538": "\u2581great", "539": "ail", "540": "ub", "541": "ning", "542": "\u2581sm", "543": "\u2581feel", "544": "\u2581five", "545": "ody", "546": "undred", "547": "iss", "548": "ank", "549": "get", "550": "aking", "551": "\u2581many", "552": "\u2581hundred", "553": "\u2581years", "554": "\u2581being", "555": "\u2581come", "556": "\u2581mean", "557": "ily", "558": "\u2581different", "559": "\u2581after", "560": "\u2581ser", "561": "\u2581show", "562": "form", "563": "ful", "564": "oy", "565": "\u2581six", "566": "\u2581vide", "567": "\u2581V", "568": "\u2581its", "569": "\u2581point", "570": "\u2581day", "571": "\u2581des", "572": "ons", "573": "\u2581bit", "574": "\u2581bel", "575": "\u2581before", "576": "\u2581aw", "577": "\u2581end", "578": "\u2581Oh", "579": "\u2581still", "580": "ath", "581": "\u2581long", "582": "\u2581'", "583": "ise", "584": "ob", "585": "day", "586": "\u2581add", "587": "ft", 
"588": "ves", "589": "ces", "590": "ady", "591": "\u2581cr", "592": "\u2581around", "593": "\u2581try", "594": "les", "595": "vers", "596": "kay", "597": "ian", "598": "ates", "599": "\u2581find", "600": "ward", "601": "\u2581As", "602": "\u2581eight", "603": "lic", "604": "\u2581same", "605": "\u2581pos", "606": "\u2581em", "607": "\u2581made", "608": "\u2581supp", "609": "\u2581life", "610": "\u2581Be", "611": "pect", "612": "\u2581dec", "613": "\u2581play", "614": "ange", "615": "\u2581att", "616": "\u2581pers", "617": "ways", "618": "\u2581high", "619": "\u2581hand", "620": "\u2581next", "621": "\u2581cons", "622": "\u2581own", "623": "\u2581inv", "624": "ower", "625": "\u2581ind", "626": "ert", "627": "ng", "628": "ave", "629": "\u2581year", "630": "\u2581big", "631": "ating", "632": "\u2581world", "633": "\u2581rel", "634": "\u2581sure", "635": "\u2581tra", "636": "ew", "637": "ered", "638": "\u2581fin", "639": "\u2581Well", "640": "\u2581sl", "641": "\u2581doing", "642": "bs", "643": "\u2581set", "644": "\u2581rec", "645": "ual", "646": "cial", "647": "\u2581ph", "648": "erm", "649": "\u2581love", "650": "ph", "651": "\u2581real", "652": "\u2581last", "653": "ict", "654": "\u2581bo", "655": "\u2581ra", "656": "ible", "657": "\u2581wr", "658": "mer", "659": "\u2581count", "660": "ities", "661": "\u2581always", "662": "inet", "663": "ments", "664": "uc", "665": "\u2581might", "666": "\u2581inter", "667": "\u2581video", "668": "gin", "669": "\u2581tell", "670": "\u2581never", "671": "vent", "672": "\u2581import", "673": "ied", "674": "\u2581sy", "675": "\u2581How", "676": "ically", "677": "ought", "678": "\u2581thir", "679": "\u2581rep", "680": "ks", "681": "ib", "682": "\u2581fam", "683": "ject", "684": "\u2581bas", "685": "\u2581She", "686": "\u2581give", "687": "akes", "688": "\u2581ninet", "689": "\u2581reg", "690": "\u2581min", "691": "\u2581op", "692": "\u2581def", "693": "\u2581didn", "694": "te", "695": "\u2581cour", "696": "\u2581why", "697": 
"\u2581ent", "698": "\u2581place", "699": "\u2581ins", "700": "\u2581car", "701": "ather", "702": "\u2581person", "703": "ular", "704": "\u2581inst", "705": "\u2581prod", "706": "lect", "707": "\u2581Al", "708": "\u2581today", "709": "\u2581bec", "710": "\u2581sur", "711": "\u2581All", "712": "\u2581another", "713": "\u2581bus", "714": "\u2581keep", "715": "ell", "716": "ese", "717": "riend", "718": "\u2581quest", "719": "\u2581talk", "720": "als", "721": "ings", "722": "\u2581mon", "723": "cond", "724": "old", "725": "\u2581acc", "726": "\u2581la", "727": "\u2581num", "728": "ident", "729": "\u2581che", "730": "iness", "731": "\u2581turn", "732": "\u2581ear", "733": "\u2581No", "734": "ousand", "735": "\u2581better", "736": "ific", "737": "\u2581loo", "738": "\u2581gl", "739": "oc", "740": "\u2581important", "741": "ited", "742": "\u2581An", "743": "\u2581thousand", "744": "ility", "745": "llow", "746": "\u2581used", "747": "\u2581gen", "748": "\u2581sim", "749": "li", "750": "\u2581happen", "751": "\u2581Un", "752": "\u2581Let", "753": "air", "754": "ock", "755": "ably", "756": "gg", "757": "\u2581watch", "758": "\u2581For", "759": "\u2581sw", "760": "ren", "761": "ute", "762": "ever", "763": "\u2581pol", "764": "\u2581sch", "765": "\u2581When", "766": "\u2581such", "767": "\u2581fif", "768": "\u2581home", "769": "\u2581cle", "770": "\u2581contin", "771": "ouse", "772": "\u2581friend", "773": "uring", "774": "\u2581Okay", "775": "gr", "776": "\u2581able", "777": "\u2581stud", "778": "\u2581eff", "779": "hip", "780": "body", "781": "\u2581top", "782": "ness", "783": "\u2581exper", "784": "\u2581pret", "785": "\u2581both", "786": "\u2581done", "787": "cri", "788": "\u2581mark", "789": "\u2581while", "790": "\u2581old", "791": "ros", "792": "ont", "793": "\u2581second", "794": "ative", "795": "\u2581thought", "796": "\u2581best", "797": "\u2581found", "798": "iew", "799": "\u2581belie", "800": "\u2581each", "801": "erest", "802": "\u2581tri", "803": "\u2581eas", 
"804": "\u2581ca", "805": "\u2581fact", "806": "\u2581care", "807": "\u2581fun", "808": "atter", "809": "ures", "810": "\u2581head", "811": "\u2581lear", "812": "\u2581water", "813": "\u2581hard", "814": "\u2581few", "815": "\u2581side", "816": "ween", "817": "\u2581exp", "818": "\u2581away", "819": "its", "820": "\u2581ext", "821": "lud", "822": "\u2581run", "823": "\u2581trans", "824": "ince", "825": "\u2581sk", "826": "\u2581open", "827": "cus", "828": "\u2581between", "829": "\u2581called", "830": "\u2581wee", "831": "\u2581pretty", "832": "ason", "833": "\u2581far", "834": "ember", "835": "omm", "836": "\u2581interest", "837": "any", "838": "ner", "839": "uff", "840": "\u2581pres", "841": "\u2581cur", "842": "\u2581child", "843": "ee", "844": "\u2581toget", "845": "\u2581together", "846": "olog", "847": "\u2581God", "848": "ond", "849": "\u2581char", "850": "\u2581looking", "851": "stem", "852": "az", "853": "cent", "854": "\u2581ob", "855": "\u2581ass", "856": "land", "857": "\u2581doesn", "858": "\u2581business", "859": "\u2581course", "860": "\u2581ten", "861": "ps", "862": "arch", "863": "ced", "864": "ms", "865": "ize", "866": "nce", "867": "\u2581ref", "868": "\u2581name", "869": "ross", "870": "\u2581grow", "871": "oney", "872": "\u2581went", "873": "ics", "874": "teen", "875": "\u2581cou", "876": "\u2581prob", "877": "\u2581ret", "878": "\u2581guys", "879": "\u2581came", "880": "ash", "881": "led", "882": "\u2581Eur", "883": "ues", "884": "\u2581ide", "885": "gan", "886": "\u2581everything", "887": "\u2581getting", "888": "\u2581ask", "889": "\u2581cor", "890": "\u2581build", "891": "\u2581sign", "892": "\u2581small", "893": "uck", "894": "\u2581el", "895": "\u2581col", "896": "\u2581Is", "897": "ational", "898": "stand", "899": "cy", "900": "\u2581conf", "901": "der", "902": "\u2581bre", "903": "\u2581cap", "904": "\u2581mod", "905": "ets", "906": "ike", "907": "\u2581number", "908": "\u2581comple", "909": "ertain", "910": "\u2581ever", "911": 
"\u2581coll", "912": "\u2581hum", "913": "\u2581Europe", "914": "\u2581cre", "915": "\u2581met", "916": "\u2581exam", "917": "\u2581move", "918": "\u2581pass", "919": "\u2581left", "920": "\u2581system", "921": "\u2581includ", "922": "\u2581Thank", "923": "cept", "924": "\u2581wom", "925": "\u2581product", "926": "ten", "927": "\u2581rest", "928": "\u2581probably", "929": "\u2581dri", "930": "\u2581Do", "931": "\u2581gener", "932": "\u2581anything", "933": "\u2581lar", "934": "\u2581My", "935": "\u2581school", "936": "\u2581lead", "937": "\u2581sub", "938": "\u2581ty", "939": "\u2581plan", "940": "\u2581seem", "941": "\u2581whole", "942": "irect", "943": "\u2581light", "944": "\u2581must", "945": "\u2581mom", "946": "\u2581opp", "947": "\u2581support", "948": "\u2581family", "949": "ices", "950": "amp", "951": "\u2581proble", "952": "\u2581dr", "953": "ready", "954": "\u2581using", "955": "ense", "956": "\u2581prov", "957": "ush", "958": "ax", "959": "\u2581power", "960": "\u2581Re", "961": "alth", "962": "\u2581ev", "963": "\u2581stand", "964": "\u2581war", "965": "ts", "966": "\u2581", "967": "e", "968": "t", "969": "o", "970": "a", "971": "n", "972": "i", "973": "s", "974": "r", "975": "h", "976": "l", "977": "d", "978": "u", "979": "c", "980": "m", "981": "y", "982": "g", "983": "w", "984": "f", "985": "p", "986": ".", "987": "b", "988": ",", "989": "v", "990": "k", "991": "'", "992": "I", "993": "T", "994": "A", "995": "S", "996": "x", "997": "W", "998": "j", "999": "B", "1000": "C", "1001": "H", "1002": "?", "1003": "M", "1004": "O", "1005": "Y", "1006": "N", "1007": "P", "1008": "E", "1009": "q", "1010": "L", "1011": "D", "1012": "z", "1013": "G", "1014": "F", "1015": "R", "1016": "!", "1017": "J", "1018": "U", "1019": "K", "1020": "V", "1021": "Q", "1022": "Z", "1023": "X"}