Upload 23 files
Browse files
- CTCHead.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- CTCHead.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- CTCHead.mlpackage/Manifest.json +18 -0
- Decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Decoder.mlpackage/Manifest.json +18 -0
- Encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Encoder.mlpackage/Manifest.json +18 -0
- JointDecision.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- JointDecision.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- JointDecision.mlpackage/Manifest.json +18 -0
- Preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Preprocessor.mlpackage/Manifest.json +18 -0
- README.md +164 -0
- __init__.py +5 -0
- convert_nemo_to_coreml.py +365 -0
- inference.py +304 -0
- metadata.json +18 -0
- pyproject.toml +65 -0
- uv.lock +0 -0
- vocab.json +1 -1
CTCHead.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3bb619400f0ca4a5873c5f2bf7bf78a645944d3f4acd544bed689a7a420f4634
|
| 3 |
+
size 2048
|
CTCHead.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
|
| 3 |
+
size 1051842
|
CTCHead.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"174E9828-F0D9-496B-B767-165878007DCB": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"ABA1F560-2FDF-40A2-BB0D-DE27A2824BED": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "ABA1F560-2FDF-40A2-BB0D-DE27A2824BED"
|
| 18 |
+
}
|
Decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95c442cea0d8d78e3de7c45e6a0502a7284b783915971f161a0e58a4e1fa7153
|
| 3 |
+
size 8544
|
Decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d5c4adf473c11e8c86daae6da87dbf4a0bf1c8b716fdd4a9378906208b41381
|
| 3 |
+
size 7872384
|
Decoder.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"7BB82BEE-DB48-4BA0-8F0E-AC39162FD7F3": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"E80560F7-8F68-462F-8B00-ADCB6B6F88F7": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "E80560F7-8F68-462F-8B00-ADCB6B6F88F7"
|
| 18 |
+
}
|
Encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:987981947a1239c7d9d1936168534058ddeb39e0da2bb0b36f91381f00183b1e
|
| 3 |
+
size 492504
|
Encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
|
| 3 |
+
size 215143424
|
Encoder.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"020BD32E-EB4A-4192-B46F-8CFA4932627D": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"242DF9D4-730A-4735-97CD-5C4C16E79595": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "242DF9D4-730A-4735-97CD-5C4C16E79595"
|
| 18 |
+
}
|
JointDecision.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7f0f6c8cb9481b7d303268c34ddd7fb9e69e1cfda9880c1dce06a64539cb389
|
| 3 |
+
size 8788
|
JointDecision.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
|
| 3 |
+
size 2798028
|
JointDecision.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"82C059F0-F2A7-4566-B14C-7BC1F1E136E2": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"D30A2839-E7D8-40D0-AF8B-72C2EF998325": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "82C059F0-F2A7-4566-B14C-7BC1F1E136E2"
|
| 18 |
+
}
|
Preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:432a9ebe9eb0bee221560ed7bfef5278fb907652e4e0f20ba03b997c394a9335
|
| 3 |
+
size 19924
|
Preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1609930989479ea65e0608b2cd6c54fef7f1623cc240cd6d993e24e2491133ac
|
| 3 |
+
size 807968
|
Preprocessor.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"C99314D3-C9C5-4FAD-8419-E34671E6E467": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"E684B627-55A9-4175-92F1-FA535236EE66": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "E684B627-55A9-4175-92F1-FA535236EE66"
|
| 18 |
+
}
|
README.md
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Parakeet-TDT-CTC-110M CoreML
|
| 2 |
+
|
| 3 |
+
NVIDIA's Parakeet-TDT-CTC-110M model converted to CoreML format for efficient inference on Apple Silicon.
|
| 4 |
+
|
| 5 |
+
## Model Description
|
| 6 |
+
|
| 7 |
+
This is a hybrid ASR model with a shared Conformer encoder and two decoder heads:
|
| 8 |
+
- **CTC Head**: Fast greedy decoding, ideal for keyword spotting
|
| 9 |
+
- **TDT Head**: Token-Duration Transducer for high-quality transcription
|
| 10 |
+
|
| 11 |
+
### Architecture
|
| 12 |
+
|
| 13 |
+
| Component | Description | Size |
|
| 14 |
+
|-----------|-------------|------|
|
| 15 |
+
| Preprocessor | Mel spectrogram extraction | ~1 MB |
|
| 16 |
+
| Encoder | Conformer encoder (shared) | ~215 MB |
|
| 17 |
+
| CTCHead | CTC output projection | ~1 MB |
|
| 18 |
+
| Decoder | TDT prediction network (LSTM) | ~8 MB |
|
| 19 |
+
| JointDecision | TDT joint network | ~3 MB |
|
| 20 |
+
|
| 21 |
+
**Total size**: ~228 MB
|
| 22 |
+
|
| 23 |
+
### Performance
|
| 24 |
+
|
| 25 |
+
Benchmarked on Earnings22 dataset (772 audio files):
|
| 26 |
+
|
| 27 |
+
| Metric | Value |
|
| 28 |
+
|--------|-------|
|
| 29 |
+
| Keyword Recall | 100% (1309/1309) |
|
| 30 |
+
| WER | 17.97% |
|
| 31 |
+
| RTFx (M4 Pro) | 358x real-time |
|
| 32 |
+
|
| 33 |
+
## Requirements
|
| 34 |
+
|
| 35 |
+
- macOS 13+ (Ventura or later)
|
| 36 |
+
- Apple Silicon (M1/M2/M3/M4)
|
| 37 |
+
- Python 3.10+
|
| 38 |
+
|
| 39 |
+
## Installation
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
# Using uv (recommended)
|
| 43 |
+
uv sync
|
| 44 |
+
|
| 45 |
+
# Or using pip
|
| 46 |
+
pip install -e .
|
| 47 |
+
|
| 48 |
+
# For audio file support (WAV, MP3, etc.)
|
| 49 |
+
pip install -e ".[audio]"
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## Usage
|
| 53 |
+
|
| 54 |
+
### Python Inference
|
| 55 |
+
|
| 56 |
+
```python
|
| 57 |
+
from scripts.inference import ParakeetCoreML
|
| 58 |
+
|
| 59 |
+
# Load model
|
| 60 |
+
model = ParakeetCoreML("./model")
|
| 61 |
+
|
| 62 |
+
# Transcribe with TDT (higher quality)
|
| 63 |
+
text = model.transcribe("audio.wav", mode="tdt")
|
| 64 |
+
print(text)
|
| 65 |
+
|
| 66 |
+
# Or use CTC for faster keyword spotting
|
| 67 |
+
text = model.transcribe("audio.wav", mode="ctc")
|
| 68 |
+
print(text)
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
### Command Line
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
# TDT decoding (default, higher quality)
|
| 75 |
+
uv run scripts/inference.py --audio audio.wav --model-dir ./model
|
| 76 |
+
|
| 77 |
+
# CTC decoding (faster, good for keyword spotting)
|
| 78 |
+
uv run scripts/inference.py --audio audio.wav --model-dir ./model --mode ctc
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
## Model Conversion
|
| 82 |
+
|
| 83 |
+
To convert from the original NeMo model:
|
| 84 |
+
|
| 85 |
+
```bash
|
| 86 |
+
# Install conversion dependencies
|
| 87 |
+
uv sync --extra convert
|
| 88 |
+
|
| 89 |
+
# Run conversion
|
| 90 |
+
uv run scripts/convert_nemo_to_coreml.py --output-dir ./model
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
This will:
|
| 94 |
+
1. Download the original model from NVIDIA (`nvidia/parakeet-tdt_ctc-110m`)
|
| 95 |
+
2. Convert each component to CoreML format
|
| 96 |
+
3. Extract vocabulary and create metadata
|
| 97 |
+
|
| 98 |
+
## File Structure
|
| 99 |
+
|
| 100 |
+
```
|
| 101 |
+
model/
|
| 102 |
+
├── Preprocessor.mlpackage # Audio → Mel spectrogram
|
| 103 |
+
├── Encoder.mlpackage # Mel → Encoder features
|
| 104 |
+
├── CTCHead.mlpackage # Encoder → CTC log probs
|
| 105 |
+
├── Decoder.mlpackage # TDT prediction network
|
| 106 |
+
├── JointDecision.mlpackage # TDT joint network
|
| 107 |
+
├── vocab.json # Token vocabulary (1024 tokens)
|
| 108 |
+
└── metadata.json # Model configuration
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
## Decoding Modes
|
| 112 |
+
|
| 113 |
+
### TDT Mode (Recommended for Transcription)
|
| 114 |
+
- Uses Token-Duration Transducer decoding
|
| 115 |
+
- Higher accuracy (17.97% WER)
|
| 116 |
+
- Predicts both tokens and durations
|
| 117 |
+
- Best for full transcription tasks
|
| 118 |
+
|
| 119 |
+
### CTC Mode (Recommended for Keyword Spotting)
|
| 120 |
+
- Greedy CTC decoding
|
| 121 |
+
- Faster inference
|
| 122 |
+
- 100% keyword recall on Earnings22
|
| 123 |
+
- Best for detecting specific words/phrases
|
| 124 |
+
|
| 125 |
+
## Custom Vocabulary / Keyword Spotting
|
| 126 |
+
|
| 127 |
+
For keyword spotting, CTC mode with custom vocabulary boosting achieves 100% recall:
|
| 128 |
+
|
| 129 |
+
```python
|
| 130 |
+
# Load custom vocabulary with token IDs
|
| 131 |
+
with open("custom_vocab.json") as f:
|
| 132 |
+
keywords = json.load(f) # {"keyword": [token_ids], ...}
|
| 133 |
+
|
| 134 |
+
# Run CTC decoding
|
| 135 |
+
tokens = model.decode_ctc(encoder_output)
|
| 136 |
+
|
| 137 |
+
# Check for keyword matches
|
| 138 |
+
for keyword, expected_ids in keywords.items():
|
| 139 |
+
if is_subsequence(expected_ids, tokens):
|
| 140 |
+
print(f"Found keyword: {keyword}")
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
## License
|
| 144 |
+
|
| 145 |
+
This model conversion is released under the Apache 2.0 License, same as the original NVIDIA model.
|
| 146 |
+
|
| 147 |
+
## Citation
|
| 148 |
+
|
| 149 |
+
If you use this model, please cite the original NVIDIA work:
|
| 150 |
+
|
| 151 |
+
```bibtex
|
| 152 |
+
@misc{nvidia_parakeet_tdt_ctc,
|
| 153 |
+
title={Parakeet-TDT-CTC-110M},
|
| 154 |
+
author={NVIDIA},
|
| 155 |
+
year={2024},
|
| 156 |
+
publisher={Hugging Face},
|
| 157 |
+
url={https://huggingface.co/nvidia/parakeet-tdt_ctc-110m}
|
| 158 |
+
}
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
## Acknowledgments
|
| 162 |
+
|
| 163 |
+
- Original model by [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)
|
| 164 |
+
- CoreML conversion by [FluidInference](https://github.com/FluidInference)
|
__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Parakeet-TDT-CTC-110M CoreML scripts."""
|
| 2 |
+
|
| 3 |
+
from .inference import ParakeetCoreML
|
| 4 |
+
|
| 5 |
+
__all__ = ["ParakeetCoreML"]
|
convert_nemo_to_coreml.py
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Convert NVIDIA Parakeet-TDT-CTC-110M model from NeMo to CoreML format.
|
| 4 |
+
|
| 5 |
+
This script downloads the original NeMo model from NVIDIA and converts it to
|
| 6 |
+
CoreML format with separate components for efficient Apple Silicon inference.
|
| 7 |
+
|
| 8 |
+
Components:
|
| 9 |
+
- Preprocessor: Audio preprocessing (mel spectrogram)
|
| 10 |
+
- Encoder: Conformer encoder (shared between TDT and CTC)
|
| 11 |
+
- CTCHead: CTC output head for keyword spotting
|
| 12 |
+
- Decoder: TDT decoder LSTM
|
| 13 |
+
- JointDecision: TDT joint network for token + duration prediction
|
| 14 |
+
|
| 15 |
+
Usage:
|
| 16 |
+
uv run scripts/convert_nemo_to_coreml.py --output-dir ./model
|
| 17 |
+
|
| 18 |
+
Requirements:
|
| 19 |
+
- Python 3.10+
|
| 20 |
+
- PyTorch
|
| 21 |
+
- NeMo toolkit
|
| 22 |
+
- coremltools
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import argparse
|
| 26 |
+
import json
|
| 27 |
+
import os
|
| 28 |
+
import shutil
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
|
| 31 |
+
import coremltools as ct
|
| 32 |
+
import numpy as np
|
| 33 |
+
import torch
|
| 34 |
+
import torch.nn as nn
|
| 35 |
+
from nemo.collections.asr.models import EncDecHybridRNNTCTCModel
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def load_nemo_model(model_name: str = "nvidia/parakeet-tdt_ctc-110m") -> EncDecHybridRNNTCTCModel:
    """Fetch a pretrained hybrid TDT/CTC checkpoint and put it in eval mode.

    Args:
        model_name: Identifier handed to
            ``EncDecHybridRNNTCTCModel.from_pretrained``.

    Returns:
        The loaded model, after ``eval()`` has been called on it.
    """
    print(f"Loading NeMo model: {model_name}")
    nemo_model = EncDecHybridRNNTCTCModel.from_pretrained(model_name)
    # The converters below trace the sub-modules, so switch to eval mode once here.
    nemo_model.eval()
    return nemo_model
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class PreprocessorWrapper(nn.Module):
    """Adapter that exposes the model's preprocessor with positional tensor inputs.

    The wrapped module is invoked with the ``input_signal`` / ``length``
    keywords; this class lets callers (and ``torch.jit.trace``) pass the two
    tensors positionally instead.
    """

    def __init__(self, preprocessor):
        super().__init__()
        self.preprocessor = preprocessor

    def forward(self, audio_signal: torch.Tensor, audio_length: torch.Tensor):
        """Run the preprocessor.

        Args:
            audio_signal: Tensor forwarded as ``input_signal``.
            audio_length: Tensor forwarded as ``length``.

        Returns:
            The two values produced by the wrapped preprocessor, as a tuple.
        """
        features, feature_lengths = self.preprocessor(
            length=audio_length,
            input_signal=audio_signal,
        )
        return features, feature_lengths
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class EncoderWrapper(nn.Module):
    """Adapter that exposes the model's encoder with positional tensor inputs.

    The wrapped encoder is invoked with the ``audio_signal`` / ``length``
    keywords; this class maps the positional ``mel`` / ``mel_length`` inputs
    onto them.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, mel: torch.Tensor, mel_length: torch.Tensor):
        """Run the encoder.

        Args:
            mel: Tensor forwarded as ``audio_signal``.
            mel_length: Tensor forwarded as ``length``.

        Returns:
            The two values produced by the wrapped encoder, as a tuple.
        """
        features, feature_lengths = self.encoder(
            length=mel_length,
            audio_signal=mel,
        )
        return features, feature_lengths
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class CTCHeadWrapper(nn.Module):
    """Adapter that turns encoder features into CTC log-probabilities.

    Applies the wrapped decoder's ``decoder_layers`` and then a log-softmax
    over the last dimension of the result.
    """

    def __init__(self, ctc_decoder):
        super().__init__()
        self.ctc_decoder = ctc_decoder

    def forward(self, encoder_output: torch.Tensor):
        """Return log-softmax (last axis) of ``decoder_layers(encoder_output)``."""
        # NOTE(review): convert_ctc_head traces this with a (1, 188, 512) tensor
        # labelled batch/time/hidden - confirm decoder_layers accepts that layout.
        projected = self.ctc_decoder.decoder_layers(encoder_output)
        return projected.log_softmax(dim=-1)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class DecoderWrapper(nn.Module):
    """Adapter that runs one pass of the TDT prediction network with explicit state.

    Attributes:
        decoder: The wrapped decoder module.
        hidden_size: Copied from ``decoder.pred_hidden``.
    """

    def __init__(self, decoder):
        super().__init__()
        self.decoder = decoder
        self.hidden_size = decoder.pred_hidden

    def forward(self, targets: torch.Tensor, target_length: torch.Tensor,
                h_in: torch.Tensor, c_in: torch.Tensor):
        """Embed ``targets``, run the LSTM from ``(h_in, c_in)``, project the output.

        Args:
            targets: Token ids fed to the embedding layer.
            target_length: Accepted as an input but not used in the computation.
            h_in: Incoming LSTM hidden state.
            c_in: Incoming LSTM cell state.

        Returns:
            ``(decoder_output, h_out, c_out)`` where ``decoder_output`` is the
            LSTM output passed through ``dec_out``.
        """
        net = self.decoder.prediction_network
        embedded = net.embed(targets)
        lstm_out, state = net.lstm(embedded, (h_in, c_in))
        h_out, c_out = state
        return net.dec_out(lstm_out), h_out, c_out
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class JointDecisionWrapper(nn.Module):
    """Adapter that reduces the joint network to a greedy token/duration decision."""

    def __init__(self, joint):
        super().__init__()
        self.joint = joint

    def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
        """Pick the most likely token and duration bin for one step.

        Args:
            encoder_step: Encoder features for the current step.
            decoder_step: Prediction-network features for the current step.

        Returns:
            ``(token_id, token_prob, duration_bin)``: argmax of the token
            logits, the softmax probability of that best token, and argmax of
            the duration logits.
        """
        # NOTE(review): convert_joint traces this with (1, 512, 1) and (1, 640, 1)
        # inputs; concatenating on the last axis needs equal sizes on axis 1 -
        # confirm the intended tensor layout.
        fused = torch.cat([encoder_step, decoder_step], dim=-1)
        hidden = self.joint.joint_net(fused)

        # Token head: best class and its probability.
        token_logits = self.joint.joint(hidden)
        token_id = token_logits.argmax(dim=-1)
        token_prob, _ = token_logits.softmax(dim=-1).max(dim=-1)

        # Duration head: best bin only.
        duration_bin = self.joint.tdt_joint(hidden).argmax(dim=-1)

        return token_id, token_prob, duration_bin
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def convert_preprocessor(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the preprocessor and save it as ``Preprocessor.mlpackage``.

    Args:
        model: Loaded NeMo model whose ``preprocessor`` is exported.
        output_dir: Directory the ``.mlpackage`` is written into.
    """
    print("Converting Preprocessor...")

    module = PreprocessorWrapper(model.preprocessor)
    module.eval()

    # Example inputs for tracing: 240000 samples, i.e. 15 seconds at 16kHz.
    example_audio = torch.randn(1, 240000)
    example_length = torch.tensor([240000])
    traced = torch.jit.trace(module, (example_audio, example_length))

    # The sample axis is flexible between 16000 and 240000 samples.
    input_specs = [
        ct.TensorType(name="audio_signal", shape=(1, ct.RangeDim(16000, 240000))),
        ct.TensorType(name="audio_length", shape=(1,)),
    ]
    output_specs = [
        ct.TensorType(name="mel"),
        ct.TensorType(name="mel_length"),
    ]

    mlmodel = ct.convert(
        traced,
        inputs=input_specs,
        outputs=output_specs,
        minimum_deployment_target=ct.target.iOS16,
    )

    mlmodel.save(output_dir / "Preprocessor.mlpackage")
    print(" Saved Preprocessor.mlpackage")
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def convert_encoder(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the encoder and save it as ``Encoder.mlpackage``.

    Args:
        model: Loaded NeMo model whose ``encoder`` is exported.
        output_dir: Directory the ``.mlpackage`` is written into.
    """
    print("Converting Encoder...")

    module = EncoderWrapper(model.encoder)
    module.eval()

    # Example mel input for tracing: 80 mel bins, 1500 frames (~15 seconds).
    example_mel = torch.randn(1, 80, 1500)
    example_length = torch.tensor([1500])
    traced = torch.jit.trace(module, (example_mel, example_length))

    # The frame axis is flexible between 100 and 1500 frames.
    input_specs = [
        ct.TensorType(name="mel", shape=(1, 80, ct.RangeDim(100, 1500))),
        ct.TensorType(name="mel_length", shape=(1,)),
    ]
    output_specs = [
        ct.TensorType(name="encoder"),
        ct.TensorType(name="encoder_length"),
    ]

    mlmodel = ct.convert(
        traced,
        inputs=input_specs,
        outputs=output_specs,
        minimum_deployment_target=ct.target.iOS16,
    )

    mlmodel.save(output_dir / "Encoder.mlpackage")
    print(" Saved Encoder.mlpackage")
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def convert_ctc_head(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the CTC head and save it as ``CTCHead.mlpackage``.

    Args:
        model: Loaded NeMo model whose ``ctc_decoder`` is exported.
        output_dir: Directory the ``.mlpackage`` is written into.
    """
    print("Converting CTCHead...")

    module = CTCHeadWrapper(model.ctc_decoder)
    module.eval()

    # Example encoder output for tracing, laid out as (batch, time, hidden).
    example_features = torch.randn(1, 188, 512)
    traced = torch.jit.trace(module, example_features)

    # The middle axis is flexible between 10 and 300 steps.
    input_specs = [
        ct.TensorType(name="encoder_output", shape=(1, ct.RangeDim(10, 300), 512)),
    ]
    output_specs = [
        ct.TensorType(name="ctc_log_probs"),
    ]

    mlmodel = ct.convert(
        traced,
        inputs=input_specs,
        outputs=output_specs,
        minimum_deployment_target=ct.target.iOS16,
    )

    mlmodel.save(output_dir / "CTCHead.mlpackage")
    print(" Saved CTCHead.mlpackage")
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def convert_decoder(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the TDT prediction network and save it as ``Decoder.mlpackage``.

    The LSTM state is exposed as explicit ``h_in``/``c_in`` inputs and
    ``h_out``/``c_out`` outputs.

    Args:
        model: Loaded NeMo model whose ``decoder`` is exported.
        output_dir: Directory the ``.mlpackage`` is written into.
    """
    print("Converting Decoder...")

    module = DecoderWrapper(model.decoder)
    module.eval()

    hidden_size = module.hidden_size
    num_layers = model.decoder.pred_num_layers
    state_shape = (num_layers, 1, hidden_size)

    # Example inputs for tracing: a single token id plus zeroed LSTM state.
    example_inputs = (
        torch.zeros(1, 1, dtype=torch.long),
        torch.tensor([1]),
        torch.zeros(*state_shape),
        torch.zeros(*state_shape),
    )
    traced = torch.jit.trace(module, example_inputs)

    input_specs = [
        ct.TensorType(name="targets", shape=(1, 1)),
        ct.TensorType(name="target_length", shape=(1,)),
        ct.TensorType(name="h_in", shape=state_shape),
        ct.TensorType(name="c_in", shape=state_shape),
    ]
    output_specs = [
        ct.TensorType(name="decoder"),
        ct.TensorType(name="h_out"),
        ct.TensorType(name="c_out"),
    ]

    mlmodel = ct.convert(
        traced,
        inputs=input_specs,
        outputs=output_specs,
        minimum_deployment_target=ct.target.iOS16,
    )

    mlmodel.save(output_dir / "Decoder.mlpackage")
    print(" Saved Decoder.mlpackage")
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def convert_joint(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Trace the joint network and save it as ``JointDecision.mlpackage``.

    Args:
        model: Loaded NeMo model whose ``joint`` is exported.
        output_dir: Directory the ``.mlpackage`` is written into.
    """
    print("Converting JointDecision...")

    module = JointDecisionWrapper(model.joint)
    module.eval()

    # Example single-step inputs for tracing (fixed shapes, no flexible axes).
    encoder_shape = (1, 512, 1)
    decoder_shape = (1, 640, 1)
    example_encoder = torch.randn(*encoder_shape)
    example_decoder = torch.randn(*decoder_shape)
    traced = torch.jit.trace(module, (example_encoder, example_decoder))

    input_specs = [
        ct.TensorType(name="encoder_step", shape=encoder_shape),
        ct.TensorType(name="decoder_step", shape=decoder_shape),
    ]
    output_specs = [
        ct.TensorType(name="token_id"),
        ct.TensorType(name="token_prob"),
        ct.TensorType(name="duration_bin"),
    ]

    mlmodel = ct.convert(
        traced,
        inputs=input_specs,
        outputs=output_specs,
        minimum_deployment_target=ct.target.iOS16,
    )

    mlmodel.save(output_dir / "JointDecision.mlpackage")
    print(" Saved JointDecision.mlpackage")
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def extract_vocabulary(model: EncDecHybridRNNTCTCModel, output_dir: Path):
    """Write the model's token vocabulary to ``vocab.json``.

    The vocabulary is read from ``model.decoding.decoding.vocabulary`` and
    stored as a JSON object mapping each token's index to the token string
    (JSON serialisation turns the integer indices into string keys).

    Args:
        model: Loaded NeMo model to read the vocabulary from.
        output_dir: Directory ``vocab.json`` is written into.

    Returns:
        The number of tokens written.
    """
    print("Extracting vocabulary...")

    vocab = model.decoding.decoding.vocabulary
    vocab_dict = dict(enumerate(vocab))

    # ensure_ascii=False emits non-ASCII tokens verbatim, so the file encoding
    # must be pinned; otherwise the locale default is used and the write can
    # fail or produce a file that is not valid UTF-8 JSON.
    with open(output_dir / "vocab.json", "w", encoding="utf-8") as f:
        json.dump(vocab_dict, f, indent=2, ensure_ascii=False)

    print(f" Saved vocab.json ({len(vocab_dict)} tokens)")
    return len(vocab_dict)
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def create_metadata(model: EncDecHybridRNNTCTCModel, output_dir: Path, vocab_size: int):
    """Write ``metadata.json`` describing the exported model.

    Args:
        model: Loaded NeMo model; ``decoder.pred_hidden`` and
            ``decoder.pred_num_layers`` are recorded.
        output_dir: Directory ``metadata.json`` is written into.
        vocab_size: Number of vocabulary tokens; ``vocab_with_blank`` is
            recorded as this value plus one.
    """
    print("Creating metadata...")

    # Logical component name -> package file name inside output_dir.
    components = {
        "preprocessor": "Preprocessor.mlpackage",
        "encoder": "Encoder.mlpackage",
        "ctc_head": "CTCHead.mlpackage",
        "decoder": "Decoder.mlpackage",
        "joint_decision": "JointDecision.mlpackage"
    }

    decoder = model.decoder
    metadata = {
        "model_id": "nvidia/parakeet-tdt_ctc-110m",
        "sample_rate": 16000,
        "max_audio_seconds": 15.0,
        "max_audio_samples": 240000,
        "vocab_size": vocab_size,
        "vocab_with_blank": vocab_size + 1,
        "num_extra": 5,  # TDT duration bins
        "decoder_hidden_dim": decoder.pred_hidden,
        "decoder_num_layers": decoder.pred_num_layers,
        "components": components,
    }

    metadata_path = output_dir / "metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)

    print(" Saved metadata.json")
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def main():
    """Command-line entry point: load the NeMo model and export every component.

    Parses ``--output-dir`` and ``--model-name``, creates the output directory,
    converts the five components in order, then writes the vocabulary and
    metadata files.
    """
    parser = argparse.ArgumentParser(description="Convert Parakeet-TDT-CTC-110M to CoreML")
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./model",
        help="Output directory for CoreML models",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="nvidia/parakeet-tdt_ctc-110m",
        help="NeMo model name or path",
    )
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    model = load_nemo_model(args.model_name)

    # Export each component in the same fixed order.
    converters = (
        convert_preprocessor,
        convert_encoder,
        convert_ctc_head,
        convert_decoder,
        convert_joint,
    )
    for convert in converters:
        convert(model, output_dir)

    # The vocabulary size feeds into the metadata file.
    vocab_size = extract_vocabulary(model, output_dir)
    create_metadata(model, output_dir, vocab_size)

    print(f"\nConversion complete! Models saved to: {output_dir}")
    print("\nTo compile models for Apple Silicon:")
    print(" xcrun coremlcompiler compile Encoder.mlpackage Encoder.mlmodelc")


if __name__ == "__main__":
    main()
|
inference.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Inference script for Parakeet-TDT-CTC-110M CoreML model.
|
| 4 |
+
|
| 5 |
+
This script demonstrates how to run inference using the converted CoreML models
|
| 6 |
+
on Apple Silicon. It supports both TDT (Token-Duration Transducer) decoding for
|
| 7 |
+
full transcription and CTC decoding for keyword spotting.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
uv run scripts/inference.py --audio audio.wav --mode tdt
|
| 11 |
+
uv run scripts/inference.py --audio audio.wav --mode ctc
|
| 12 |
+
|
| 13 |
+
Requirements:
|
| 14 |
+
- macOS 13+ with Apple Silicon
|
| 15 |
+
- Python 3.10+
|
| 16 |
+
- coremltools
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import argparse
|
| 20 |
+
import json
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
import coremltools as ct
|
| 24 |
+
import numpy as np
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class ParakeetCoreML:
|
| 28 |
+
"""CoreML inference wrapper for Parakeet-TDT-CTC-110M."""
|
| 29 |
+
|
| 30 |
+
def __init__(self, model_dir: str):
|
| 31 |
+
"""Load CoreML models from directory.
|
| 32 |
+
|
| 33 |
+
Args:
|
| 34 |
+
model_dir: Path to directory containing .mlpackage files
|
| 35 |
+
"""
|
| 36 |
+
self.model_dir = Path(model_dir)
|
| 37 |
+
|
| 38 |
+
# Load metadata
|
| 39 |
+
with open(self.model_dir / "metadata.json") as f:
|
| 40 |
+
self.metadata = json.load(f)
|
| 41 |
+
|
| 42 |
+
# Load vocabulary
|
| 43 |
+
with open(self.model_dir / "vocab.json") as f:
|
| 44 |
+
vocab_dict = json.load(f)
|
| 45 |
+
self.vocab = {int(k): v for k, v in vocab_dict.items()}
|
| 46 |
+
|
| 47 |
+
self.blank_id = len(self.vocab) # Blank token is last
|
| 48 |
+
|
| 49 |
+
# Load models
|
| 50 |
+
print("Loading CoreML models...")
|
| 51 |
+
self.preprocessor = ct.models.MLModel(
|
| 52 |
+
str(self.model_dir / "Preprocessor.mlpackage")
|
| 53 |
+
)
|
| 54 |
+
self.encoder = ct.models.MLModel(
|
| 55 |
+
str(self.model_dir / "Encoder.mlpackage")
|
| 56 |
+
)
|
| 57 |
+
self.ctc_head = ct.models.MLModel(
|
| 58 |
+
str(self.model_dir / "CTCHead.mlpackage")
|
| 59 |
+
)
|
| 60 |
+
self.decoder = ct.models.MLModel(
|
| 61 |
+
str(self.model_dir / "Decoder.mlpackage")
|
| 62 |
+
)
|
| 63 |
+
self.joint = ct.models.MLModel(
|
| 64 |
+
str(self.model_dir / "JointDecision.mlpackage")
|
| 65 |
+
)
|
| 66 |
+
print("Models loaded successfully.")
|
| 67 |
+
|
| 68 |
+
def load_audio(self, audio_path: str) -> np.ndarray:
|
| 69 |
+
"""Load audio file and convert to 16kHz mono.
|
| 70 |
+
|
| 71 |
+
Args:
|
| 72 |
+
audio_path: Path to audio file (WAV, MP3, etc.)
|
| 73 |
+
|
| 74 |
+
Returns:
|
| 75 |
+
Audio samples as float32 numpy array
|
| 76 |
+
"""
|
| 77 |
+
try:
|
| 78 |
+
import librosa
|
| 79 |
+
audio, sr = librosa.load(audio_path, sr=16000, mono=True)
|
| 80 |
+
return audio.astype(np.float32)
|
| 81 |
+
except ImportError:
|
| 82 |
+
# Fallback to scipy for WAV files
|
| 83 |
+
from scipy.io import wavfile
|
| 84 |
+
sr, audio = wavfile.read(audio_path)
|
| 85 |
+
|
| 86 |
+
# Convert to mono if stereo
|
| 87 |
+
if len(audio.shape) > 1:
|
| 88 |
+
audio = audio.mean(axis=1)
|
| 89 |
+
|
| 90 |
+
# Resample if needed
|
| 91 |
+
if sr != 16000:
|
| 92 |
+
from scipy import signal
|
| 93 |
+
num_samples = int(len(audio) * 16000 / sr)
|
| 94 |
+
audio = signal.resample(audio, num_samples)
|
| 95 |
+
|
| 96 |
+
# Normalize to float32 [-1, 1]
|
| 97 |
+
if audio.dtype == np.int16:
|
| 98 |
+
audio = audio.astype(np.float32) / 32768.0
|
| 99 |
+
elif audio.dtype == np.int32:
|
| 100 |
+
audio = audio.astype(np.float32) / 2147483648.0
|
| 101 |
+
|
| 102 |
+
return audio.astype(np.float32)
|
| 103 |
+
|
| 104 |
+
def preprocess(self, audio: np.ndarray) -> tuple[np.ndarray, int]:
|
| 105 |
+
"""Convert audio to mel spectrogram.
|
| 106 |
+
|
| 107 |
+
Args:
|
| 108 |
+
audio: Audio samples as float32 array
|
| 109 |
+
|
| 110 |
+
Returns:
|
| 111 |
+
Tuple of (mel spectrogram, mel length)
|
| 112 |
+
"""
|
| 113 |
+
audio_signal = audio.reshape(1, -1).astype(np.float32)
|
| 114 |
+
audio_length = np.array([len(audio)], dtype=np.int32)
|
| 115 |
+
|
| 116 |
+
result = self.preprocessor.predict({
|
| 117 |
+
"audio_signal": audio_signal,
|
| 118 |
+
"audio_length": audio_length
|
| 119 |
+
})
|
| 120 |
+
|
| 121 |
+
return result["mel"], int(result["mel_length"][0])
|
| 122 |
+
|
| 123 |
+
def encode(self, mel: np.ndarray, mel_length: int) -> tuple[np.ndarray, int]:
|
| 124 |
+
"""Run encoder on mel spectrogram.
|
| 125 |
+
|
| 126 |
+
Args:
|
| 127 |
+
mel: Mel spectrogram from preprocessor
|
| 128 |
+
mel_length: Length of mel spectrogram
|
| 129 |
+
|
| 130 |
+
Returns:
|
| 131 |
+
Tuple of (encoder output, encoder length)
|
| 132 |
+
"""
|
| 133 |
+
result = self.encoder.predict({
|
| 134 |
+
"mel": mel,
|
| 135 |
+
"mel_length": np.array([mel_length], dtype=np.int32)
|
| 136 |
+
})
|
| 137 |
+
|
| 138 |
+
return result["encoder"], int(result["encoder_length"][0])
|
| 139 |
+
|
| 140 |
+
def decode_ctc(self, encoder_output: np.ndarray) -> list[int]:
|
| 141 |
+
"""CTC greedy decoding.
|
| 142 |
+
|
| 143 |
+
Args:
|
| 144 |
+
encoder_output: Output from encoder
|
| 145 |
+
|
| 146 |
+
Returns:
|
| 147 |
+
List of token IDs (with duplicates and blanks removed)
|
| 148 |
+
"""
|
| 149 |
+
result = self.ctc_head.predict({"encoder_output": encoder_output})
|
| 150 |
+
log_probs = result["ctc_log_probs"]
|
| 151 |
+
|
| 152 |
+
# Greedy decoding: take argmax at each timestep
|
| 153 |
+
predictions = np.argmax(log_probs[0], axis=-1)
|
| 154 |
+
|
| 155 |
+
# Remove duplicates and blanks
|
| 156 |
+
tokens = []
|
| 157 |
+
prev_token = self.blank_id
|
| 158 |
+
for token in predictions:
|
| 159 |
+
if token != self.blank_id and token != prev_token:
|
| 160 |
+
tokens.append(int(token))
|
| 161 |
+
prev_token = token
|
| 162 |
+
|
| 163 |
+
return tokens
|
| 164 |
+
|
| 165 |
+
def decode_tdt(self, encoder_output: np.ndarray, encoder_length: int) -> list[int]:
|
| 166 |
+
"""TDT (Token-Duration Transducer) decoding.
|
| 167 |
+
|
| 168 |
+
Args:
|
| 169 |
+
encoder_output: Output from encoder
|
| 170 |
+
encoder_length: Length of encoder output
|
| 171 |
+
|
| 172 |
+
Returns:
|
| 173 |
+
List of token IDs
|
| 174 |
+
"""
|
| 175 |
+
hidden_size = self.metadata["decoder_hidden_dim"]
|
| 176 |
+
num_layers = self.metadata["decoder_num_layers"]
|
| 177 |
+
|
| 178 |
+
# Initialize decoder state
|
| 179 |
+
h = np.zeros((num_layers, 1, hidden_size), dtype=np.float32)
|
| 180 |
+
c = np.zeros((num_layers, 1, hidden_size), dtype=np.float32)
|
| 181 |
+
|
| 182 |
+
# Start with blank token
|
| 183 |
+
targets = np.zeros((1, 1), dtype=np.int32)
|
| 184 |
+
target_length = np.array([1], dtype=np.int32)
|
| 185 |
+
|
| 186 |
+
tokens = []
|
| 187 |
+
frame = 0
|
| 188 |
+
max_tokens = 1000 # Safety limit
|
| 189 |
+
|
| 190 |
+
while frame < encoder_length and len(tokens) < max_tokens:
|
| 191 |
+
# Get decoder output
|
| 192 |
+
decoder_result = self.decoder.predict({
|
| 193 |
+
"targets": targets,
|
| 194 |
+
"target_length": target_length,
|
| 195 |
+
"h_in": h,
|
| 196 |
+
"c_in": c
|
| 197 |
+
})
|
| 198 |
+
|
| 199 |
+
decoder_output = decoder_result["decoder"]
|
| 200 |
+
h = decoder_result["h_out"]
|
| 201 |
+
c = decoder_result["c_out"]
|
| 202 |
+
|
| 203 |
+
# Get encoder step
|
| 204 |
+
encoder_step = encoder_output[0, frame:frame+1, :].T.reshape(1, -1, 1)
|
| 205 |
+
decoder_step = decoder_output.T.reshape(1, -1, 1)
|
| 206 |
+
|
| 207 |
+
# Joint prediction
|
| 208 |
+
joint_result = self.joint.predict({
|
| 209 |
+
"encoder_step": encoder_step.astype(np.float32),
|
| 210 |
+
"decoder_step": decoder_step.astype(np.float32)
|
| 211 |
+
})
|
| 212 |
+
|
| 213 |
+
token_id = int(joint_result["token_id"])
|
| 214 |
+
duration_bin = int(joint_result["duration_bin"])
|
| 215 |
+
|
| 216 |
+
# Duration bins: 0=0, 1=1, 2=2, 3=3, 4=4+
|
| 217 |
+
durations = [0, 1, 2, 3, 4]
|
| 218 |
+
duration = durations[min(duration_bin, 4)]
|
| 219 |
+
|
| 220 |
+
if token_id != self.blank_id:
|
| 221 |
+
tokens.append(token_id)
|
| 222 |
+
# Update decoder input
|
| 223 |
+
targets = np.array([[token_id]], dtype=np.int32)
|
| 224 |
+
|
| 225 |
+
# Advance by duration (minimum 1 frame)
|
| 226 |
+
frame += max(1, duration)
|
| 227 |
+
|
| 228 |
+
return tokens
|
| 229 |
+
|
| 230 |
+
def tokens_to_text(self, tokens: list[int]) -> str:
|
| 231 |
+
"""Convert token IDs to text.
|
| 232 |
+
|
| 233 |
+
Args:
|
| 234 |
+
tokens: List of token IDs
|
| 235 |
+
|
| 236 |
+
Returns:
|
| 237 |
+
Decoded text string
|
| 238 |
+
"""
|
| 239 |
+
pieces = [self.vocab.get(t, "") for t in tokens]
|
| 240 |
+
# Join and handle SentencePiece encoding
|
| 241 |
+
text = "".join(pieces).replace("▁", " ").strip()
|
| 242 |
+
return text
|
| 243 |
+
|
| 244 |
+
def transcribe(self, audio_path: str, mode: str = "tdt") -> str:
|
| 245 |
+
"""Transcribe audio file.
|
| 246 |
+
|
| 247 |
+
Args:
|
| 248 |
+
audio_path: Path to audio file
|
| 249 |
+
mode: Decoding mode - "tdt" for full transcription, "ctc" for keyword spotting
|
| 250 |
+
|
| 251 |
+
Returns:
|
| 252 |
+
Transcribed text
|
| 253 |
+
"""
|
| 254 |
+
# Load and preprocess audio
|
| 255 |
+
audio = self.load_audio(audio_path)
|
| 256 |
+
mel, mel_length = self.preprocess(audio)
|
| 257 |
+
|
| 258 |
+
# Encode
|
| 259 |
+
encoder_output, encoder_length = self.encode(mel, mel_length)
|
| 260 |
+
|
| 261 |
+
# Decode
|
| 262 |
+
if mode == "ctc":
|
| 263 |
+
tokens = self.decode_ctc(encoder_output)
|
| 264 |
+
else:
|
| 265 |
+
tokens = self.decode_tdt(encoder_output, encoder_length)
|
| 266 |
+
|
| 267 |
+
# Convert to text
|
| 268 |
+
text = self.tokens_to_text(tokens)
|
| 269 |
+
|
| 270 |
+
return text
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def main():
    """Command-line entry point: transcribe one audio file and print the text.

    Reads ``--audio`` (required), ``--model-dir`` and ``--mode`` from the
    command line, loads the CoreML models from the model directory, runs the
    transcription in the requested decoding mode and prints the result.
    """
    cli = argparse.ArgumentParser(
        description="Run inference with Parakeet-TDT-CTC-110M CoreML model"
    )
    cli.add_argument(
        "--audio",
        type=str,
        required=True,
        help="Path to audio file (WAV, MP3, etc.)",
    )
    cli.add_argument(
        "--model-dir",
        type=str,
        default="./model",
        help="Directory containing CoreML model files",
    )
    cli.add_argument(
        "--mode",
        type=str,
        choices=["tdt", "ctc"],
        default="tdt",
        help="Decoding mode: 'tdt' for transcription, 'ctc' for keyword spotting",
    )
    options = cli.parse_args()

    # Models are loaded before the banner is printed, as loading prints its
    # own progress messages.
    recognizer = ParakeetCoreML(options.model_dir)

    banner = (
        f"\nTranscribing: {options.audio}",
        f"Mode: {options.mode.upper()}",
        "-" * 40,
    )
    for line in banner:
        print(line)

    transcript = recognizer.transcribe(options.audio, mode=options.mode)
    print(f"Result: {transcript}")


if __name__ == "__main__":
    main()
|
metadata.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_id": "nvidia/parakeet-tdt_ctc-110m",
|
| 3 |
+
"sample_rate": 16000,
|
| 4 |
+
"max_audio_seconds": 15.0,
|
| 5 |
+
"max_audio_samples": 240000,
|
| 6 |
+
"vocab_size": 1024,
|
| 7 |
+
"vocab_with_blank": 1025,
|
| 8 |
+
"num_extra": 5,
|
| 9 |
+
"decoder_hidden_dim": 640,
|
| 10 |
+
"decoder_num_layers": 1,
|
| 11 |
+
"components": {
|
| 12 |
+
"preprocessor": "Preprocessor.mlpackage",
|
| 13 |
+
"encoder": "Encoder.mlpackage",
|
| 14 |
+
"ctc_head": "CTCHead.mlpackage",
|
| 15 |
+
"decoder": "Decoder.mlpackage",
|
| 16 |
+
"joint_decision": "JointDecision.mlpackage"
|
| 17 |
+
}
|
| 18 |
+
}
|
pyproject.toml
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "parakeet-tdt-ctc-110m-coreml"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "NVIDIA Parakeet-TDT-CTC-110M converted to CoreML format for Apple Silicon"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
license = { text = "Apache-2.0" }
|
| 8 |
+
authors = [
|
| 9 |
+
{ name = "FluidInference" }
|
| 10 |
+
]
|
| 11 |
+
keywords = ["asr", "speech-recognition", "coreml", "apple-silicon", "nvidia", "parakeet"]
|
| 12 |
+
classifiers = [
|
| 13 |
+
"Development Status :: 4 - Beta",
|
| 14 |
+
"Intended Audience :: Developers",
|
| 15 |
+
"License :: OSI Approved :: Apache Software License",
|
| 16 |
+
"Operating System :: MacOS",
|
| 17 |
+
"Programming Language :: Python :: 3",
|
| 18 |
+
"Programming Language :: Python :: 3.10",
|
| 19 |
+
"Programming Language :: Python :: 3.11",
|
| 20 |
+
"Programming Language :: Python :: 3.12",
|
| 21 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 22 |
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
dependencies = [
|
| 26 |
+
"coremltools>=7.0",
|
| 27 |
+
"numpy>=1.24.0",
|
| 28 |
+
"scipy>=1.10.0",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
[project.optional-dependencies]
|
| 32 |
+
convert = [
|
| 33 |
+
"torch>=2.0.0",
|
| 34 |
+
"nemo-toolkit[asr]>=1.20.0",
|
| 35 |
+
]
|
| 36 |
+
audio = [
|
| 37 |
+
"librosa>=0.10.0",
|
| 38 |
+
]
|
| 39 |
+
dev = [
|
| 40 |
+
"pytest>=7.0.0",
|
| 41 |
+
"ruff>=0.1.0",
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
[project.urls]
|
| 45 |
+
Homepage = "https://huggingface.co/FluidInference/parakeet-tdt-ctc-110m-coreml"
|
| 46 |
+
Repository = "https://huggingface.co/FluidInference/parakeet-tdt-ctc-110m-coreml"
|
| 47 |
+
Issues = "https://github.com/FluidInference/fluidaudio/issues"
|
| 48 |
+
|
| 49 |
+
[project.scripts]
|
| 50 |
+
parakeet-inference = "scripts.inference:main"
|
| 51 |
+
|
| 52 |
+
[build-system]
|
| 53 |
+
requires = ["hatchling"]
|
| 54 |
+
build-backend = "hatchling.build"
|
| 55 |
+
|
| 56 |
+
[tool.hatch.build.targets.wheel]
|
| 57 |
+
packages = ["scripts"]
|
| 58 |
+
|
| 59 |
+
[tool.ruff]
|
| 60 |
+
line-length = 100
|
| 61 |
+
target-version = "py310"
|
| 62 |
+
|
| 63 |
+
[tool.ruff.lint]
|
| 64 |
+
select = ["E", "F", "I", "N", "W"]
|
| 65 |
+
ignore = ["E501"]
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
vocab.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"0": "<unk>", "1": " t", "2": " th", "3": " a", "4": "in", "5": "re", "6": " the", "7": " w", "8": " s", "9": " o", "10": "er", "11": "ou", "12": "at", "13": "nd", "14": "it", "15": " h", "16": " c", "17": " b", "18": "is", "19": "en", "20": "on", "21": "ing", "22": " f", "23": " to", "24": " m", "25": "es", "26": " p", "27": "or", "28": "an", "29": " d", "30": "ll", "31": " I", "32": "ed", "33": " and", "34": " l", "35": " of", "36": " in", "37": " y", "38": "ar", "39": " g", "40": " you", "41": "as", "42": "om", "43": " n", "44": "ve", "45": " that", "46": "le", "47": "ic", "48": "us", "49": "ow", "50": "et", "51": "al", "52": " e", "53": "ut", "54": " it", "55": "ot", "56": " be", "57": " T", "58": "ion", "59": " is", "60": " wh", "61": " re", "62": " on", "63": " we", "64": "ent", "65": " A", "66": "ay", "67": " ha", "68": " Th", "69": "id", "70": " S", "71": "ac", "72": "gh", "73": "ver", "74": "ke", "75": " for", "76": "im", "77": "ly", "78": "ur", "79": "ld", "80": " he", "81": " st", "82": "all", "83": "ro", "84": "st", "85": "se", "86": "ct", "87": "ith", "88": "ir", "89": "am", "90": " this", "91": "if", "92": " W", "93": "oo", "94": "ri", "95": " was", "96": "ght", "97": " u", "98": " with", "99": "ad", "100": "ch", "101": " se", "102": " k", "103": " an", "104": " The", "105": " li", "106": " do", "107": " B", "108": " have", "109": " as", "110": "th", "111": " are", "112": " sh", "113": "ust", "114": "ce", "115": "ally", "116": "ill", "117": " H", "118": " j", "119": "ter", "120": " go", "121": " And", "122": "ation", "123": " C", "124": " so", "125": "ome", "126": " not", "127": "op", "128": "il", "129": "ore", "130": " ne", "131": " can", "132": " me", "133": " at", "134": "ould", "135": "ant", "136": " M", "137": " like", "138": "ere", "139": " they", "140": "ra", "141": "ers", "142": " ab", "143": " de", "144": " kn", "145": "ge", "146": " Y", "147": " ch", "148": "ul", "149": "pp", "150": " or", "151": " al", "152": " con", "153": " com", "154": 
"ess", "155": " su", "156": "out", "157": " your", "158": " So", "159": "ate", "160": " one", "161": " all", "162": " ex", "163": "est", "164": " fr", "165": " just", "166": " pro", "167": " know", "168": " O", "169": "ain", "170": " but", "171": "ol", "172": "ive", "173": " v", "174": "use", "175": "very", "176": "art", "177": "qu", "178": " my", "179": "el", "180": " N", "181": "nt", "182": " It", "183": " what", "184": "ab", "185": " P", "186": " wor", "187": " out", "188": " there", "189": " up", "190": "um", "191": " from", "192": "pe", "193": " tw", "194": " r", "195": "and", "196": "ight", "197": "ort", "198": "un", "199": " L", "200": "ist", "201": " about", "202": "ide", "203": "ig", "204": "ake", "205": " D", "206": "em", "207": "os", "208": "king", "209": "rou", "210": "ind", "211": "our", "212": "res", "213": " We", "214": " get", "215": " E", "216": " G", "217": "ack", "218": " le", "219": "ity", "220": "od", "221": " F", "222": "ard", "223": " pl", "224": " our", "225": " int", "226": "ment", "227": " will", "228": "ies", "229": " by", "230": "ink", "231": "ca", "232": " if", "233": "red", "234": "her", "235": "ie", "236": " us", "237": " some", "238": " don", "239": "ven", "240": "ood", "241": "ast", "242": " R", "243": " his", "244": " tim", "245": " tr", "246": " more", "247": "ich", "248": "ous", "249": "ame", "250": " going", "251": " had", "252": " them", "253": "ook", "254": " pe", "255": " Wh", "256": " You", "257": " But", "258": "ine", "259": " here", "260": " would", "261": "cause", "262": "right", "263": "so", "264": "ost", "265": "ure", "266": " has", "267": "ect", "268": " think", "269": " fe", "270": "ong", "271": " see", "272": " when", "273": " who", "274": " were", "275": " really", "276": " their", "277": " want", "278": "one", "279": "ople", "280": " then", "281": " time", "282": " sa", "283": "ap", "284": " te", "285": " He", "286": " ye", "287": "ck", "288": " her", "289": " thing", "290": " right", "291": " which", "292": "itt", 
"293": "ice", "294": "act", "295": " people", "296": "ty", "297": " two", "298": " J", "299": " im", "300": "ther", "301": "ci", "302": "ose", "303": " cl", "304": " qu", "305": " man", "306": " also", "307": "ree", "308": " en", "309": "ud", "310": " how", "311": "reat", "312": "ak", "313": "hing", "314": "ag", "315": " any", "316": "ff", "317": "ace", "318": "per", "319": " because", "320": " very", "321": "own", "322": " ad", "323": " act", "324": " been", "325": " now", "326": " ag", "327": " into", "328": " comp", "329": "ars", "330": "ions", "331": "are", "332": "ite", "333": "iv", "334": " these", "335": "ays", "336": "ep", "337": " This", "338": " she", "339": "ans", "340": "ah", "341": "een", "342": " over", "343": "ry", "344": " lo", "345": "age", "346": " pr", "347": " sp", "348": "ue", "349": " co", "350": "ick", "351": "ber", "352": " did", "353": "ip", "354": "ach", "355": " back", "356": " no", "357": " cont", "358": " other", "359": " every", "360": "pt", "361": " need", "362": " him", "363": " U", "364": " In", "365": " work", "366": "irst", "367": " part", "368": " look", "369": "ittle", "370": "ble", "371": "iz", "372": " un", "373": " make", "374": "omet", "375": "nder", "376": "ish", "377": "na", "378": " little", "379": " off", "380": " than", "381": " got", "382": "ually", "383": " per", "384": " good", "385": " way", "386": " could", "387": " ac", "388": " imp", "389": "able", "390": " where", "391": "iff", "392": " That", "393": " res", "394": "ount", "395": "pl", "396": "ance", "397": " first", "398": " ro", "399": " pre", "400": "ass", "401": " say", "402": "int", "403": "ated", "404": "ire", "405": "uch", "406": "ase", "407": " somet", "408": "ound", "409": " down", "410": " diff", "411": "sel", "412": " gu", "413": " am", "414": "ress", "415": " lot", "416": "ence", "417": " dis", "418": "orm", "419": "ix", "420": " po", "421": "ving", "422": "enty", "423": " K", "424": " spe", "425": "und", "426": "he", "427": " much", "428": " ar", 
"429": "round", "430": " app", "431": "co", "432": "ark", "433": " new", "434": "ater", "435": "ult", "436": "end", "437": " even", "438": " start", "439": "ations", "440": "rough", "441": "ile", "442": "fter", "443": " well", "444": "be", "445": " They", "446": " three", "447": "ign", "448": "ild", "449": " said", "450": "ough", "451": "ang", "452": " too", "453": "ade", "454": " bl", "455": "ens", "456": " inc", "457": "ia", "458": " those", "459": " mo", "460": " take", "461": " through", "462": " fl", "463": " kind", "464": " things", "465": " bet", "466": " only", "467": " St", "468": " let", "469": "cess", "470": " Ch", "471": "ary", "472": "vel", "473": " If", "474": "xt", "475": "other", "476": "av", "477": "ical", "478": "ord", "479": " again", "480": " something", "481": "onna", "482": "fore", "483": " may", "484": "ting", "485": " bu", "486": " differe", "487": "urn", "488": " gonna", "489": " does", "490": "uct", "491": "og", "492": " twenty", "493": " gr", "494": " Ye", "495": "wn", "496": " should", "497": " comm", "498": "ition", "499": " under", "500": " hel", "501": "ory", "502": " fo", "503": " use", "504": "igh", "505": "ife", "506": " actually", "507": " tal", "508": " call", "509": "ents", "510": "ious", "511": "ull", "512": " There", "513": " Yeah", "514": " most", "515": " ke", "516": "ors", "517": "ved", "518": "ys", "519": " sc", "520": " happ", "521": "ope", "522": " help", "523": "atch", "524": " What", "525": " rem", "526": "ple", "527": " Now", "528": " br", "529": "ool", "530": "oth", "531": " four", "532": "self", "533": " str", "534": "ne", "535": "thing", "536": " put", "537": "ial", "538": " great", "539": "ail", "540": "ub", "541": "ning", "542": " sm", "543": " feel", "544": " five", "545": "ody", "546": "undred", "547": "iss", "548": "ank", "549": "get", "550": "aking", "551": " many", "552": " hundred", "553": " years", "554": " being", "555": " come", "556": " mean", "557": "ily", "558": " different", "559": " after", "560": " 
ser", "561": " show", "562": "form", "563": "ful", "564": "oy", "565": " six", "566": " vide", "567": " V", "568": " its", "569": " point", "570": " day", "571": " des", "572": "ons", "573": " bit", "574": " bel", "575": " before", "576": " aw", "577": " end", "578": " Oh", "579": " still", "580": "ath", "581": " long", "582": " '", "583": "ise", "584": "ob", "585": "day", "586": " add", "587": "ft", "588": "ves", "589": "ces", "590": "ady", "591": " cr", "592": " around", "593": " try", "594": "les", "595": "vers", "596": "kay", "597": "ian", "598": "ates", "599": " find", "600": "ward", "601": " As", "602": " eight", "603": "lic", "604": " same", "605": " pos", "606": " em", "607": " made", "608": " supp", "609": " life", "610": " Be", "611": "pect", "612": " dec", "613": " play", "614": "ange", "615": " att", "616": " pers", "617": "ways", "618": " high", "619": " hand", "620": " next", "621": " cons", "622": " own", "623": " inv", "624": "ower", "625": " ind", "626": "ert", "627": "ng", "628": "ave", "629": " year", "630": " big", "631": "ating", "632": " world", "633": " rel", "634": " sure", "635": " tra", "636": "ew", "637": "ered", "638": " fin", "639": " Well", "640": " sl", "641": " doing", "642": "bs", "643": " set", "644": " rec", "645": "ual", "646": "cial", "647": " ph", "648": "erm", "649": " love", "650": "ph", "651": " real", "652": " last", "653": "ict", "654": " bo", "655": " ra", "656": "ible", "657": " wr", "658": "mer", "659": " count", "660": "ities", "661": " always", "662": "inet", "663": "ments", "664": "uc", "665": " might", "666": " inter", "667": " video", "668": "gin", "669": " tell", "670": " never", "671": "vent", "672": " import", "673": "ied", "674": " sy", "675": " How", "676": "ically", "677": "ought", "678": " thir", "679": " rep", "680": "ks", "681": "ib", "682": " fam", "683": "ject", "684": " bas", "685": " She", "686": " give", "687": "akes", "688": " ninet", "689": " reg", "690": " min", "691": " op", "692": " def", "693": 
" didn", "694": "te", "695": " cour", "696": " why", "697": " ent", "698": " place", "699": " ins", "700": " car", "701": "ather", "702": " person", "703": "ular", "704": " inst", "705": " prod", "706": "lect", "707": " Al", "708": " today", "709": " bec", "710": " sur", "711": " All", "712": " another", "713": " bus", "714": " keep", "715": "ell", "716": "ese", "717": "riend", "718": " quest", "719": " talk", "720": "als", "721": "ings", "722": " mon", "723": "cond", "724": "old", "725": " acc", "726": " la", "727": " num", "728": "ident", "729": " che", "730": "iness", "731": " turn", "732": " ear", "733": " No", "734": "ousand", "735": " better", "736": "ific", "737": " loo", "738": " gl", "739": "oc", "740": " important", "741": "ited", "742": " An", "743": " thousand", "744": "ility", "745": "llow", "746": " used", "747": " gen", "748": " sim", "749": "li", "750": " happen", "751": " Un", "752": " Let", "753": "air", "754": "ock", "755": "ably", "756": "gg", "757": " watch", "758": " For", "759": " sw", "760": "ren", "761": "ute", "762": "ever", "763": " pol", "764": " sch", "765": " When", "766": " such", "767": " fif", "768": " home", "769": " cle", "770": " contin", "771": "ouse", "772": " friend", "773": "uring", "774": " Okay", "775": "gr", "776": " able", "777": " stud", "778": " eff", "779": "hip", "780": "body", "781": " top", "782": "ness", "783": " exper", "784": " pret", "785": " both", "786": " done", "787": "cri", "788": " mark", "789": " while", "790": " old", "791": "ros", "792": "ont", "793": " second", "794": "ative", "795": " thought", "796": " best", "797": " found", "798": "iew", "799": " belie", "800": " each", "801": "erest", "802": " tri", "803": " eas", "804": " ca", "805": " fact", "806": " care", "807": " fun", "808": "atter", "809": "ures", "810": " head", "811": " lear", "812": " water", "813": " hard", "814": " few", "815": " side", "816": "ween", "817": " exp", "818": " away", "819": "its", "820": " ext", "821": "lud", "822": " 
run", "823": " trans", "824": "ince", "825": " sk", "826": " open", "827": "cus", "828": " between", "829": " called", "830": " wee", "831": " pretty", "832": "ason", "833": " far", "834": "ember", "835": "omm", "836": " interest", "837": "any", "838": "ner", "839": "uff", "840": " pres", "841": " cur", "842": " child", "843": "ee", "844": " toget", "845": " together", "846": "olog", "847": " God", "848": "ond", "849": " char", "850": " looking", "851": "stem", "852": "az", "853": "cent", "854": " ob", "855": " ass", "856": "land", "857": " doesn", "858": " business", "859": " course", "860": " ten", "861": "ps", "862": "arch", "863": "ced", "864": "ms", "865": "ize", "866": "nce", "867": " ref", "868": " name", "869": "ross", "870": " grow", "871": "oney", "872": " went", "873": "ics", "874": "teen", "875": " cou", "876": " prob", "877": " ret", "878": " guys", "879": " came", "880": "ash", "881": "led", "882": " Eur", "883": "ues", "884": " ide", "885": "gan", "886": " everything", "887": " getting", "888": " ask", "889": " cor", "890": " build", "891": " sign", "892": " small", "893": "uck", "894": " el", "895": " col", "896": " Is", "897": "ational", "898": "stand", "899": "cy", "900": " conf", "901": "der", "902": " bre", "903": " cap", "904": " mod", "905": "ets", "906": "ike", "907": " number", "908": " comple", "909": "ertain", "910": " ever", "911": " coll", "912": " hum", "913": " Europe", "914": " cre", "915": " met", "916": " exam", "917": " move", "918": " pass", "919": " left", "920": " system", "921": " includ", "922": " Thank", "923": "cept", "924": " wom", "925": " product", "926": "ten", "927": " rest", "928": " probably", "929": " dri", "930": " Do", "931": " gener", "932": " anything", "933": " lar", "934": " My", "935": " school", "936": " lead", "937": " sub", "938": " ty", "939": " plan", "940": " seem", "941": " whole", "942": "irect", "943": " light", "944": " must", "945": " mom", "946": " opp", "947": " support", "948": " family", "949": 
"ices", "950": "amp", "951": " proble", "952": " dr", "953": "ready", "954": " using", "955": "ense", "956": " prov", "957": "ush", "958": "ax", "959": " power", "960": " Re", "961": "alth", "962": " ev", "963": " stand", "964": " war", "965": "ts", "966": " ", "967": "e", "968": "t", "969": "o", "970": "a", "971": "n", "972": "i", "973": "s", "974": "r", "975": "h", "976": "l", "977": "d", "978": "u", "979": "c", "980": "m", "981": "y", "982": "g", "983": "w", "984": "f", "985": "p", "986": ".", "987": "b", "988": ",", "989": "v", "990": "k", "991": "'", "992": "I", "993": "T", "994": "A", "995": "S", "996": "x", "997": "W", "998": "j", "999": "B", "1000": "C", "1001": "H", "1002": "?", "1003": "M", "1004": "O", "1005": "Y", "1006": "N", "1007": "P", "1008": "E", "1009": "q", "1010": "L", "1011": "D", "1012": "z", "1013": "G", "1014": "F", "1015": "R", "1016": "!", "1017": "J", "1018": "U", "1019": "K", "1020": "V", "1021": "Q", "1022": "Z", "1023": "X"}
|
|
|
|
| 1 |
+
{"0": "<unk>", "1": "\u2581t", "2": "\u2581th", "3": "\u2581a", "4": "in", "5": "re", "6": "\u2581the", "7": "\u2581w", "8": "\u2581s", "9": "\u2581o", "10": "er", "11": "ou", "12": "at", "13": "nd", "14": "it", "15": "\u2581h", "16": "\u2581c", "17": "\u2581b", "18": "is", "19": "en", "20": "on", "21": "ing", "22": "\u2581f", "23": "\u2581to", "24": "\u2581m", "25": "es", "26": "\u2581p", "27": "or", "28": "an", "29": "\u2581d", "30": "ll", "31": "\u2581I", "32": "ed", "33": "\u2581and", "34": "\u2581l", "35": "\u2581of", "36": "\u2581in", "37": "\u2581y", "38": "ar", "39": "\u2581g", "40": "\u2581you", "41": "as", "42": "om", "43": "\u2581n", "44": "ve", "45": "\u2581that", "46": "le", "47": "ic", "48": "us", "49": "ow", "50": "et", "51": "al", "52": "\u2581e", "53": "ut", "54": "\u2581it", "55": "ot", "56": "\u2581be", "57": "\u2581T", "58": "ion", "59": "\u2581is", "60": "\u2581wh", "61": "\u2581re", "62": "\u2581on", "63": "\u2581we", "64": "ent", "65": "\u2581A", "66": "ay", "67": "\u2581ha", "68": "\u2581Th", "69": "id", "70": "\u2581S", "71": "ac", "72": "gh", "73": "ver", "74": "ke", "75": "\u2581for", "76": "im", "77": "ly", "78": "ur", "79": "ld", "80": "\u2581he", "81": "\u2581st", "82": "all", "83": "ro", "84": "st", "85": "se", "86": "ct", "87": "ith", "88": "ir", "89": "am", "90": "\u2581this", "91": "if", "92": "\u2581W", "93": "oo", "94": "ri", "95": "\u2581was", "96": "ght", "97": "\u2581u", "98": "\u2581with", "99": "ad", "100": "ch", "101": "\u2581se", "102": "\u2581k", "103": "\u2581an", "104": "\u2581The", "105": "\u2581li", "106": "\u2581do", "107": "\u2581B", "108": "\u2581have", "109": "\u2581as", "110": "th", "111": "\u2581are", "112": "\u2581sh", "113": "ust", "114": "ce", "115": "ally", "116": "ill", "117": "\u2581H", "118": "\u2581j", "119": "ter", "120": "\u2581go", "121": "\u2581And", "122": "ation", "123": "\u2581C", "124": "\u2581so", "125": "ome", "126": "\u2581not", "127": "op", "128": "il", "129": "ore", "130": "\u2581ne", "131": 
"\u2581can", "132": "\u2581me", "133": "\u2581at", "134": "ould", "135": "ant", "136": "\u2581M", "137": "\u2581like", "138": "ere", "139": "\u2581they", "140": "ra", "141": "ers", "142": "\u2581ab", "143": "\u2581de", "144": "\u2581kn", "145": "ge", "146": "\u2581Y", "147": "\u2581ch", "148": "ul", "149": "pp", "150": "\u2581or", "151": "\u2581al", "152": "\u2581con", "153": "\u2581com", "154": "ess", "155": "\u2581su", "156": "out", "157": "\u2581your", "158": "\u2581So", "159": "ate", "160": "\u2581one", "161": "\u2581all", "162": "\u2581ex", "163": "est", "164": "\u2581fr", "165": "\u2581just", "166": "\u2581pro", "167": "\u2581know", "168": "\u2581O", "169": "ain", "170": "\u2581but", "171": "ol", "172": "ive", "173": "\u2581v", "174": "use", "175": "very", "176": "art", "177": "qu", "178": "\u2581my", "179": "el", "180": "\u2581N", "181": "nt", "182": "\u2581It", "183": "\u2581what", "184": "ab", "185": "\u2581P", "186": "\u2581wor", "187": "\u2581out", "188": "\u2581there", "189": "\u2581up", "190": "um", "191": "\u2581from", "192": "pe", "193": "\u2581tw", "194": "\u2581r", "195": "and", "196": "ight", "197": "ort", "198": "un", "199": "\u2581L", "200": "ist", "201": "\u2581about", "202": "ide", "203": "ig", "204": "ake", "205": "\u2581D", "206": "em", "207": "os", "208": "king", "209": "rou", "210": "ind", "211": "our", "212": "res", "213": "\u2581We", "214": "\u2581get", "215": "\u2581E", "216": "\u2581G", "217": "ack", "218": "\u2581le", "219": "ity", "220": "od", "221": "\u2581F", "222": "ard", "223": "\u2581pl", "224": "\u2581our", "225": "\u2581int", "226": "ment", "227": "\u2581will", "228": "ies", "229": "\u2581by", "230": "ink", "231": "ca", "232": "\u2581if", "233": "red", "234": "her", "235": "ie", "236": "\u2581us", "237": "\u2581some", "238": "\u2581don", "239": "ven", "240": "ood", "241": "ast", "242": "\u2581R", "243": "\u2581his", "244": "\u2581tim", "245": "\u2581tr", "246": "\u2581more", "247": "ich", "248": "ous", "249": "ame", "250": 
"\u2581going", "251": "\u2581had", "252": "\u2581them", "253": "ook", "254": "\u2581pe", "255": "\u2581Wh", "256": "\u2581You", "257": "\u2581But", "258": "ine", "259": "\u2581here", "260": "\u2581would", "261": "cause", "262": "right", "263": "so", "264": "ost", "265": "ure", "266": "\u2581has", "267": "ect", "268": "\u2581think", "269": "\u2581fe", "270": "ong", "271": "\u2581see", "272": "\u2581when", "273": "\u2581who", "274": "\u2581were", "275": "\u2581really", "276": "\u2581their", "277": "\u2581want", "278": "one", "279": "ople", "280": "\u2581then", "281": "\u2581time", "282": "\u2581sa", "283": "ap", "284": "\u2581te", "285": "\u2581He", "286": "\u2581ye", "287": "ck", "288": "\u2581her", "289": "\u2581thing", "290": "\u2581right", "291": "\u2581which", "292": "itt", "293": "ice", "294": "act", "295": "\u2581people", "296": "ty", "297": "\u2581two", "298": "\u2581J", "299": "\u2581im", "300": "ther", "301": "ci", "302": "ose", "303": "\u2581cl", "304": "\u2581qu", "305": "\u2581man", "306": "\u2581also", "307": "ree", "308": "\u2581en", "309": "ud", "310": "\u2581how", "311": "reat", "312": "ak", "313": "hing", "314": "ag", "315": "\u2581any", "316": "ff", "317": "ace", "318": "per", "319": "\u2581because", "320": "\u2581very", "321": "own", "322": "\u2581ad", "323": "\u2581act", "324": "\u2581been", "325": "\u2581now", "326": "\u2581ag", "327": "\u2581into", "328": "\u2581comp", "329": "ars", "330": "ions", "331": "are", "332": "ite", "333": "iv", "334": "\u2581these", "335": "ays", "336": "ep", "337": "\u2581This", "338": "\u2581she", "339": "ans", "340": "ah", "341": "een", "342": "\u2581over", "343": "ry", "344": "\u2581lo", "345": "age", "346": "\u2581pr", "347": "\u2581sp", "348": "ue", "349": "\u2581co", "350": "ick", "351": "ber", "352": "\u2581did", "353": "ip", "354": "ach", "355": "\u2581back", "356": "\u2581no", "357": "\u2581cont", "358": "\u2581other", "359": "\u2581every", "360": "pt", "361": "\u2581need", "362": "\u2581him", "363": 
"\u2581U", "364": "\u2581In", "365": "\u2581work", "366": "irst", "367": "\u2581part", "368": "\u2581look", "369": "ittle", "370": "ble", "371": "iz", "372": "\u2581un", "373": "\u2581make", "374": "omet", "375": "nder", "376": "ish", "377": "na", "378": "\u2581little", "379": "\u2581off", "380": "\u2581than", "381": "\u2581got", "382": "ually", "383": "\u2581per", "384": "\u2581good", "385": "\u2581way", "386": "\u2581could", "387": "\u2581ac", "388": "\u2581imp", "389": "able", "390": "\u2581where", "391": "iff", "392": "\u2581That", "393": "\u2581res", "394": "ount", "395": "pl", "396": "ance", "397": "\u2581first", "398": "\u2581ro", "399": "\u2581pre", "400": "ass", "401": "\u2581say", "402": "int", "403": "ated", "404": "ire", "405": "uch", "406": "ase", "407": "\u2581somet", "408": "ound", "409": "\u2581down", "410": "\u2581diff", "411": "sel", "412": "\u2581gu", "413": "\u2581am", "414": "ress", "415": "\u2581lot", "416": "ence", "417": "\u2581dis", "418": "orm", "419": "ix", "420": "\u2581po", "421": "ving", "422": "enty", "423": "\u2581K", "424": "\u2581spe", "425": "und", "426": "he", "427": "\u2581much", "428": "\u2581ar", "429": "round", "430": "\u2581app", "431": "co", "432": "ark", "433": "\u2581new", "434": "ater", "435": "ult", "436": "end", "437": "\u2581even", "438": "\u2581start", "439": "ations", "440": "rough", "441": "ile", "442": "fter", "443": "\u2581well", "444": "be", "445": "\u2581They", "446": "\u2581three", "447": "ign", "448": "ild", "449": "\u2581said", "450": "ough", "451": "ang", "452": "\u2581too", "453": "ade", "454": "\u2581bl", "455": "ens", "456": "\u2581inc", "457": "ia", "458": "\u2581those", "459": "\u2581mo", "460": "\u2581take", "461": "\u2581through", "462": "\u2581fl", "463": "\u2581kind", "464": "\u2581things", "465": "\u2581bet", "466": "\u2581only", "467": "\u2581St", "468": "\u2581let", "469": "cess", "470": "\u2581Ch", "471": "ary", "472": "vel", "473": "\u2581If", "474": "xt", "475": "other", "476": "av", "477": 
"ical", "478": "ord", "479": "\u2581again", "480": "\u2581something", "481": "onna", "482": "fore", "483": "\u2581may", "484": "ting", "485": "\u2581bu", "486": "\u2581differe", "487": "urn", "488": "\u2581gonna", "489": "\u2581does", "490": "uct", "491": "og", "492": "\u2581twenty", "493": "\u2581gr", "494": "\u2581Ye", "495": "wn", "496": "\u2581should", "497": "\u2581comm", "498": "ition", "499": "\u2581under", "500": "\u2581hel", "501": "ory", "502": "\u2581fo", "503": "\u2581use", "504": "igh", "505": "ife", "506": "\u2581actually", "507": "\u2581tal", "508": "\u2581call", "509": "ents", "510": "ious", "511": "ull", "512": "\u2581There", "513": "\u2581Yeah", "514": "\u2581most", "515": "\u2581ke", "516": "ors", "517": "ved", "518": "ys", "519": "\u2581sc", "520": "\u2581happ", "521": "ope", "522": "\u2581help", "523": "atch", "524": "\u2581What", "525": "\u2581rem", "526": "ple", "527": "\u2581Now", "528": "\u2581br", "529": "ool", "530": "oth", "531": "\u2581four", "532": "self", "533": "\u2581str", "534": "ne", "535": "thing", "536": "\u2581put", "537": "ial", "538": "\u2581great", "539": "ail", "540": "ub", "541": "ning", "542": "\u2581sm", "543": "\u2581feel", "544": "\u2581five", "545": "ody", "546": "undred", "547": "iss", "548": "ank", "549": "get", "550": "aking", "551": "\u2581many", "552": "\u2581hundred", "553": "\u2581years", "554": "\u2581being", "555": "\u2581come", "556": "\u2581mean", "557": "ily", "558": "\u2581different", "559": "\u2581after", "560": "\u2581ser", "561": "\u2581show", "562": "form", "563": "ful", "564": "oy", "565": "\u2581six", "566": "\u2581vide", "567": "\u2581V", "568": "\u2581its", "569": "\u2581point", "570": "\u2581day", "571": "\u2581des", "572": "ons", "573": "\u2581bit", "574": "\u2581bel", "575": "\u2581before", "576": "\u2581aw", "577": "\u2581end", "578": "\u2581Oh", "579": "\u2581still", "580": "ath", "581": "\u2581long", "582": "\u2581'", "583": "ise", "584": "ob", "585": "day", "586": "\u2581add", "587": "ft", 
"588": "ves", "589": "ces", "590": "ady", "591": "\u2581cr", "592": "\u2581around", "593": "\u2581try", "594": "les", "595": "vers", "596": "kay", "597": "ian", "598": "ates", "599": "\u2581find", "600": "ward", "601": "\u2581As", "602": "\u2581eight", "603": "lic", "604": "\u2581same", "605": "\u2581pos", "606": "\u2581em", "607": "\u2581made", "608": "\u2581supp", "609": "\u2581life", "610": "\u2581Be", "611": "pect", "612": "\u2581dec", "613": "\u2581play", "614": "ange", "615": "\u2581att", "616": "\u2581pers", "617": "ways", "618": "\u2581high", "619": "\u2581hand", "620": "\u2581next", "621": "\u2581cons", "622": "\u2581own", "623": "\u2581inv", "624": "ower", "625": "\u2581ind", "626": "ert", "627": "ng", "628": "ave", "629": "\u2581year", "630": "\u2581big", "631": "ating", "632": "\u2581world", "633": "\u2581rel", "634": "\u2581sure", "635": "\u2581tra", "636": "ew", "637": "ered", "638": "\u2581fin", "639": "\u2581Well", "640": "\u2581sl", "641": "\u2581doing", "642": "bs", "643": "\u2581set", "644": "\u2581rec", "645": "ual", "646": "cial", "647": "\u2581ph", "648": "erm", "649": "\u2581love", "650": "ph", "651": "\u2581real", "652": "\u2581last", "653": "ict", "654": "\u2581bo", "655": "\u2581ra", "656": "ible", "657": "\u2581wr", "658": "mer", "659": "\u2581count", "660": "ities", "661": "\u2581always", "662": "inet", "663": "ments", "664": "uc", "665": "\u2581might", "666": "\u2581inter", "667": "\u2581video", "668": "gin", "669": "\u2581tell", "670": "\u2581never", "671": "vent", "672": "\u2581import", "673": "ied", "674": "\u2581sy", "675": "\u2581How", "676": "ically", "677": "ought", "678": "\u2581thir", "679": "\u2581rep", "680": "ks", "681": "ib", "682": "\u2581fam", "683": "ject", "684": "\u2581bas", "685": "\u2581She", "686": "\u2581give", "687": "akes", "688": "\u2581ninet", "689": "\u2581reg", "690": "\u2581min", "691": "\u2581op", "692": "\u2581def", "693": "\u2581didn", "694": "te", "695": "\u2581cour", "696": "\u2581why", "697": 
"\u2581ent", "698": "\u2581place", "699": "\u2581ins", "700": "\u2581car", "701": "ather", "702": "\u2581person", "703": "ular", "704": "\u2581inst", "705": "\u2581prod", "706": "lect", "707": "\u2581Al", "708": "\u2581today", "709": "\u2581bec", "710": "\u2581sur", "711": "\u2581All", "712": "\u2581another", "713": "\u2581bus", "714": "\u2581keep", "715": "ell", "716": "ese", "717": "riend", "718": "\u2581quest", "719": "\u2581talk", "720": "als", "721": "ings", "722": "\u2581mon", "723": "cond", "724": "old", "725": "\u2581acc", "726": "\u2581la", "727": "\u2581num", "728": "ident", "729": "\u2581che", "730": "iness", "731": "\u2581turn", "732": "\u2581ear", "733": "\u2581No", "734": "ousand", "735": "\u2581better", "736": "ific", "737": "\u2581loo", "738": "\u2581gl", "739": "oc", "740": "\u2581important", "741": "ited", "742": "\u2581An", "743": "\u2581thousand", "744": "ility", "745": "llow", "746": "\u2581used", "747": "\u2581gen", "748": "\u2581sim", "749": "li", "750": "\u2581happen", "751": "\u2581Un", "752": "\u2581Let", "753": "air", "754": "ock", "755": "ably", "756": "gg", "757": "\u2581watch", "758": "\u2581For", "759": "\u2581sw", "760": "ren", "761": "ute", "762": "ever", "763": "\u2581pol", "764": "\u2581sch", "765": "\u2581When", "766": "\u2581such", "767": "\u2581fif", "768": "\u2581home", "769": "\u2581cle", "770": "\u2581contin", "771": "ouse", "772": "\u2581friend", "773": "uring", "774": "\u2581Okay", "775": "gr", "776": "\u2581able", "777": "\u2581stud", "778": "\u2581eff", "779": "hip", "780": "body", "781": "\u2581top", "782": "ness", "783": "\u2581exper", "784": "\u2581pret", "785": "\u2581both", "786": "\u2581done", "787": "cri", "788": "\u2581mark", "789": "\u2581while", "790": "\u2581old", "791": "ros", "792": "ont", "793": "\u2581second", "794": "ative", "795": "\u2581thought", "796": "\u2581best", "797": "\u2581found", "798": "iew", "799": "\u2581belie", "800": "\u2581each", "801": "erest", "802": "\u2581tri", "803": "\u2581eas", 
"804": "\u2581ca", "805": "\u2581fact", "806": "\u2581care", "807": "\u2581fun", "808": "atter", "809": "ures", "810": "\u2581head", "811": "\u2581lear", "812": "\u2581water", "813": "\u2581hard", "814": "\u2581few", "815": "\u2581side", "816": "ween", "817": "\u2581exp", "818": "\u2581away", "819": "its", "820": "\u2581ext", "821": "lud", "822": "\u2581run", "823": "\u2581trans", "824": "ince", "825": "\u2581sk", "826": "\u2581open", "827": "cus", "828": "\u2581between", "829": "\u2581called", "830": "\u2581wee", "831": "\u2581pretty", "832": "ason", "833": "\u2581far", "834": "ember", "835": "omm", "836": "\u2581interest", "837": "any", "838": "ner", "839": "uff", "840": "\u2581pres", "841": "\u2581cur", "842": "\u2581child", "843": "ee", "844": "\u2581toget", "845": "\u2581together", "846": "olog", "847": "\u2581God", "848": "ond", "849": "\u2581char", "850": "\u2581looking", "851": "stem", "852": "az", "853": "cent", "854": "\u2581ob", "855": "\u2581ass", "856": "land", "857": "\u2581doesn", "858": "\u2581business", "859": "\u2581course", "860": "\u2581ten", "861": "ps", "862": "arch", "863": "ced", "864": "ms", "865": "ize", "866": "nce", "867": "\u2581ref", "868": "\u2581name", "869": "ross", "870": "\u2581grow", "871": "oney", "872": "\u2581went", "873": "ics", "874": "teen", "875": "\u2581cou", "876": "\u2581prob", "877": "\u2581ret", "878": "\u2581guys", "879": "\u2581came", "880": "ash", "881": "led", "882": "\u2581Eur", "883": "ues", "884": "\u2581ide", "885": "gan", "886": "\u2581everything", "887": "\u2581getting", "888": "\u2581ask", "889": "\u2581cor", "890": "\u2581build", "891": "\u2581sign", "892": "\u2581small", "893": "uck", "894": "\u2581el", "895": "\u2581col", "896": "\u2581Is", "897": "ational", "898": "stand", "899": "cy", "900": "\u2581conf", "901": "der", "902": "\u2581bre", "903": "\u2581cap", "904": "\u2581mod", "905": "ets", "906": "ike", "907": "\u2581number", "908": "\u2581comple", "909": "ertain", "910": "\u2581ever", "911": 
"\u2581coll", "912": "\u2581hum", "913": "\u2581Europe", "914": "\u2581cre", "915": "\u2581met", "916": "\u2581exam", "917": "\u2581move", "918": "\u2581pass", "919": "\u2581left", "920": "\u2581system", "921": "\u2581includ", "922": "\u2581Thank", "923": "cept", "924": "\u2581wom", "925": "\u2581product", "926": "ten", "927": "\u2581rest", "928": "\u2581probably", "929": "\u2581dri", "930": "\u2581Do", "931": "\u2581gener", "932": "\u2581anything", "933": "\u2581lar", "934": "\u2581My", "935": "\u2581school", "936": "\u2581lead", "937": "\u2581sub", "938": "\u2581ty", "939": "\u2581plan", "940": "\u2581seem", "941": "\u2581whole", "942": "irect", "943": "\u2581light", "944": "\u2581must", "945": "\u2581mom", "946": "\u2581opp", "947": "\u2581support", "948": "\u2581family", "949": "ices", "950": "amp", "951": "\u2581proble", "952": "\u2581dr", "953": "ready", "954": "\u2581using", "955": "ense", "956": "\u2581prov", "957": "ush", "958": "ax", "959": "\u2581power", "960": "\u2581Re", "961": "alth", "962": "\u2581ev", "963": "\u2581stand", "964": "\u2581war", "965": "ts", "966": "\u2581", "967": "e", "968": "t", "969": "o", "970": "a", "971": "n", "972": "i", "973": "s", "974": "r", "975": "h", "976": "l", "977": "d", "978": "u", "979": "c", "980": "m", "981": "y", "982": "g", "983": "w", "984": "f", "985": "p", "986": ".", "987": "b", "988": ",", "989": "v", "990": "k", "991": "'", "992": "I", "993": "T", "994": "A", "995": "S", "996": "x", "997": "W", "998": "j", "999": "B", "1000": "C", "1001": "H", "1002": "?", "1003": "M", "1004": "O", "1005": "Y", "1006": "N", "1007": "P", "1008": "E", "1009": "q", "1010": "L", "1011": "D", "1012": "z", "1013": "G", "1014": "F", "1015": "R", "1016": "!", "1017": "J", "1018": "U", "1019": "K", "1020": "V", "1021": "Q", "1022": "Z", "1023": "X"}
|