Upload 4 files
Browse files- README.md +9 -6
- inference.py +1 -1
README.md
CHANGED
|
@@ -56,8 +56,8 @@ pip install -e ".[audio]"
|
|
| 56 |
```python
|
| 57 |
from scripts.inference import ParakeetCoreML
|
| 58 |
|
| 59 |
-
# Load model
|
| 60 |
-
model = ParakeetCoreML(".
|
| 61 |
|
| 62 |
# Transcribe with TDT (higher quality)
|
| 63 |
text = model.transcribe("audio.wav", mode="tdt")
|
|
@@ -72,10 +72,10 @@ print(text)
|
|
| 72 |
|
| 73 |
```bash
|
| 74 |
# TDT decoding (default, higher quality)
|
| 75 |
-
uv run scripts/inference.py --audio audio.wav
|
| 76 |
|
| 77 |
# CTC decoding (faster, good for keyword spotting)
|
| 78 |
-
uv run scripts/inference.py --audio audio.wav --
|
| 79 |
```
|
| 80 |
|
| 81 |
## Model Conversion
|
|
@@ -98,14 +98,17 @@ This will:
|
|
| 98 |
## File Structure
|
| 99 |
|
| 100 |
```
|
| 101 |
-
|
| 102 |
├── Preprocessor.mlpackage # Audio → Mel spectrogram
|
| 103 |
├── Encoder.mlpackage # Mel → Encoder features
|
| 104 |
├── CTCHead.mlpackage # Encoder → CTC log probs
|
| 105 |
├── Decoder.mlpackage # TDT prediction network
|
| 106 |
├── JointDecision.mlpackage # TDT joint network
|
| 107 |
├── vocab.json # Token vocabulary (1024 tokens)
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
| 109 |
```
|
| 110 |
|
| 111 |
## Decoding Modes
|
|
|
|
| 56 |
```python
|
| 57 |
from scripts.inference import ParakeetCoreML
|
| 58 |
|
| 59 |
+
# Load model (from current directory with .mlpackage files)
|
| 60 |
+
model = ParakeetCoreML(".")
|
| 61 |
|
| 62 |
# Transcribe with TDT (higher quality)
|
| 63 |
text = model.transcribe("audio.wav", mode="tdt")
|
|
|
|
| 72 |
|
| 73 |
```bash
|
| 74 |
# TDT decoding (default, higher quality)
|
| 75 |
+
uv run scripts/inference.py --audio audio.wav
|
| 76 |
|
| 77 |
# CTC decoding (faster, good for keyword spotting)
|
| 78 |
+
uv run scripts/inference.py --audio audio.wav --mode ctc
|
| 79 |
```
|
| 80 |
|
| 81 |
## Model Conversion
|
|
|
|
| 98 |
## File Structure
|
| 99 |
|
| 100 |
```
|
| 101 |
+
./
|
| 102 |
├── Preprocessor.mlpackage # Audio → Mel spectrogram
|
| 103 |
├── Encoder.mlpackage # Mel → Encoder features
|
| 104 |
├── CTCHead.mlpackage # Encoder → CTC log probs
|
| 105 |
├── Decoder.mlpackage # TDT prediction network
|
| 106 |
├── JointDecision.mlpackage # TDT joint network
|
| 107 |
├── vocab.json # Token vocabulary (1024 tokens)
|
| 108 |
+
├── metadata.json # Model configuration
|
| 109 |
+
├── pyproject.toml # Python dependencies
|
| 110 |
+
├── uv.lock # Locked dependencies
|
| 111 |
+
└── scripts/ # Inference & conversion scripts
|
| 112 |
```
|
| 113 |
|
| 114 |
## Decoding Modes
|
inference.py
CHANGED
|
@@ -279,7 +279,7 @@ def main():
|
|
| 279 |
help="Path to audio file (WAV, MP3, etc.)"
|
| 280 |
)
|
| 281 |
parser.add_argument(
|
| 282 |
-
"--model-dir", type=str, default=".
|
| 283 |
help="Directory containing CoreML model files"
|
| 284 |
)
|
| 285 |
parser.add_argument(
|
|
|
|
| 279 |
help="Path to audio file (WAV, MP3, etc.)"
|
| 280 |
)
|
| 281 |
parser.add_argument(
|
| 282 |
+
"--model-dir", type=str, default=".",
|
| 283 |
help="Directory containing CoreML model files"
|
| 284 |
)
|
| 285 |
parser.add_argument(
|