Upload distilled speech model
Browse files
README.md
CHANGED
|
@@ -30,19 +30,21 @@ A Data2Vec-style bidirectional speech encoder trained via distillation from Auri
|
|
| 30 |
## Usage
|
| 31 |
|
| 32 |
```python
|
| 33 |
-
from transformers import AutoModel,
|
| 34 |
import torch
|
| 35 |
|
| 36 |
# Load model and feature extractor
|
| 37 |
model = AutoModel.from_pretrained("TuKoResearch/AuriStreamDistill_100M40PredTeacher_librispeech960", trust_remote_code=True)
|
| 38 |
-
|
|
|
|
| 39 |
|
| 40 |
# Prepare audio (16kHz, mono)
|
| 41 |
-
audio = torch.randn(16000) # 1 second of audio
|
| 42 |
|
| 43 |
# Extract features
|
| 44 |
-
inputs = feature_extractor(audio, return_tensors="pt",
|
| 45 |
-
|
|
|
|
| 46 |
|
| 47 |
# Get representations
|
| 48 |
last_hidden = outputs.last_hidden_state # (1, 50, 768) for 1 second
|
|
|
|
| 30 |
## Usage
|
| 31 |
|
| 32 |
```python
|
| 33 |
+
from transformers import AutoModel, Wav2Vec2FeatureExtractor
|
| 34 |
import torch
|
| 35 |
|
| 36 |
# Load model and feature extractor
|
| 37 |
model = AutoModel.from_pretrained("TuKoResearch/AuriStreamDistill_100M40PredTeacher_librispeech960", trust_remote_code=True)
|
| 38 |
+
model.eval() # Important for inference!
|
| 39 |
+
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("TuKoResearch/AuriStreamDistill_100M40PredTeacher_librispeech960")
|
| 40 |
|
| 41 |
# Prepare audio (16kHz, mono)
|
| 42 |
+
audio = torch.randn(16000).numpy() # 1 second of audio
|
| 43 |
|
| 44 |
# Extract features
|
| 45 |
+
inputs = feature_extractor(audio, return_tensors="pt", sampling_rate=16000)
|
| 46 |
+
with torch.no_grad():
|
| 47 |
+
    outputs = model(inputs.input_values, output_hidden_states=True)
|
| 48 |
|
| 49 |
# Get representations
|
| 50 |
last_hidden = outputs.last_hidden_state # (1, 50, 768) for 1 second
|
__pycache__/configuration_distilled_speech.cpython-311.pyc
CHANGED
|
Binary files a/__pycache__/configuration_distilled_speech.cpython-311.pyc and b/__pycache__/configuration_distilled_speech.cpython-311.pyc differ
|
|
|
__pycache__/modeling_distilled_speech.cpython-311.pyc
CHANGED
|
Binary files a/__pycache__/modeling_distilled_speech.cpython-311.pyc and b/__pycache__/modeling_distilled_speech.cpython-311.pyc differ
|
|
|
config.json
CHANGED
|
@@ -46,8 +46,7 @@
|
|
| 46 |
"model_type": "distilled_speech",
|
| 47 |
"auto_map": {
|
| 48 |
"AutoConfig": "configuration_distilled_speech.DistilledSpeechConfig",
|
| 49 |
-
"AutoModel": "modeling_distilled_speech.DistilledSpeechModel"
|
| 50 |
-
"AutoFeatureExtractor": "feature_extraction_distilled_speech.DistilledSpeechFeatureExtractor"
|
| 51 |
},
|
| 52 |
"architectures": [
|
| 53 |
"DistilledSpeechModel"
|
|
|
|
| 46 |
"model_type": "distilled_speech",
|
| 47 |
"auto_map": {
|
| 48 |
"AutoConfig": "configuration_distilled_speech.DistilledSpeechConfig",
|
| 49 |
+
"AutoModel": "modeling_distilled_speech.DistilledSpeechModel"
|
|
|
|
| 50 |
},
|
| 51 |
"architectures": [
|
| 52 |
"DistilledSpeechModel"
|
modeling_distilled_speech.py
CHANGED
|
@@ -407,6 +407,9 @@ class DistilledSpeechModel(PreTrainedModel):
|
|
| 407 |
representations at 50Hz (20ms stride). It returns hidden states from all
|
| 408 |
transformer layers, making it suitable for downstream probing and finetuning.
|
| 409 |
|
|
|
|
|
|
|
|
|
|
| 410 |
Hidden states structure (for 12-layer model, output_hidden_states=True):
|
| 411 |
- hidden_states[0]: Feature projection output (input to transformer)
|
| 412 |
- hidden_states[1]: Output of transformer layer 0
|
|
@@ -416,12 +419,14 @@ class DistilledSpeechModel(PreTrainedModel):
|
|
| 416 |
Total: 13 hidden states (1 embedding + 12 layers)
|
| 417 |
|
| 418 |
Example usage:
|
| 419 |
-
>>> from transformers import AutoModel,
|
| 420 |
>>> model = AutoModel.from_pretrained("your-model-name", trust_remote_code=True)
|
| 421 |
-
>>>
|
| 422 |
-
>>>
|
| 423 |
-
>>>
|
| 424 |
-
>>>
|
|
|
|
|
|
|
| 425 |
>>> last_hidden = outputs.last_hidden_state # (1, 50, 768)
|
| 426 |
>>> all_hidden = outputs.hidden_states # Tuple of 13 tensors
|
| 427 |
>>> # Or use dict-style access:
|
|
@@ -495,6 +500,14 @@ class DistilledSpeechModel(PreTrainedModel):
|
|
| 495 |
)
|
| 496 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 497 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
# Conv encoder: (B, T) -> (B, T', conv_dim)
|
| 499 |
extract_features = self.conv_encoder(input_values)
|
| 500 |
|
|
|
|
| 407 |
representations at 50Hz (20ms stride). It returns hidden states from all
|
| 408 |
transformer layers, making it suitable for downstream probing and finetuning.
|
| 409 |
|
| 410 |
+
IMPORTANT: Call model.eval() before inference to disable dropout and ensure
|
| 411 |
+
correct behavior of normalization layers.
|
| 412 |
+
|
| 413 |
Hidden states structure (for 12-layer model, output_hidden_states=True):
|
| 414 |
- hidden_states[0]: Feature projection output (input to transformer)
|
| 415 |
- hidden_states[1]: Output of transformer layer 0
|
|
|
|
| 419 |
Total: 13 hidden states (1 embedding + 12 layers)
|
| 420 |
|
| 421 |
Example usage:
|
| 422 |
+
>>> from transformers import AutoModel, Wav2Vec2FeatureExtractor
|
| 423 |
>>> model = AutoModel.from_pretrained("your-model-name", trust_remote_code=True)
|
| 424 |
+
>>> model.eval() # Important for inference!
|
| 425 |
+
>>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("your-model-name")
|
| 426 |
+
>>> audio = torch.randn(16000).numpy() # 1 second of audio at 16kHz
|
| 427 |
+
>>> inputs = feature_extractor(audio, return_tensors="pt", sampling_rate=16000)
|
| 428 |
+
>>> with torch.no_grad():
|
| 429 |
+
...     outputs = model(inputs.input_values, output_hidden_states=True)
|
| 430 |
>>> last_hidden = outputs.last_hidden_state # (1, 50, 768)
|
| 431 |
>>> all_hidden = outputs.hidden_states # Tuple of 13 tensors
|
| 432 |
>>> # Or use dict-style access:
|
|
|
|
| 500 |
)
|
| 501 |
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 502 |
|
| 503 |
+
# Ensure input is float (audio samples should be float, not integer tokens)
|
| 504 |
+
if input_values.dtype in (torch.long, torch.int, torch.int32, torch.int64):
|
| 505 |
+
    input_values = input_values.float()
|
| 506 |
+
|
| 507 |
+
# Ensure 2D input (batch_size, sequence_length)
|
| 508 |
+
if input_values.dim() == 1:
|
| 509 |
+
    input_values = input_values.unsqueeze(0)
|
| 510 |
+
|
| 511 |
# Conv encoder: (B, T) -> (B, T', conv_dim)
|
| 512 |
extract_features = self.conv_encoder(input_values)
|
| 513 |
|
preprocessor_config.json
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
{
|
|
|
|
| 2 |
"sampling_rate": 16000,
|
| 3 |
"do_normalize": true,
|
| 4 |
-
"
|
| 5 |
-
"
|
| 6 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
| 3 |
"sampling_rate": 16000,
|
| 4 |
"do_normalize": true,
|
| 5 |
+
"padding_value": 0.0,
|
| 6 |
+
"return_attention_mask": false
|
| 7 |
}
|