klemenk committed
Commit 2965b7e · verified · 1 Parent(s): f72bc8c

Upload distilled speech model

README.md CHANGED
@@ -30,19 +30,21 @@ A Data2Vec-style bidirectional speech encoder trained via distillation from Auri
 ## Usage
 
 ```python
-from transformers import AutoModel, AutoFeatureExtractor
+from transformers import AutoModel, Wav2Vec2FeatureExtractor
 import torch
 
 # Load model and feature extractor
 model = AutoModel.from_pretrained("TuKoResearch/AuriStreamDistill_100M40PredTeacher_librispeech960", trust_remote_code=True)
-feature_extractor = AutoFeatureExtractor.from_pretrained("TuKoResearch/AuriStreamDistill_100M40PredTeacher_librispeech960", trust_remote_code=True)
+model.eval()  # Important for inference!
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("TuKoResearch/AuriStreamDistill_100M40PredTeacher_librispeech960")
 
 # Prepare audio (16kHz, mono)
-audio = torch.randn(16000)  # 1 second of audio
+audio = torch.randn(16000).numpy()  # 1 second of audio
 
 # Extract features
-inputs = feature_extractor(audio, return_tensors="pt", sample_rate=16000)
-outputs = model(inputs.input_values, output_hidden_states=True)
+inputs = feature_extractor(audio, return_tensors="pt", sampling_rate=16000)
+with torch.no_grad():
+    outputs = model(inputs.input_values, output_hidden_states=True)
 
 # Get representations
 last_hidden = outputs.last_hidden_state  # (1, 50, 768) for 1 second
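The updated README example extends naturally to layer-wise probing. A minimal sketch, assuming the checkpoint and extractor load as shown above; the layer index and mean-pooling are illustrative choices, not part of the repo:

```python
import torch
from transformers import AutoModel, Wav2Vec2FeatureExtractor

repo = "TuKoResearch/AuriStreamDistill_100M40PredTeacher_librispeech960"
model = AutoModel.from_pretrained(repo, trust_remote_code=True).eval()
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(repo)

audio = torch.randn(32000).numpy()  # 2 seconds of 16 kHz audio
inputs = feature_extractor(audio, return_tensors="pt", sampling_rate=16000)

with torch.no_grad():
    outputs = model(inputs.input_values, output_hidden_states=True)

# 13 hidden states: [0] is the feature projection, [1..12] the transformer layers
for i, h in enumerate(outputs.hidden_states):
    print(f"hidden_states[{i}]: {tuple(h.shape)}")  # (1, 100, 768) at 50 Hz

# Illustrative: mean-pool one mid-depth layer into a clip-level feature for probing
clip_feature = outputs.hidden_states[8].mean(dim=1)  # (1, 768)
```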
__pycache__/configuration_distilled_speech.cpython-311.pyc CHANGED
Binary files a/__pycache__/configuration_distilled_speech.cpython-311.pyc and b/__pycache__/configuration_distilled_speech.cpython-311.pyc differ
 
__pycache__/modeling_distilled_speech.cpython-311.pyc CHANGED
Binary files a/__pycache__/modeling_distilled_speech.cpython-311.pyc and b/__pycache__/modeling_distilled_speech.cpython-311.pyc differ
 
config.json CHANGED
@@ -46,8 +46,7 @@
   "model_type": "distilled_speech",
   "auto_map": {
     "AutoConfig": "configuration_distilled_speech.DistilledSpeechConfig",
-    "AutoModel": "modeling_distilled_speech.DistilledSpeechModel",
-    "AutoFeatureExtractor": "feature_extraction_distilled_speech.DistilledSpeechFeatureExtractor"
+    "AutoModel": "modeling_distilled_speech.DistilledSpeechModel"
   },
   "architectures": [
     "DistilledSpeechModel"
modeling_distilled_speech.py CHANGED
@@ -407,6 +407,9 @@ class DistilledSpeechModel(PreTrainedModel):
     representations at 50Hz (20ms stride). It returns hidden states from all
     transformer layers, making it suitable for downstream probing and finetuning.
 
+    IMPORTANT: Call model.eval() before inference to disable dropout and ensure
+    correct behavior of normalization layers.
+
     Hidden states structure (for 12-layer model, output_hidden_states=True):
     - hidden_states[0]: Feature projection output (input to transformer)
     - hidden_states[1]: Output of transformer layer 0
@@ -416,12 +419,14 @@ class DistilledSpeechModel(PreTrainedModel):
     Total: 13 hidden states (1 embedding + 12 layers)
 
     Example usage:
-        >>> from transformers import AutoModel, AutoFeatureExtractor
+        >>> from transformers import AutoModel, Wav2Vec2FeatureExtractor
         >>> model = AutoModel.from_pretrained("your-model-name", trust_remote_code=True)
-        >>> processor = AutoFeatureExtractor.from_pretrained("your-model-name", trust_remote_code=True)
-        >>> audio = torch.randn(16000)  # 1 second of audio at 16kHz
-        >>> inputs = processor(audio, return_tensors="pt", sample_rate=16000)
-        >>> outputs = model(inputs.input_values, output_hidden_states=True)
+        >>> model.eval()  # Important for inference!
+        >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("your-model-name")
+        >>> audio = torch.randn(16000).numpy()  # 1 second of audio at 16kHz
+        >>> inputs = feature_extractor(audio, return_tensors="pt", sampling_rate=16000)
+        >>> with torch.no_grad():
+        ...     outputs = model(inputs.input_values, output_hidden_states=True)
         >>> last_hidden = outputs.last_hidden_state  # (1, 50, 768)
         >>> all_hidden = outputs.hidden_states  # Tuple of 13 tensors
         >>> # Or use dict-style access:
@@ -495,6 +500,14 @@ class DistilledSpeechModel(PreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        # Ensure input is float (audio samples should be float, not integer tokens)
+        if input_values.dtype in (torch.long, torch.int, torch.int32, torch.int64):
+            input_values = input_values.float()
+
+        # Ensure 2D input (batch_size, sequence_length)
+        if input_values.dim() == 1:
+            input_values = input_values.unsqueeze(0)
+
         # Conv encoder: (B, T) -> (B, T', conv_dim)
         extract_features = self.conv_encoder(input_values)
 
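The new coercion block means callers no longer have to batch or cast the waveform themselves. A standalone sketch of the added logic; the `coerce_input` helper is hypothetical, written here only to mirror the lines added to `forward()`:

```python
import torch

def coerce_input(input_values: torch.Tensor) -> torch.Tensor:
    # Integer waveforms (e.g. int16 PCM loaded as an integer tensor) become float
    if input_values.dtype in (torch.long, torch.int, torch.int32, torch.int64):
        input_values = input_values.float()
    # A single unbatched waveform gains a batch dimension
    if input_values.dim() == 1:
        input_values = input_values.unsqueeze(0)
    return input_values

assert coerce_input(torch.randint(-32768, 32767, (16000,))).shape == (1, 16000)
assert coerce_input(torch.randint(-32768, 32767, (16000,))).dtype == torch.float32
assert coerce_input(torch.randn(2, 16000)).shape == (2, 16000)  # already batched
```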
 
preprocessor_config.json CHANGED
@@ -1,6 +1,7 @@
 {
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "sampling_rate": 16000,
   "do_normalize": true,
-  "return_attention_mask": false,
-  "feature_extractor_type": "DistilledSpeechFeatureExtractor"
+  "padding_value": 0.0,
+  "return_attention_mask": false
 }
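The added `padding_value` takes effect when batching variable-length clips, and `return_attention_mask: false` means no mask is produced by default. A small demonstration, assuming the config above:

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained(
    "TuKoResearch/AuriStreamDistill_100M40PredTeacher_librispeech960"
)

# Two clips of different lengths; the shorter is padded with padding_value (0.0)
batch = [np.random.randn(16000).astype(np.float32),
         np.random.randn(8000).astype(np.float32)]
inputs = fe(batch, return_tensors="pt", sampling_rate=16000, padding=True)

print(inputs.input_values.shape)   # torch.Size([2, 16000])
print("attention_mask" in inputs)  # False, per return_attention_mask: false
```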