Update README.md
Browse files
README.md
CHANGED
|
@@ -166,7 +166,7 @@ The code for XEUS is still in progress of being merged into the main ESPnet repo
|
|
| 166 |
pip install -e git+https://github.com/wanchichen/espnet.git@ssl#egg=espnet
|
| 167 |
```
|
| 168 |
|
| 169 |
-
XEUS supports [Flash Attention](https://github.com/Dao-AILab/flash-attention), which can be installed as follows:
|
| 170 |
|
| 171 |
```
|
| 172 |
pip install flash-attn --no-build-isolation
|
|
@@ -174,6 +174,9 @@ pip install flash-attn --no-build-isolation
|
|
| 174 |
|
| 175 |
## Usage
|
| 176 |
|
|
|
|
|
|
|
|
|
|
| 177 |
```python
|
| 178 |
from torch.nn.utils.rnn import pad_sequence
|
| 179 |
from espnet2.tasks.ssl import SSLTask
|
|
@@ -187,6 +190,10 @@ xeus_model, xeus_train_args = SSLTask.build_model_from_file(
|
|
| 187 |
device,
|
| 188 |
)
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
wavs, sampling_rate = sf.read('/path/to/audio.wav') # sampling rate should be 16000
|
| 191 |
wav_lengths = torch.LongTensor([len(wav) for wav in [wavs]]).to(device)
|
| 192 |
wavs = pad_sequence([torch.from_numpy(wavs).float()], batch_first=True).to(device)
|
|
@@ -195,6 +202,25 @@ wavs = pad_sequence([wavs], batch_first=True).to(device)
|
|
| 195 |
feats = xeus_model.encode(wavs, wav_lengths, use_mask=False, use_final_output=False)[0][-1] # take the output of the last layer
|
| 196 |
```
|
| 197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
## Results
|
| 199 |
|
| 200 |

|
|
|
|
| 166 |
pip install -e git+https://github.com/wanchichen/espnet.git@ssl#egg=espnet
|
| 167 |
```
|
| 168 |
|
| 169 |
+
XEUS supports [Flash Attention](https://github.com/Dao-AILab/flash-attention), which can be installed as follows:
|
| 170 |
|
| 171 |
```
|
| 172 |
pip install flash-attn --no-build-isolation
|
|
|
|
| 174 |
|
| 175 |
## Usage
|
| 176 |
|
| 177 |
+
|
| 178 |
+
Default Usage:
|
| 179 |
+
|
| 180 |
```python
|
| 181 |
from torch.nn.utils.rnn import pad_sequence
|
| 182 |
from espnet2.tasks.ssl import SSLTask
|
|
|
|
| 190 |
device,
|
| 191 |
)
|
| 192 |
|
| 193 |
+
# Flash Attention is optional; disable it per encoder layer if it is not installed
|
| 194 |
+
for layer in xeus_model.encoder.encoders:
|
| 195 |
+
    layer.use_flash_attn = False
|
| 196 |
+
|
| 197 |
wavs, sampling_rate = sf.read('/path/to/audio.wav') # sampling rate should be 16000
|
| 198 |
wav_lengths = torch.LongTensor([len(wav) for wav in [wavs]]).to(device)
|
| 199 |
wavs = pad_sequence([torch.from_numpy(wavs).float()], batch_first=True).to(device)
|
|
|
|
| 202 |
feats = xeus_model.encode(wavs, wav_lengths, use_mask=False, use_final_output=False)[0][-1] # take the output of the last layer
|
| 203 |
```
|
| 204 |
|
| 205 |
+
With Flash Attention:
|
| 206 |
+
|
| 207 |
+
```python
|
| 208 |
+
for layer in xeus_model.encoder.encoders:
    layer.use_flash_attn = True
|
| 209 |
+
|
| 210 |
+
with torch.autocast("cuda", dtype=torch.bfloat16):
|
| 211 |
+
feats = xeus_model.encode(wavs, wav_lengths, use_mask=False, use_final_output=False)[0][-1]
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
Tune the masking settings:
|
| 215 |
+
|
| 216 |
+
```python
|
| 217 |
+
|
| 218 |
+
xeus_model.masker.mask_prob = 0.65 # default 0.8
|
| 219 |
+
xeus_model.masker.mask_length = 20 # default 10
|
| 220 |
+
xeus_model.masker.mask_selection = 'static' # default uniform
|
| 221 |
+
xeus_model.train()
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
## Results
|
| 225 |
|
| 226 |

|