Instructions to use mazesmazes/tiny-audio-next with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use mazesmazes/tiny-audio-next with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="mazesmazes/tiny-audio-next", trust_remote_code=True)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("mazesmazes/tiny-audio-next", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
Training in progress - step 1000
Browse files
- asr_config.py +0 -8
- config.json +0 -1
- model.safetensors +1 -1
- projectors.py +0 -7
asr_config.py
CHANGED
|
@@ -50,13 +50,6 @@ class ASRConfig(transformers.PretrainedConfig):
|
|
| 50 |
projector_pool_stride: int = 4,
|
| 51 |
downsample_rate: int = 5, # Granite default
|
| 52 |
projector_hidden_dim: Optional[int] = None,
|
| 53 |
-
# Projector dropout — applied between activation and the second
|
| 54 |
-
# linear in MLPAudioProjector. Matches Granite-Speech 4.1's
|
| 55 |
-
# Q-Former dropout (hidden_dropout_prob=0.1) used in its frozen-
|
| 56 |
-
# encoder + LoRA-LLM training stage. Default 0.0 for backward
|
| 57 |
-
# compatibility with existing checkpoints; experiment configs
|
| 58 |
-
# opt in to 0.1.
|
| 59 |
-
projector_dropout: float = 0.0,
|
| 60 |
projector_type: str = "mlp", # "mlp", "mosa", "moe", "qformer"
|
| 61 |
# MoE-specific configuration
|
| 62 |
num_experts: int = 4, # Number of experts in MoE projectors
|
|
@@ -123,7 +116,6 @@ class ASRConfig(transformers.PretrainedConfig):
|
|
| 123 |
self.projector_pool_stride = projector_pool_stride
|
| 124 |
self.downsample_rate = downsample_rate
|
| 125 |
self.projector_hidden_dim = projector_hidden_dim
|
| 126 |
-
self.projector_dropout = projector_dropout
|
| 127 |
self.projector_type = projector_type
|
| 128 |
# MoE-specific configuration
|
| 129 |
self.num_experts = num_experts
|
|
|
|
| 50 |
projector_pool_stride: int = 4,
|
| 51 |
downsample_rate: int = 5, # Granite default
|
| 52 |
projector_hidden_dim: Optional[int] = None,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
projector_type: str = "mlp", # "mlp", "mosa", "moe", "qformer"
|
| 54 |
# MoE-specific configuration
|
| 55 |
num_experts: int = 4, # Number of experts in MoE projectors
|
|
|
|
| 116 |
self.projector_pool_stride = projector_pool_stride
|
| 117 |
self.downsample_rate = downsample_rate
|
| 118 |
self.projector_hidden_dim = projector_hidden_dim
|
|
|
|
| 119 |
self.projector_type = projector_type
|
| 120 |
# MoE-specific configuration
|
| 121 |
self.num_experts = num_experts
|
config.json
CHANGED
|
@@ -262,7 +262,6 @@
|
|
| 262 |
"pad_token_id": 151643,
|
| 263 |
"pipeline_tag": "automatic-speech-recognition",
|
| 264 |
"pretrained_model_path": "mazesmazes/tiny-audio-next",
|
| 265 |
-
"projector_dropout": 0.1,
|
| 266 |
"projector_hidden_dim": 2048,
|
| 267 |
"projector_pool_stride": 4,
|
| 268 |
"projector_type": "mlp",
|
|
|
|
| 262 |
"pad_token_id": 151643,
|
| 263 |
"pipeline_tag": "automatic-speech-recognition",
|
| 264 |
"pretrained_model_path": "mazesmazes/tiny-audio-next",
|
|
|
|
| 265 |
"projector_hidden_dim": 2048,
|
| 266 |
"projector_pool_stride": 4,
|
| 267 |
"projector_type": "mlp",
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2433494416
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa0caab16d99a3c3d7ef0289332b36eee96b9a3602c1e02f8d2bf2c9f38e7b21
|
| 3 |
size 2433494416
|
projectors.py
CHANGED
|
@@ -55,12 +55,6 @@ class MLPAudioProjector(nn.Module):
|
|
| 55 |
self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
|
| 56 |
self.norm.weight.data.fill_(self._NORM_INIT)
|
| 57 |
self.act = nn.GELU()
|
| 58 |
-
# Dropout matches Granite-Speech 4.1's Q-Former hidden_dropout_prob=0.1
|
| 59 |
-
# in its frozen-encoder modality-alignment stage — the closest
|
| 60 |
-
# published precedent for our regime. Default 0.0 in config means
|
| 61 |
-
# nn.Dropout(0.0) is a no-op for existing experiments.
|
| 62 |
-
projector_dropout = float(getattr(config, "projector_dropout", 0.0))
|
| 63 |
-
self.dropout = nn.Dropout(projector_dropout)
|
| 64 |
self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
|
| 65 |
# Output norm aligns the projector's RMS with the LM's embed_tokens
|
| 66 |
# distribution. See _NORM_INIT comment above for the magnitude
|
|
@@ -86,7 +80,6 @@ class MLPAudioProjector(nn.Module):
|
|
| 86 |
x = self.linear_1(x)
|
| 87 |
x = self.norm(x)
|
| 88 |
x = self.act(x)
|
| 89 |
-
x = self.dropout(x)
|
| 90 |
x = self.linear_2(x)
|
| 91 |
return self.norm_2(x)
|
| 92 |
|
|
|
|
| 55 |
self.norm = LlamaRMSNorm(hidden_dim, eps=1e-6)
|
| 56 |
self.norm.weight.data.fill_(self._NORM_INIT)
|
| 57 |
self.act = nn.GELU()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
self.linear_2 = nn.Linear(hidden_dim, llm_dim, bias=False)
|
| 59 |
# Output norm aligns the projector's RMS with the LM's embed_tokens
|
| 60 |
# distribution. See _NORM_INIT comment above for the magnitude
|
|
|
|
| 80 |
x = self.linear_1(x)
|
| 81 |
x = self.norm(x)
|
| 82 |
x = self.act(x)
|
|
|
|
| 83 |
x = self.linear_2(x)
|
| 84 |
return self.norm_2(x)
|
| 85 |
|