mazesmazes commited on
Commit
f9b8048
·
verified ·
1 Parent(s): 70f79b2

Update custom model files, README, and requirements

Browse files
Files changed (5) hide show
  1. .gitattributes +2 -35
  2. README.md +230 -162
  3. asr_modeling.py +15 -2
  4. handler.py +71 -0
  5. requirements.txt +5 -0
.gitattributes CHANGED
@@ -1,36 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
3
+ tokenizer_config.json -filter -diff -merge text
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,199 +1,267 @@
1
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  library_name: transformers
3
- tags: []
4
  ---
5
 
6
- # Model Card for Model ID
7
 
8
- <!-- Provide a quick summary of what the model is/does. -->
9
 
 
10
 
 
 
11
 
12
- ## Model Details
 
 
 
13
 
14
- ### Model Description
15
 
16
- <!-- Provide a longer summary of what this model is. -->
17
 
18
- This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
 
19
 
20
- - **Developed by:** [More Information Needed]
21
- - **Funded by [optional]:** [More Information Needed]
22
- - **Shared by [optional]:** [More Information Needed]
23
- - **Model type:** [More Information Needed]
24
- - **Language(s) (NLP):** [More Information Needed]
25
- - **License:** [More Information Needed]
26
- - **Finetuned from model [optional]:** [More Information Needed]
27
 
28
- ### Model Sources [optional]
 
 
29
 
30
- <!-- Provide the basic links for the model. -->
 
31
 
32
- - **Repository:** [More Information Needed]
33
- - **Paper [optional]:** [More Information Needed]
34
- - **Demo [optional]:** [More Information Needed]
 
 
35
 
36
- ## Uses
37
 
38
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
 
 
 
 
 
 
39
 
40
- ### Direct Use
41
 
42
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
 
 
 
 
 
 
 
 
 
 
43
 
44
- [More Information Needed]
45
 
46
- ### Downstream Use [optional]
 
 
47
 
48
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
 
49
 
50
- [More Information Needed]
 
 
 
51
 
52
- ### Out-of-Scope Use
 
 
 
53
 
54
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
 
56
- [More Information Needed]
 
 
 
57
 
58
- ## Bias, Risks, and Limitations
 
 
59
 
60
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
 
61
 
62
- [More Information Needed]
 
63
 
64
- ### Recommendations
 
 
 
 
 
 
65
 
66
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
 
 
 
67
 
68
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
 
 
 
69
 
70
- ## How to Get Started with the Model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
- Use the code below to get started with the model.
73
-
74
- [More Information Needed]
 
 
 
 
 
75
 
76
  ## Training Details
77
 
78
- ### Training Data
79
-
80
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
-
82
- [More Information Needed]
83
-
84
- ### Training Procedure
85
-
86
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
-
88
- #### Preprocessing [optional]
89
-
90
- [More Information Needed]
91
-
92
-
93
- #### Training Hyperparameters
94
-
95
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
-
97
- #### Speeds, Sizes, Times [optional]
98
-
99
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
-
101
- [More Information Needed]
102
-
103
- ## Evaluation
104
-
105
- <!-- This section describes the evaluation protocols and provides the results. -->
106
-
107
- ### Testing Data, Factors & Metrics
108
-
109
- #### Testing Data
110
-
111
- <!-- This should link to a Dataset Card if possible. -->
112
-
113
- [More Information Needed]
114
-
115
- #### Factors
116
-
117
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
-
119
- [More Information Needed]
120
-
121
- #### Metrics
122
-
123
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
-
125
- [More Information Needed]
126
-
127
- ### Results
128
-
129
- [More Information Needed]
130
-
131
- #### Summary
132
-
133
-
134
-
135
- ## Model Examination [optional]
136
-
137
- <!-- Relevant interpretability work for the model goes here -->
138
-
139
- [More Information Needed]
140
-
141
- ## Environmental Impact
142
-
143
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
-
145
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
-
147
- - **Hardware Type:** [More Information Needed]
148
- - **Hours used:** [More Information Needed]
149
- - **Cloud Provider:** [More Information Needed]
150
- - **Compute Region:** [More Information Needed]
151
- - **Carbon Emitted:** [More Information Needed]
152
-
153
- ## Technical Specifications [optional]
154
-
155
- ### Model Architecture and Objective
156
-
157
- [More Information Needed]
158
-
159
- ### Compute Infrastructure
160
-
161
- [More Information Needed]
162
-
163
- #### Hardware
164
-
165
- [More Information Needed]
166
-
167
- #### Software
168
-
169
- [More Information Needed]
170
-
171
- ## Citation [optional]
172
-
173
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
-
175
- **BibTeX:**
176
-
177
- [More Information Needed]
178
-
179
- **APA:**
180
-
181
- [More Information Needed]
182
-
183
- ## Glossary [optional]
184
-
185
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
-
187
- [More Information Needed]
188
-
189
- ## More Information [optional]
190
-
191
- [More Information Needed]
192
-
193
- ## Model Card Authors [optional]
194
-
195
- [More Information Needed]
196
-
197
- ## Model Card Contact
198
-
199
- [More Information Needed]
 
1
  ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ datasets:
6
+ - speechbrain/LoquaciousSet
7
+ base_model:
8
+ - zai-org/GLM-ASR-Nano-2512
9
+ - Qwen/Qwen3-0.6B
10
+ pipeline_tag: automatic-speech-recognition
11
+ tags:
12
+ - asr
13
+ - speech-recognition
14
+ - audio
15
+ - qwen
16
+ - glm-asr
17
  library_name: transformers
 
18
  ---
19
 
20
+ # Tiny Audio
21
 
22
+ A speech recognition model trained in 24 hours on a single GPU for ~$12. Built with [Tiny Audio](https://github.com/alexkroman/tiny-audio)—a minimal, hackable ASR framework.
23
 
24
+ ## Quick Start
25
 
26
+ ```python
27
+ from transformers import pipeline
28
 
29
+ pipe = pipeline("automatic-speech-recognition", model="mazesmazes/tiny-audio", trust_remote_code=True)
30
+ result = pipe("audio.wav")
31
+ print(result["text"])
32
+ ```
33
 
34
+ ## Usage Examples
35
 
36
+ ### Basic Transcription
37
 
38
+ ```python
39
+ from transformers import pipeline
40
 
41
+ pipe = pipeline("automatic-speech-recognition", model="mazesmazes/tiny-audio", trust_remote_code=True)
 
 
 
 
 
 
42
 
43
+ # From file
44
+ result = pipe("audio.wav")
45
+ print(result["text"])
46
 
47
+ # From URL
48
+ result = pipe("https://example.com/audio.mp3")
49
 
50
+ # From numpy array (must be 16kHz)
51
+ import numpy as np
52
+ audio = np.random.randn(16000).astype(np.float32) # 1 second
53
+ result = pipe(audio)
54
+ ```
55
 
56
+ ### Batch Processing
57
 
58
+ ```python
59
+ # Process multiple files
60
+ files = ["audio1.wav", "audio2.wav", "audio3.wav"]
61
+ results = pipe(files, batch_size=4)
62
+ for r in results:
63
+ print(r["text"])
64
+ ```
65
 
66
+ ### Word-Level Timestamps
67
 
68
+ ```python
69
+ result = pipe("audio.wav", return_timestamps="word")
70
+ # Returns:
71
+ # {
72
+ # "text": "hello world",
73
+ # "chunks": [
74
+ # {"text": "hello", "timestamp": (0.0, 0.5)},
75
+ # {"text": "world", "timestamp": (0.6, 1.0)}
76
+ # ]
77
+ # }
78
+ ```
79
 
80
+ ### Streaming Inference
81
 
82
+ ```python
83
+ from tiny_audio import ASRModel, ASRProcessor
84
+ import torch
85
 
86
+ model = ASRModel.from_pretrained("mazesmazes/tiny-audio")
87
+ processor = ASRProcessor.from_pretrained("mazesmazes/tiny-audio")
88
 
89
+ # Load and process audio
90
+ import librosa
91
+ audio, sr = librosa.load("audio.wav", sr=16000)
92
+ inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
93
 
94
+ # Stream tokens
95
+ for token in model.generate_streaming(inputs["input_features"]):
96
+ print(token, end="", flush=True)
97
+ ```
98
 
99
+ ### Using with torch directly
100
 
101
+ ```python
102
+ from tiny_audio import ASRModel, ASRProcessor
103
+ import torch
104
+ import librosa
105
 
106
+ # Load model and processor
107
+ model = ASRModel.from_pretrained("mazesmazes/tiny-audio")
108
+ processor = ASRProcessor.from_pretrained("mazesmazes/tiny-audio")
109
 
110
+ # Load audio (16kHz)
111
+ audio, sr = librosa.load("audio.wav", sr=16000)
112
 
113
+ # Process
114
+ inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
115
 
116
+ # Generate
117
+ with torch.no_grad():
118
+ output = model.generate(
119
+ input_features=inputs["input_features"],
120
+ attention_mask=inputs["attention_mask"],
121
+ max_new_tokens=256
122
+ )
123
 
124
+ # Decode
125
+ text = processor.batch_decode(output, skip_special_tokens=True)[0]
126
+ print(text)
127
+ ```
128
 
129
+ ### GPU Inference
130
+
131
+ ```python
132
+ import torch
133
 
134
+ pipe = pipeline(
135
+ "automatic-speech-recognition",
136
+ model="mazesmazes/tiny-audio",
137
+ trust_remote_code=True,
138
+ device="cuda" # or device=0
139
+ )
140
+ ```
141
+
142
+ ### Half Precision
143
+
144
+ ```python
145
+ pipe = pipeline(
146
+ "automatic-speech-recognition",
147
+ model="mazesmazes/tiny-audio",
148
+ trust_remote_code=True,
149
+ torch_dtype=torch.float16,
150
+ device="cuda"
151
+ )
152
+ ```
153
+
154
+ ## Architecture
155
+
156
+ ```
157
+ Audio (16kHz) → GLM-ASR Encoder (frozen) → MLP Projector (trained) → Qwen3 (frozen) → Text
158
+ ```
159
+
160
+ Only the projector is trained (~12M params). The encoder and decoder remain frozen, leveraging their pretrained knowledge.
161
+
162
+ | Component | Model | Parameters | Status |
163
+ |-----------|-------|------------|--------|
164
+ | Audio Encoder | GLM-ASR-Nano-2512 | ~600M | Frozen |
165
+ | Projector | 2-layer MLP | ~12M | Trained |
166
+ | Language Model | Qwen3-0.6B | ~600M | Frozen |
167
+
168
+ ### How It Works
169
+
170
+ 1. **Audio Encoder**: GLM-ASR converts 16kHz audio into frame-level embeddings (768-dim)
171
+ 2. **Projector**: A 2-layer MLP with frame stacking bridges the audio and text embedding spaces
172
+ 3. **Language Model**: Qwen3 generates text autoregressively, conditioned on the projected audio
173
+
174
+ The projector reduces sequence length via frame stacking: `output_len = (input_len - 5) // 5 + 1`
175
+
176
+ ## Model Specifications
177
 
178
+ | Specification | Value |
179
+ |---------------|-------|
180
+ | Input | Audio (16kHz mono) |
181
+ | Output | Text transcription |
182
+ | Max Audio Length | ~30 seconds (limited by encoder) |
183
+ | Vocabulary | Qwen3 tokenizer |
184
+ | Languages | English only |
185
+ | Generation | Greedy decoding (num_beams=1, do_sample=False) |
186
 
187
  ## Training Details
188
 
189
+ | | |
190
+ |---|---|
191
+ | **Dataset** | LoquaciousSet (25,000 hours) |
192
+ | **Hardware** | Single NVIDIA A40 |
193
+ | **Time** | ~24 hours |
194
+ | **Cost** | ~$12 |
195
+ | **Optimizer** | AdamW |
196
+ | **Learning Rate** | 1e-4 |
197
+ | **Batch Size** | 4 |
198
+ | **Steps** | 50,000 |
199
+
200
+ ## Limitations
201
+
202
+ - **English only**: Not trained on other languages
203
+ - **Sample rate**: Expects 16kHz audio (other rates resampled automatically)
204
+ - **Audio length**: Best for clips under 30 seconds
205
+ - **Accuracy**: May degrade on:
206
+ - Heavily accented speech
207
+ - Noisy or low-quality audio
208
+ - Domain-specific terminology
209
+ - Overlapping speakers
210
+ - **No punctuation**: Output is lowercase without punctuation by default
211
+
212
+ ## Requirements
213
+
214
+ ```
215
+ transformers>=4.40.0
216
+ torch>=2.0.0
217
+ torchaudio>=2.0.0
218
+ ```
219
+
220
+ Optional for streaming:
221
+ ```
222
+ librosa
223
+ soundfile
224
+ ```
225
+
226
+ ## Files
227
+
228
+ | File | Description |
229
+ |------|-------------|
230
+ | `config.json` | Model configuration |
231
+ | `model.safetensors` | Projector weights (~48MB) |
232
+ | `preprocessor_config.json` | Audio preprocessing config |
233
+ | `tokenizer.json` | Tokenizer |
234
+ | `tokenizer_config.json` | Tokenizer config |
235
+ | `special_tokens_map.json` | Special tokens |
236
+
237
+ Note: Only the projector weights are stored. The encoder (GLM-ASR) and decoder (Qwen3) are loaded from their respective HuggingFace repos.
238
+
239
+ ## Citation
240
+
241
+ If you use this model, please cite:
242
+
243
+ ```bibtex
244
+ @misc{tinyaudio2024,
245
+ author = {Alex Kroman},
246
+ title = {Tiny Audio: Minimal ASR Training},
247
+ year = {2024},
248
+ publisher = {GitHub},
249
+ url = {https://github.com/alexkroman/tiny-audio}
250
+ }
251
+ ```
252
+
253
+ ## Links
254
+
255
+ - [GitHub Repository](https://github.com/alexkroman/tiny-audio) - Train your own model
256
+ - [Free 3.5-hour Course](https://github.com/alexkroman/tiny-audio/blob/main/docs/course/0-course-overview.md) - Learn ASR from scratch
257
+ - [Live Demo](https://huggingface.co/spaces/mazesmazes/tiny-audio) - Try it in your browser
258
+
259
+ ## Acknowledgments
260
+
261
+ - [GLM-ASR](https://huggingface.co/zai-org/GLM-ASR-Nano-2512) for the audio encoder
262
+ - [Qwen3](https://huggingface.co/Qwen/Qwen3-0.6B) for the language model
263
+ - [LoquaciousSet](https://huggingface.co/datasets/speechbrain/LoquaciousSet) for training data
264
+
265
+ ## License
266
+
267
+ MIT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
asr_modeling.py CHANGED
@@ -24,6 +24,19 @@ except ImportError:
24
  from projectors import PROJECTOR_CLASSES # type: ignore[no-redef]
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def _gather_audio_embeds(audio_embeds: torch.Tensor, token_counts: torch.Tensor) -> torch.Tensor:
28
  """Flatten per-sample audio embeddings into a packed tensor.
29
 
@@ -215,7 +228,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
215
  def _load_audio_encoder(cls, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
216
  """Load and freeze the audio encoder."""
217
  encoder_kwargs = {
218
- "attn_implementation": config.attn_implementation,
219
  "low_cpu_mem_usage": True,
220
  "dtype": dtype,
221
  }
@@ -258,7 +271,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
258
  def _load_language_model(cls, config: ASRConfig, dtype: torch.dtype) -> PreTrainedModel:
259
  """Load and freeze the language model."""
260
  decoder_kwargs = {
261
- "attn_implementation": config.attn_implementation,
262
  "trust_remote_code": True,
263
  "low_cpu_mem_usage": True,
264
  "dtype": dtype,
 
24
  from projectors import PROJECTOR_CLASSES # type: ignore[no-redef]
25
 
26
 
27
+ def _resolve_attn_implementation(requested: Optional[str]) -> Optional[str]:
28
+ """Coerce flash_attention_2 to sdpa when CUDA isn't available.
29
+
30
+ FA2 is CUDA-only. On MPS/CPU, requesting it either errors at load or
31
+ silently falls back to a slower path; either way the user pays the FA2
32
+ install + import cost for no win. Coerce here so a saved config that
33
+ pins flash_attention_2 still loads on Mac / CPU-only Linux boxes.
34
+ """
35
+ if requested == "flash_attention_2" and not torch.cuda.is_available():
36
+ return "sdpa"
37
+ return requested
38
+
39
+
40
  def _gather_audio_embeds(audio_embeds: torch.Tensor, token_counts: torch.Tensor) -> torch.Tensor:
41
  """Flatten per-sample audio embeddings into a packed tensor.
42
 
 
228
  def _load_audio_encoder(cls, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
229
  """Load and freeze the audio encoder."""
230
  encoder_kwargs = {
231
+ "attn_implementation": _resolve_attn_implementation(config.attn_implementation),
232
  "low_cpu_mem_usage": True,
233
  "dtype": dtype,
234
  }
 
271
  def _load_language_model(cls, config: ASRConfig, dtype: torch.dtype) -> PreTrainedModel:
272
  """Load and freeze the language model."""
273
  decoder_kwargs = {
274
+ "attn_implementation": _resolve_attn_implementation(config.attn_implementation),
275
  "trust_remote_code": True,
276
  "low_cpu_mem_usage": True,
277
  "dtype": dtype,
handler.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Custom inference handler for HuggingFace Inference Endpoints."""
2
+
3
+ from typing import Any, Dict, List, Union
4
+
5
+ try:
6
+ # For remote execution, imports are relative
7
+ from .asr_modeling import ASRModel
8
+ from .asr_pipeline import ASRPipeline
9
+ except ImportError:
10
+ # For local execution, imports are not relative
11
+ from asr_modeling import ASRModel # type: ignore[no-redef]
12
+ from asr_pipeline import ASRPipeline # type: ignore[no-redef]
13
+
14
+
15
+ class EndpointHandler:
16
+ """HuggingFace Inference Endpoints handler for ASR model.
17
+
18
+ Handles model loading, warmup, and inference requests for deployment
19
+ on HuggingFace Inference Endpoints or similar services.
20
+ """
21
+
22
+ def __init__(self, path: str = ""):
23
+ """Initialize the endpoint handler.
24
+
25
+ Args:
26
+ path: Path to model directory or HuggingFace model ID
27
+ """
28
+ import os
29
+
30
+ import nltk
31
+ from transformers.utils import is_flash_attn_2_available
32
+
33
+ nltk.download("punkt_tab", quiet=True)
34
+
35
+ os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
36
+
37
+ model_kwargs = {
38
+ "device_map": "auto",
39
+ "torch_dtype": "auto",
40
+ "low_cpu_mem_usage": True,
41
+ }
42
+ if is_flash_attn_2_available():
43
+ model_kwargs["attn_implementation"] = "flash_attention_2"
44
+
45
+ self.model = ASRModel.from_pretrained(path, **model_kwargs)
46
+ self.device = next(self.model.parameters()).device
47
+
48
+ self.pipe = ASRPipeline(
49
+ model=self.model,
50
+ feature_extractor=self.model.feature_extractor,
51
+ tokenizer=self.model.tokenizer,
52
+ device=self.device,
53
+ )
54
+
55
+ def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
56
+ """Process an inference request.
57
+
58
+ Args:
59
+ data: Request data containing 'inputs' (audio path/bytes) and optional 'parameters'
60
+
61
+ Returns:
62
+ Transcription result with 'text' key
63
+ """
64
+ inputs = data.get("inputs")
65
+ if inputs is None:
66
+ raise ValueError("Missing 'inputs' in request data")
67
+
68
+ # Pass through any parameters from request, let model config provide defaults
69
+ params = data.get("parameters", {})
70
+
71
+ return self.pipe(inputs, **params)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Core dependencies for tiny-audio model inference
2
+ # This file is pushed to HuggingFace for model repository
3
+
4
+ # Transformers - main library for model loading and inference
5
+ transformers>=4.57.0