Update custom model files, README, and requirements

Browse files

Files changed (5) hide show

.gitattributes +2 -35
README.md +230 -162
asr_modeling.py +15 -2
handler.py +71 -0
requirements.txt +5 -0

.gitattributes CHANGED Viewed

@@ -1,36 +1,3 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+tokenizer_config.json -filter -diff -merge text

README.md CHANGED Viewed

@@ -1,199 +1,267 @@
 ---
 library_name: transformers
-tags: []
 ---
-# Model Card for Model ID
-<!-- Provide a quick summary of what the model is/does. -->
-## Model Details
-### Model Description
-<!-- Provide a longer summary of what this model is. -->
-This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
-- **Developed by:** [More Information Needed]
-- **Funded by [optional]:** [More Information Needed]
-- **Shared by [optional]:** [More Information Needed]
-- **Model type:** [More Information Needed]
-- **Language(s) (NLP):** [More Information Needed]
-- **License:** [More Information Needed]
-- **Finetuned from model [optional]:** [More Information Needed]
-### Model Sources [optional]
-<!-- Provide the basic links for the model. -->
-- **Repository:** [More Information Needed]
-- **Paper [optional]:** [More Information Needed]
-- **Demo [optional]:** [More Information Needed]
-## Uses
-<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
-### Direct Use
-<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
-[More Information Needed]
-### Downstream Use [optional]
-<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
-[More Information Needed]
-### Out-of-Scope Use
-<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
-[More Information Needed]
-## Bias, Risks, and Limitations
-<!-- This section is meant to convey both technical and sociotechnical limitations. -->
-[More Information Needed]
-### Recommendations
-<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
-Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
-## How to Get Started with the Model
-Use the code below to get started with the model.
-[More Information Needed]
 ## Training Details
-### Training Data
-<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
-[More Information Needed]
-### Training Procedure
-<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
-#### Preprocessing [optional]
-[More Information Needed]
-#### Training Hyperparameters
-- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
-#### Speeds, Sizes, Times [optional]
-<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
-[More Information Needed]
-## Evaluation
-<!-- This section describes the evaluation protocols and provides the results. -->
-### Testing Data, Factors & Metrics
-#### Testing Data
-<!-- This should link to a Dataset Card if possible. -->
-[More Information Needed]
-#### Factors
-<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
-[More Information Needed]
-#### Metrics
-<!-- These are the evaluation metrics being used, ideally with a description of why. -->
-[More Information Needed]
-### Results
-[More Information Needed]
-#### Summary
-## Model Examination [optional]
-<!-- Relevant interpretability work for the model goes here -->
-[More Information Needed]
-## Environmental Impact
-<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
-Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
-- **Hardware Type:** [More Information Needed]
-- **Hours used:** [More Information Needed]
-- **Cloud Provider:** [More Information Needed]
-- **Compute Region:** [More Information Needed]
-- **Carbon Emitted:** [More Information Needed]
-## Technical Specifications [optional]
-### Model Architecture and Objective
-[More Information Needed]
-### Compute Infrastructure
-[More Information Needed]
-#### Hardware
-[More Information Needed]
-#### Software
-[More Information Needed]
-## Citation [optional]
-<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
-**BibTeX:**
-[More Information Needed]
-**APA:**
-[More Information Needed]
-## Glossary [optional]
-<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
-[More Information Needed]
-## More Information [optional]
-[More Information Needed]
-## Model Card Authors [optional]
-[More Information Needed]
-## Model Card Contact
-[More Information Needed]

 ---
+license: mit
+language:
+- en
+datasets:
+- speechbrain/LoquaciousSet
+base_model:
+- zai-org/GLM-ASR-Nano-2512
+- Qwen/Qwen3-0.6B
+pipeline_tag: automatic-speech-recognition
+tags:
+- asr
+- speech-recognition
+- audio
+- qwen
+- glm-asr
 library_name: transformers
 ---
+# Tiny Audio
+A speech recognition model trained in 24 hours on a single GPU for ~$12. Built with [Tiny Audio](https://github.com/alexkroman/tiny-audio)—a minimal, hackable ASR framework.
+## Quick Start
+```python
+from transformers import pipeline
+pipe = pipeline("automatic-speech-recognition", model="mazesmazes/tiny-audio", trust_remote_code=True)
+result = pipe("audio.wav")
+print(result["text"])
+```
+## Usage Examples
+### Basic Transcription
+```python
+from transformers import pipeline
+pipe = pipeline("automatic-speech-recognition", model="mazesmazes/tiny-audio", trust_remote_code=True)
+# From file
+result = pipe("audio.wav")
+print(result["text"])
+# From URL
+result = pipe("https://example.com/audio.mp3")
+# From numpy array (must be 16kHz)
+import numpy as np
+audio = np.random.randn(16000).astype(np.float32)  # 1 second
+result = pipe(audio)
+```
+### Batch Processing
+```python
+# Process multiple files
+files = ["audio1.wav", "audio2.wav", "audio3.wav"]
+results = pipe(files, batch_size=4)
+for r in results:
+    print(r["text"])
+```
+### Word-Level Timestamps
+```python
+result = pipe("audio.wav", return_timestamps="word")
+# Returns:
+# {
+#   "text": "hello world",
+#   "chunks": [
+#     {"text": "hello", "timestamp": (0.0, 0.5)},
+#     {"text": "world", "timestamp": (0.6, 1.0)}
+#   ]
+# }
+```
+### Streaming Inference
+```python
+from tiny_audio import ASRModel, ASRProcessor
+import torch
+model = ASRModel.from_pretrained("mazesmazes/tiny-audio")
+processor = ASRProcessor.from_pretrained("mazesmazes/tiny-audio")
+# Load and process audio
+import librosa
+audio, sr = librosa.load("audio.wav", sr=16000)
+inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+# Stream tokens
+for token in model.generate_streaming(inputs["input_features"]):
+    print(token, end="", flush=True)
+```
+### Using with torch directly
+```python
+from tiny_audio import ASRModel, ASRProcessor
+import torch
+import librosa
+# Load model and processor
+model = ASRModel.from_pretrained("mazesmazes/tiny-audio")
+processor = ASRProcessor.from_pretrained("mazesmazes/tiny-audio")
+# Load audio (16kHz)
+audio, sr = librosa.load("audio.wav", sr=16000)
+# Process
+inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+# Generate
+with torch.no_grad():
+    output = model.generate(
+        input_features=inputs["input_features"],
+        attention_mask=inputs["attention_mask"],
+        max_new_tokens=256
+    )
+# Decode
+text = processor.batch_decode(output, skip_special_tokens=True)[0]
+print(text)
+```
+### GPU Inference
+```python
+import torch
+from transformers import pipeline
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model="mazesmazes/tiny-audio",
+    trust_remote_code=True,
+    device="cuda"  # or device=0
+)
+```
+### Half Precision
+```python
+import torch
+from transformers import pipeline
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model="mazesmazes/tiny-audio",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    device="cuda"
+)
+```
+## Architecture
+```
+Audio (16kHz) → GLM-ASR Encoder (frozen) → MLP Projector (trained) → Qwen3 (frozen) → Text
+```
+Only the projector is trained (~12M params). The encoder and decoder remain frozen, leveraging their pretrained knowledge.
+| Component | Model | Parameters | Status |
+|-----------|-------|------------|--------|
+| Audio Encoder | GLM-ASR-Nano-2512 | ~600M | Frozen |
+| Projector | 2-layer MLP | ~12M | Trained |
+| Language Model | Qwen3-0.6B | ~600M | Frozen |
+### How It Works
+1. **Audio Encoder**: GLM-ASR converts 16kHz audio into frame-level embeddings (768-dim)
+2. **Projector**: A 2-layer MLP with frame stacking bridges the audio and text embedding spaces
+3. **Language Model**: Qwen3 generates text autoregressively, conditioned on the projected audio
+The projector reduces sequence length via frame stacking: `output_len = (input_len - 5) // 5 + 1`
+## Model Specifications
+| Specification | Value |
+|---------------|-------|
+| Input | Audio (16kHz mono) |
+| Output | Text transcription |
+| Max Audio Length | ~30 seconds (limited by encoder) |
+| Vocabulary | Qwen3 tokenizer |
+| Languages | English only |
+| Generation | Greedy decoding (num_beams=1, do_sample=False) |
 ## Training Details
+| | |
+|---|---|
+| **Dataset** | LoquaciousSet (25,000 hours) |
+| **Hardware** | Single NVIDIA A40 |
+| **Time** | ~24 hours |
+| **Cost** | ~$12 |
+| **Optimizer** | AdamW |
+| **Learning Rate** | 1e-4 |
+| **Batch Size** | 4 |
+| **Steps** | 50,000 |
+## Limitations
+- **English only**: Not trained on other languages
+- **Sample rate**: Expects 16kHz audio (audio files at other rates are resampled automatically; raw numpy arrays must already be 16kHz)
+- **Audio length**: Best for clips under 30 seconds
+- **Accuracy**: May degrade on:
+  - Heavily accented speech
+  - Noisy or low-quality audio
+  - Domain-specific terminology
+  - Overlapping speakers
+- **No punctuation**: Output is lowercase without punctuation by default
+## Requirements
+```
+transformers>=4.57.0
+torch>=2.0.0
+torchaudio>=2.0.0
+```
+Optional, for the streaming and direct-torch examples above (audio loading):
+```
+librosa
+soundfile
+```
+## Files
+| File | Description |
+|------|-------------|
+| `config.json` | Model configuration |
+| `model.safetensors` | Projector weights (~48MB) |
+| `preprocessor_config.json` | Audio preprocessing config |
+| `tokenizer.json` | Tokenizer |
+| `tokenizer_config.json` | Tokenizer config |
+| `special_tokens_map.json` | Special tokens |
+Note: Only the projector weights are stored. The encoder (GLM-ASR) and decoder (Qwen3) are loaded from their respective HuggingFace repos.
+## Citation
+If you use this model, please cite:
+```bibtex
+@misc{tinyaudio2024,
+  author = {Alex Kroman},
+  title = {Tiny Audio: Minimal ASR Training},
+  year = {2024},
+  publisher = {GitHub},
+  url = {https://github.com/alexkroman/tiny-audio}
+}
+```
+## Links
+- [GitHub Repository](https://github.com/alexkroman/tiny-audio) - Train your own model
+- [Free 3.5-hour Course](https://github.com/alexkroman/tiny-audio/blob/main/docs/course/0-course-overview.md) - Learn ASR from scratch
+- [Live Demo](https://huggingface.co/spaces/mazesmazes/tiny-audio) - Try it in your browser
+## Acknowledgments
+- [GLM-ASR](https://huggingface.co/zai-org/GLM-ASR-Nano-2512) for the audio encoder
+- [Qwen3](https://huggingface.co/Qwen/Qwen3-0.6B) for the language model
+- [LoquaciousSet](https://huggingface.co/datasets/speechbrain/LoquaciousSet) for training data
+## License
+MIT

asr_modeling.py CHANGED Viewed

@@ -24,6 +24,19 @@ except ImportError:
     from projectors import PROJECTOR_CLASSES  # type: ignore[no-redef]
 def _gather_audio_embeds(audio_embeds: torch.Tensor, token_counts: torch.Tensor) -> torch.Tensor:
     """Flatten per-sample audio embeddings into a packed tensor.
@@ -215,7 +228,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
     def _load_audio_encoder(cls, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
         """Load and freeze the audio encoder."""
         encoder_kwargs = {
-            "attn_implementation": config.attn_implementation,
             "low_cpu_mem_usage": True,
             "dtype": dtype,
         }
@@ -258,7 +271,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
     def _load_language_model(cls, config: ASRConfig, dtype: torch.dtype) -> PreTrainedModel:
         """Load and freeze the language model."""
         decoder_kwargs = {
-            "attn_implementation": config.attn_implementation,
             "trust_remote_code": True,
             "low_cpu_mem_usage": True,
             "dtype": dtype,

     from projectors import PROJECTOR_CLASSES  # type: ignore[no-redef]
+def _resolve_attn_implementation(requested: Optional[str]) -> Optional[str]:
+    """Return the attention backend to load with, given the configured one.
+    Only ``"flash_attention_2"`` is ever rewritten: when torch reports no
+    CUDA device it becomes ``"sdpa"``. Any other value, including ``None``,
+    is handed back unchanged.
+    Rationale: FA2 is CUDA-only, so a saved config that pins it would
+    otherwise error at load or fall back to a slower path on MPS / CPU-only
+    machines. Downgrading here keeps such configs loadable.
+    """
+    wants_flash_attn = requested == "flash_attention_2"
+    # Guard clause: nothing to rewrite unless FA2 was asked for and CUDA is absent.
+    if not wants_flash_attn or torch.cuda.is_available():
+        return requested
+    return "sdpa"
 def _gather_audio_embeds(audio_embeds: torch.Tensor, token_counts: torch.Tensor) -> torch.Tensor:
     """Flatten per-sample audio embeddings into a packed tensor.
     def _load_audio_encoder(cls, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
         """Load and freeze the audio encoder."""
         encoder_kwargs = {
+            "attn_implementation": _resolve_attn_implementation(config.attn_implementation),
             "low_cpu_mem_usage": True,
             "dtype": dtype,
         }
     def _load_language_model(cls, config: ASRConfig, dtype: torch.dtype) -> PreTrainedModel:
         """Load and freeze the language model."""
         decoder_kwargs = {
+            "attn_implementation": _resolve_attn_implementation(config.attn_implementation),
             "trust_remote_code": True,
             "low_cpu_mem_usage": True,
             "dtype": dtype,

handler.py ADDED Viewed

	@@ -0,0 +1,71 @@

+"""Custom inference handler for HuggingFace Inference Endpoints."""
+from typing import Any, Dict, List, Union
+try:
+    # For remote execution, imports are relative
+    from .asr_modeling import ASRModel
+    from .asr_pipeline import ASRPipeline
+except ImportError:
+    # For local execution, imports are not relative
+    from asr_modeling import ASRModel  # type: ignore[no-redef]
+    from asr_pipeline import ASRPipeline  # type: ignore[no-redef]
+class EndpointHandler:
+    """HuggingFace Inference Endpoints handler for the ASR model.
+    Loads the model once at construction time, wraps it in an
+    ``ASRPipeline``, and serves transcription requests through
+    ``__call__``.
+    """
+    def __init__(self, path: str = ""):
+        """Load the model and build the inference pipeline.
+        Args:
+            path: Path to model directory or HuggingFace model ID
+        """
+        import os
+        import nltk
+        from transformers.utils import is_flash_attn_2_available
+        # Fetch the NLTK "punkt_tab" data up front; presumably needed by the
+        # pipeline's text post-processing -- TODO confirm against asr_pipeline.
+        nltk.download("punkt_tab", quiet=True)
+        # setdefault: a value already present in the environment is kept.
+        os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+        model_kwargs = {
+            "device_map": "auto",
+            "torch_dtype": "auto",
+            "low_cpu_mem_usage": True,
+        }
+        # Only request FlashAttention-2 when transformers reports it usable;
+        # otherwise the attention implementation is left to the model config.
+        if is_flash_attn_2_available():
+            model_kwargs["attn_implementation"] = "flash_attention_2"
+        self.model = ASRModel.from_pretrained(path, **model_kwargs)
+        # The pipeline is bound to whichever device holds the first parameter.
+        self.device = next(self.model.parameters()).device
+        self.pipe = ASRPipeline(
+            model=self.model,
+            feature_extractor=self.model.feature_extractor,
+            tokenizer=self.model.tokenizer,
+            device=self.device,
+        )
+    def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+        """Process an inference request.
+        Args:
+            data: Request data containing 'inputs' (audio path/bytes) and optional 'parameters'
+        Returns:
+            Whatever the pipeline returns for the given inputs (a
+            transcription result with a 'text' key, or a list of them).
+        Raises:
+            ValueError: If 'inputs' is missing or None.
+        """
+        inputs = data.get("inputs")
+        if inputs is None:
+            raise ValueError("Missing 'inputs' in request data")
+        # Pass through any parameters from request, let model config provide defaults.
+        # A request may carry "parameters": null; treat that like an absent key
+        # so the ** expansion below never receives None.
+        params = data.get("parameters") or {}
+        return self.pipe(inputs, **params)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+# Core dependencies for tiny-audio model inference
+# This file is pushed to the Hugging Face model repository
+# Transformers - main library for model loading and inference
+transformers>=4.57.0