Deploy demo to HF Space
Commit d411ac6 · committed by HF Space Deploy
Files changed:
- .gitattributes (+5 lines)
- README.md (+78 lines)
- app.py (+186 lines)
- requirements.txt (+14 lines)
.gitattributes
ADDED
@@ -0,0 +1,5 @@
*.wav filter=lfs diff=lfs merge=lfs -text
*.mp3 filter=lfs diff=lfs merge=lfs -text
*.flac filter=lfs diff=lfs merge=lfs -text
*.m4a filter=lfs diff=lfs merge=lfs -text
*.ogg filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,78 @@
---
title: Tiny Audio Demo
emoji: 🎤
colorFrom: purple
colorTo: blue
sdk: gradio
sdk_version: "4.44.0"
python_version: "3.11"
app_file: app.py
pinned: false
license: mit
short_description: Efficient ASR with Whisper encoder and SmolLM3 decoder
models:
  - mazesmazes/tiny-audio
tags:
  - audio
  - automatic-speech-recognition
  - whisper
  - smollm
  - mlp
suggested_hardware: cpu-upgrade
preload_from_hub:
  - mazesmazes/tiny-audio
---

## Demo Overview

This Space demonstrates an Automatic Speech Recognition (ASR) model that combines:

- **Whisper encoder** for audio feature extraction
- **SmolLM3 decoder** for efficient text generation

## Features

- 🎙️ **Record from microphone** or upload audio files
- ⚡ **Fast inference** with a small number of trainable parameters
- 🎯 **English transcription** optimized for speech-to-text
- 📊 **Lightweight model** suitable for edge deployment

## Model Architecture

The model uses a novel architecture that bridges the audio and text modalities (a rough sketch of the projection step follows the list):

1. **Audio Encoder**: Frozen Whisper encoder
2. **Projection Layer**: Custom audio-to-text space mapping
3. **Text Decoder**: SmolLM3 (frozen)

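The `mlp` tag above hints at the shape of the projection layer. As a rough illustration only (the class name, layer count, and dimensions below are placeholder assumptions, not the repo's actual code), such a projector can be a small MLP that maps encoder frames into the decoder's embedding space:

```python
import torch.nn as nn


class AudioProjector(nn.Module):
    """Illustrative two-layer MLP projector; the real dimensions live in the model repo."""

    def __init__(self, encoder_dim=768, decoder_dim=2048, hidden_dim=2048):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(encoder_dim, hidden_dim),   # encoder frames -> hidden
            nn.GELU(),
            nn.Linear(hidden_dim, decoder_dim),   # hidden -> decoder embedding space
        )

    def forward(self, audio_features):
        # audio_features: (batch, frames, encoder_dim) from the frozen Whisper encoder
        return self.net(audio_features)  # (batch, frames, decoder_dim) for SmolLM3
```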
## Usage

1. **Upload an audio file** (WAV, MP3, etc.) or **record directly** using your microphone
2. Click **"Transcribe"** to convert speech to text
3. The transcription will appear in the output box

You can also skip the UI and call the model from Python, as shown below.

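The model is driven through the `transformers` pipeline, which is exactly how this Space's `app.py` loads it (`"speech.wav"` below is a placeholder for any short English clip):

```python
from transformers import pipeline

# trust_remote_code=True pulls in the custom ASRPipeline shipped with the model repo.
pipe = pipeline(
    "automatic-speech-recognition",
    model="mazesmazes/tiny-audio",
    trust_remote_code=True,
)

print(pipe("speech.wav")["text"])
```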
## Limitations

- Maximum audio length: 30 seconds
- Optimized for English
- Best performance with clear speech and minimal background noise

## Links

- 📦 [Model on Hugging Face](https://huggingface.co/mazesmazes/tiny-audio)
- 💻 [GitHub Repository](https://github.com/alexkroman/tiny-audio)
- 📄 [Technical Details](https://github.com/alexkroman/tiny-audio/blob/main/MODEL_CARD.md)

## Citation

If you use this model in your research, please cite:

```bibtex
@software{kroman2024tinyaudio,
  author    = {Kroman, Alex},
  title     = {Tiny Audio: Train your own speech recognition model in 24 hours},
  year      = {2024},
  publisher = {GitHub},
  url       = {https://github.com/alexkroman/tiny-audio}
}
```
app.py
ADDED
@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
Gradio app for ASR model with support for:
- Microphone input
- File upload
- Word-level timestamps
- Speaker diarization
"""

import os

# Fix OpenMP environment variable if invalid
if not os.environ.get("OMP_NUM_THREADS", "").isdigit():
    os.environ["OMP_NUM_THREADS"] = "1"

# Set matplotlib config dir to avoid warning in Hugging Face Spaces
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"

# Disable tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import gradio as gr
import torch
from transformers import pipeline


def format_timestamp(seconds):
    """Format seconds as MM:SS.ms"""
    mins = int(seconds // 60)
    secs = seconds % 60
    return f"{mins:02d}:{secs:05.2f}"


def format_words_with_timestamps(words):
    """Format word timestamps as readable text."""
    if not words:
        return ""

    lines = []
    for w in words:
        start = format_timestamp(w["start"])
        end = format_timestamp(w["end"])
        speaker = w.get("speaker", "")
        if speaker:
            lines.append(f"[{start} - {end}] ({speaker}) {w['word']}")
        else:
            lines.append(f"[{start} - {end}] {w['word']}")

    return "\n".join(lines)


def format_speaker_segments(segments):
    """Format speaker segments as readable text."""
    if not segments:
        return ""

    lines = []
    for seg in segments:
        start = format_timestamp(seg["start"])
        end = format_timestamp(seg["end"])
        lines.append(f"[{start} - {end}] {seg['speaker']}")

    return "\n".join(lines)


def create_demo(model_path="mazesmazes/tiny-audio"):
    """Create Gradio demo interface using transformers pipeline."""

    # Determine device
    if torch.cuda.is_available():
        device = 0
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = -1

    # Load pipeline - uses custom ASRPipeline from the model repo
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_path,
        trust_remote_code=True,
        device=device,
    )

    def process_audio(audio, show_timestamps, show_diarization):
        """Process audio file for transcription."""
        if audio is None:
            return "Please provide audio input", "", ""

        # Build kwargs
        kwargs = {}
        if show_timestamps:
            kwargs["return_timestamps"] = True
        if show_diarization:
            kwargs["return_speakers"] = True

        # Transcribe the audio
        result = pipe(audio, **kwargs)

        # Format outputs
        transcript = result.get("text", "")

        # Format timestamps
        if show_timestamps and "words" in result:
            timestamps_text = format_words_with_timestamps(result["words"])
        elif "timestamp_error" in result:
            timestamps_text = f"Error: {result['timestamp_error']}"
        else:
            timestamps_text = ""

        # Format diarization
        if show_diarization and "speaker_segments" in result:
            diarization_text = format_speaker_segments(result["speaker_segments"])
        elif "diarization_error" in result:
            diarization_text = f"Error: {result['diarization_error']}"
        else:
            diarization_text = ""

        return transcript, timestamps_text, diarization_text

    # Create Gradio interface
    with gr.Blocks(title="Tiny Audio") as demo:
        gr.Markdown("# Tiny Audio")
        gr.Markdown("Speech recognition with optional word timestamps and speaker diarization.")

        with gr.Row():
            with gr.Column(scale=2):
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Audio Input",
                )

                with gr.Row():
                    show_timestamps = gr.Checkbox(
                        label="Word Timestamps",
                        value=False,
                    )
                    show_diarization = gr.Checkbox(
                        label="Speaker Diarization",
                        value=False,
                    )

                process_btn = gr.Button("Transcribe", variant="primary")

            with gr.Column(scale=3):
                output_text = gr.Textbox(
                    label="Transcript",
                    lines=5,
                )
                timestamps_output = gr.Textbox(
                    label="Word Timestamps",
                    lines=8,
                )
                diarization_output = gr.Textbox(
                    label="Speaker Segments",
                    lines=5,
                )

        # Wire up events
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, show_timestamps, show_diarization],
            outputs=[output_text, timestamps_output, diarization_output],
        )

    return demo


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="ASR Gradio Demo")
    parser.add_argument(
        "--model",
        type=str,
        default=os.environ.get("MODEL_ID", "mazesmazes/tiny-audio"),
        help="HuggingFace Hub model ID",
    )
    parser.add_argument("--port", type=int, default=7860)
    parser.add_argument("--share", action="store_true")

    args = parser.parse_args()

    demo = create_demo(args.model)
    demo.launch(server_port=args.port, share=args.share, server_name="0.0.0.0")
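For reference, the word entries that `process_audio` receives from the pipeline carry `start`, `end`, `word`, and an optional `speaker` key, so the formatters above can be exercised directly. The sample values here are invented, and the snippet assumes the functions from app.py are in scope:

```python
# Hypothetical pipeline output entries; only the keys matter, the values are made up.
words = [
    {"start": 0.0, "end": 0.42, "word": "hello", "speaker": "SPEAKER_00"},
    {"start": 0.5, "end": 1.1, "word": "world"},
]
print(format_words_with_timestamps(words))
# [00:00.00 - 00:00.42] (SPEAKER_00) hello
# [00:00.50 - 00:01.10] world
```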
requirements.txt
ADDED
@@ -0,0 +1,14 @@
# Use latest compatible versions
gradio>=4.44.1
transformers>=4.57.1
torch
soundfile
librosa
peft
truecase

# Forced alignment for word-level timestamps
ctc-forced-aligner @ git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git

# Speaker diarization
pyannote-audio>=3.1.0
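A quick smoke test after installing from this file (it only confirms that the core imports resolve; the printed versions will vary):

```python
import gradio
import torch
import transformers

print(gradio.__version__, torch.__version__, transformers.__version__)
```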