HF Space Deploy committed on
Commit
d411ac6
·
0 Parent(s):

Deploy demo to HF Space

Browse files
Files changed (4) hide show
  1. .gitattributes +5 -0
  2. README.md +78 -0
  3. app.py +186 -0
  4. requirements.txt +14 -0
.gitattributes ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ *.wav filter=lfs diff=lfs merge=lfs -text
2
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
3
+ *.flac filter=lfs diff=lfs merge=lfs -text
4
+ *.m4a filter=lfs diff=lfs merge=lfs -text
5
+ *.ogg filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Tiny Audio Demo
3
+ emoji: 🎤
4
+ colorFrom: purple
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ python_version: "3.11"
9
+ app_file: app.py
10
+ pinned: false
11
+ license: mit
12
+ short_description: Efficient ASR with Whisper encoder and SmolLM3 decoder
13
+ models:
14
+ - mazesmazes/tiny-audio
15
+ tags:
16
+ - audio
17
+ - automatic-speech-recognition
18
+ - whisper
19
+ - smollm
20
+ - mlp
21
+ suggested_hardware: cpu-upgrade
22
+ preload_from_hub:
23
+ - mazesmazes/tiny-audio
24
+ ---
25
+
26
+ ## Demo Overview
27
+
28
+ This Space demonstrates an Automatic Speech Recognition (ASR) model that combines:
29
+
30
+ - **Whisper encoder** for audio feature extraction
31
+ - **SmolLM3 decoder** for efficient text generation
32
+
33
+ ## Features
34
+
35
+ - 🎙️ **Record from microphone** or upload audio files
36
+ - ⚡ **Fast inference** with a small number of trainable parameters
37
+ - 🎯 **English transcription** optimized for speech-to-text
38
+ - 📊 **Lightweight model** suitable for edge deployment
39
+
40
+ ## Model Architecture
41
+
42
+ The model uses a novel architecture that bridges audio and text modalities:
43
+
44
+ 1. **Audio Encoder**: Frozen Whisper encoder
45
+ 2. **Projection Layer**: Custom audio-to-text space mapping
46
+ 3. **Text Decoder**: SmolLM3 (frozen)
47
+
48
+ ## Usage
49
+
50
+ 1. **Upload an audio file** (WAV, MP3, etc.) or **record directly** using your microphone
51
+ 2. Click **"Transcribe"** to convert speech to text
52
+ 3. The transcription will appear in the output box
53
+
54
+ ## Limitations
55
+
56
+ - Maximum audio length: 30 seconds
57
+ - Optimized for English language
58
+ - Best performance with clear speech and minimal background noise
59
+
60
+ ## Links
61
+
62
+ - 📦 [Model on Hugging Face](https://huggingface.co/mazesmazes/tiny-audio)
63
+ - 💻 [GitHub Repository](https://github.com/alexkroman/tiny-audio)
64
+ - 📄 [Technical Details](https://github.com/alexkroman/tiny-audio/blob/main/MODEL_CARD.md)
65
+
66
+ ## Citation
67
+
68
+ If you use this model in your research, please cite:
69
+
70
+ ```bibtex
71
+ @software{kroman2024tinyaudio,
72
+ author = {Kroman, Alex},
73
+ title = {Tiny Audio: Train your own speech recognition model in 24 hours},
74
+ year = {2024},
75
+ publisher = {GitHub},
76
+ url = {https://github.com/alexkroman/tiny-audio}
77
+ }
78
+ ```
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gradio app for ASR model with support for:
4
+ - Microphone input
5
+ - File upload
6
+ - Word-level timestamps
7
+ - Speaker diarization
8
+ """
9
+
10
+ import os
11
+
12
# OpenMP expects a numeric thread count: when OMP_NUM_THREADS is unset or is
# not made up purely of digits, pin it to a single thread.
_omp_threads = os.environ.get("OMP_NUM_THREADS", "")
if not _omp_threads.isdigit():
    os.environ["OMP_NUM_THREADS"] = "1"

# Remaining settings are unconditional overrides:
#   MPLCONFIGDIR           - matplotlib config dir, set to avoid a warning in
#                            Hugging Face Spaces
#   TOKENIZERS_PARALLELISM - disables the tokenizer parallelism warning
os.environ.update(
    {
        "MPLCONFIGDIR": "/tmp/matplotlib",
        "TOKENIZERS_PARALLELISM": "false",
    }
)
21
+
22
+ import gradio as gr
23
+ import torch
24
+ from transformers import pipeline
25
+
26
+
27
def format_timestamp(seconds):
    """Format a duration in seconds as ``MM:SS.hh`` (hundredths of a second).

    The value is rounded to whole hundredths *before* it is split into
    minutes and seconds. Splitting first and rounding while formatting lets
    the seconds field overflow: 59.999 would render as ``00:60.00`` instead
    of ``01:00.00``.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        A string with zero-padded minutes, seconds and hundredths. Minutes
        are not wrapped into hours, so 3600 seconds gives ``60:00.00``.
    """
    # Work in integer hundredths so the carry into minutes is exact.
    total_hundredths = int(round(seconds * 100))
    mins, remainder = divmod(total_hundredths, 60 * 100)
    whole_secs, hundredths = divmod(remainder, 100)
    return f"{mins:02d}:{whole_secs:02d}.{hundredths:02d}"
32
+
33
+
34
def format_words_with_timestamps(words):
    """Render word timing entries as text, one word per line.

    Args:
        words: Iterable of mappings read through the keys ``"start"``,
            ``"end"`` and ``"word"``, plus an optional ``"speaker"`` key.
            A falsy value (``None`` or empty) yields an empty string.

    Returns:
        Lines of the form ``[start - end] word`` or, when the entry has a
        non-empty speaker, ``[start - end] (speaker) word``, joined with
        newlines.
    """
    if not words:
        return ""

    def render(entry):
        # Both timestamps go through the shared formatter.
        begin = format_timestamp(entry["start"])
        finish = format_timestamp(entry["end"])
        who = entry.get("speaker", "")
        # The speaker tag is only shown when one is present and non-empty.
        tag = f"({who}) " if who else ""
        return f"[{begin} - {finish}] {tag}{entry['word']}"

    return "\n".join(render(entry) for entry in words)
50
+
51
+
52
def format_speaker_segments(segments):
    """Render speaker segments as text, one segment per line.

    Args:
        segments: Iterable of mappings read through the keys ``"start"``,
            ``"end"`` and ``"speaker"``. A falsy value (``None`` or empty)
            yields an empty string.

    Returns:
        Lines of the form ``[start - end] speaker`` joined with newlines.
    """
    if not segments:
        return ""

    def render(segment):
        begin = format_timestamp(segment["start"])
        finish = format_timestamp(segment["end"])
        return f"[{begin} - {finish}] {segment['speaker']}"

    return "\n".join(render(segment) for segment in segments)
64
+
65
+
66
def create_demo(model_path="mazesmazes/tiny-audio"):
    """Build the Gradio Blocks interface around a transformers ASR pipeline.

    Args:
        model_path: Model identifier passed to ``transformers.pipeline``.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    # Device preference: first CUDA device, then Apple MPS, otherwise CPU (-1).
    device = (
        0
        if torch.cuda.is_available()
        else ("mps" if torch.backends.mps.is_available() else -1)
    )

    # trust_remote_code lets the model repo supply its own pipeline code
    # (the custom ASRPipeline).
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_path,
        trust_remote_code=True,
        device=device,
    )

    def process_audio(audio, show_timestamps, show_diarization):
        """Transcribe *audio* and return (transcript, timestamps, speakers) text."""
        if audio is None:
            return "Please provide audio input", "", ""

        # Only forward the optional flags the user actually ticked.
        requested = {
            "return_timestamps": show_timestamps,
            "return_speakers": show_diarization,
        }
        kwargs = {flag: True for flag, wanted in requested.items() if wanted}

        result = pipe(audio, **kwargs)

        def render_section(enabled, data_key, error_key, formatter):
            # Formatted data takes precedence; otherwise surface an error
            # reported under error_key; otherwise leave the pane empty.
            if enabled and data_key in result:
                return formatter(result[data_key])
            if error_key in result:
                return f"Error: {result[error_key]}"
            return ""

        transcript = result.get("text", "")
        timestamps_text = render_section(
            show_timestamps, "words", "timestamp_error", format_words_with_timestamps
        )
        diarization_text = render_section(
            show_diarization,
            "speaker_segments",
            "diarization_error",
            format_speaker_segments,
        )
        return transcript, timestamps_text, diarization_text

    with gr.Blocks(title="Tiny Audio") as demo:
        gr.Markdown("# Tiny Audio")
        gr.Markdown("Speech recognition with optional word timestamps and speaker diarization.")

        with gr.Row():
            # Left column: audio source, option toggles and the action button.
            with gr.Column(scale=2):
                audio_box = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Audio Input",
                )

                with gr.Row():
                    timestamps_toggle = gr.Checkbox(label="Word Timestamps", value=False)
                    diarization_toggle = gr.Checkbox(label="Speaker Diarization", value=False)

                transcribe_button = gr.Button("Transcribe", variant="primary")

            # Right column: the three text outputs.
            with gr.Column(scale=3):
                transcript_box = gr.Textbox(label="Transcript", lines=5)
                timestamps_box = gr.Textbox(label="Word Timestamps", lines=8)
                speakers_box = gr.Textbox(label="Speaker Segments", lines=5)

        # Clicking the button runs the transcription and fills all three panes.
        transcribe_button.click(
            fn=process_audio,
            inputs=[audio_box, timestamps_toggle, diarization_toggle],
            outputs=[transcript_box, timestamps_box, speakers_box],
        )

    return demo
168
+
169
+
170
if __name__ == "__main__":
    import argparse

    # Command-line options; the default model can also be supplied through
    # the MODEL_ID environment variable.
    cli = argparse.ArgumentParser(description="ASR Gradio Demo")
    default_model = os.environ.get("MODEL_ID", "mazesmazes/tiny-audio")
    cli.add_argument(
        "--model",
        type=str,
        default=default_model,
        help="HuggingFace Hub model ID",
    )
    cli.add_argument("--port", type=int, default=7860)
    cli.add_argument("--share", action="store_true")
    options = cli.parse_args()

    demo = create_demo(options.model)
    # Bind on all interfaces (0.0.0.0) on the requested port.
    demo.launch(
        server_name="0.0.0.0",
        server_port=options.port,
        share=options.share,
    )
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use latest compatible versions
2
+ gradio>=4.44.1
3
+ transformers>=4.57.1
4
+ torch
5
+ soundfile
6
+ librosa
7
+ peft
8
+ truecase
9
+
10
+ # Forced alignment for word-level timestamps
11
+ ctc-forced-aligner @ git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git
12
+
13
+ # Speaker diarization
14
+ pyannote-audio>=3.1.0