Add Qwen-ASR (both sizes) for automatic transcription and keep Whisper as an option in the dropdown.

#5
by Impulse2000 - opened
Files changed (2) hide show
  1. app.py +50 -19
  2. requirements.txt +1 -0
app.py CHANGED
@@ -84,29 +84,55 @@ def decode_codes_to_audio(merged_codes):
84
  return audio[0, 0]
85
 
86
 
87
- whisper_model = None
88
-
89
-
90
- def get_whisper_model():
91
- global whisper_model
92
- if whisper_model is None:
93
- from faster_whisper import WhisperModel
94
- whisper_model = WhisperModel("large-v3", device="cuda", compute_type="int8")
95
- return whisper_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
 
98
  @spaces.GPU(duration=60)
99
- def transcribe_audio(audio_path):
100
  if audio_path is None:
101
  raise gr.Error("Please upload a reference audio file first.")
102
  try:
103
- gr.Info("Transcribing audio with Whisper large-v3...")
104
- model = get_whisper_model()
105
- segments, info = model.transcribe(audio_path, beam_size=5, vad_filter=True)
106
- text = " ".join(seg.text.strip() for seg in segments).strip()
 
 
 
 
 
 
 
107
  if not text:
108
- raise gr.Error("Whisper could not detect any speech in the audio.")
109
- gr.Info(f"Detected language: {info.language} ({info.language_probability:.0%} confidence)")
110
  return text
111
  except gr.Error:
112
  raise
@@ -240,7 +266,12 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
240
  "The model will clone that voice for synthesis. Language is inferred automatically."
241
  )
242
  ref_audio = gr.Audio(label="Reference Audio", type="filepath")
243
- transcribe_btn = gr.Button("🎀 Auto-transcribe with Whisper", variant="secondary", size="sm")
 
 
 
 
 
244
  ref_text = gr.Textbox(
245
  label="Reference Audio Transcription",
246
  placeholder="Exact transcription of the reference audio, or click Auto-transcribe above...",
@@ -314,7 +345,7 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
314
 
315
  transcribe_btn.click(
316
  fn=transcribe_audio,
317
- inputs=[ref_audio],
318
  outputs=[ref_text],
319
  )
320
 
@@ -325,4 +356,4 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
325
  )
326
 
327
  if __name__ == "__main__":
328
- app.launch()
 
84
  return audio[0, 0]
85
 
86
 
87
+ ASR_MODELS = {
88
+ "Qwen3-ASR-1.7B — larger, more accurate": ("qwen", "Qwen/Qwen3-ASR-1.7B"),
89
+ "Qwen3-ASR-0.6B — smaller, faster": ("qwen", "Qwen/Qwen3-ASR-0.6B"),
90
+ "Whisper large-v3 (faster-whisper)": ("whisper", "large-v3"),
91
+ }
92
+
93
+ DEFAULT_ASR = "Qwen3-ASR-1.7B — larger, more accurate"
94
+
95
+ asr_models = {}
96
+
97
+
98
+ def get_asr_model(label):
99
+ if label not in asr_models:
100
+ backend, model_id = ASR_MODELS[label]
101
+ if backend == "qwen":
102
+ from qwen_asr import Qwen3ASRModel
103
+ asr_models[label] = Qwen3ASRModel.from_pretrained(
104
+ model_id,
105
+ dtype=torch.bfloat16,
106
+ device_map="cuda:0" if torch.cuda.is_available() else "cpu",
107
+ max_inference_batch_size=32,
108
+ max_new_tokens=256,
109
+ )
110
+ else:
111
+ from faster_whisper import WhisperModel
112
+ device = "cuda" if torch.cuda.is_available() else "cpu"
113
+ asr_models[label] = WhisperModel(model_id, device=device, compute_type="int8")
114
+ return asr_models[label]
115
 
116
 
117
  @spaces.GPU(duration=60)
118
+ def transcribe_audio(audio_path, asr_label):
119
  if audio_path is None:
120
  raise gr.Error("Please upload a reference audio file first.")
121
  try:
122
+ gr.Info(f"Transcribing audio with {asr_label}...")
123
+ backend, _ = ASR_MODELS[asr_label]
124
+ model = get_asr_model(asr_label)
125
+ if backend == "qwen":
126
+ result = model.transcribe(audio=audio_path, language=None)[0]
127
+ text = (result.text or "").strip()
128
+ detected_language = result.language
129
+ else:
130
+ segments, info = model.transcribe(audio_path, beam_size=5, vad_filter=True)
131
+ text = " ".join(seg.text.strip() for seg in segments).strip()
132
+ detected_language = info.language
133
  if not text:
134
+ raise gr.Error("No speech could be detected in the audio.")
135
+ gr.Info(f"Detected language: {detected_language}")
136
  return text
137
  except gr.Error:
138
  raise
 
266
  "The model will clone that voice for synthesis. Language is inferred automatically."
267
  )
268
  ref_audio = gr.Audio(label="Reference Audio", type="filepath")
269
+ asr_model_selector = gr.Radio(
270
+ choices=list(ASR_MODELS.keys()),
271
+ value=DEFAULT_ASR,
272
+ label="ASR Model",
273
+ )
274
+ transcribe_btn = gr.Button("🎀 Auto-transcribe", variant="secondary", size="sm")
275
  ref_text = gr.Textbox(
276
  label="Reference Audio Transcription",
277
  placeholder="Exact transcription of the reference audio, or click Auto-transcribe above...",
 
345
 
346
  transcribe_btn.click(
347
  fn=transcribe_audio,
348
+ inputs=[ref_audio, asr_model_selector],
349
  outputs=[ref_text],
350
  )
351
 
 
356
  )
357
 
358
  if __name__ == "__main__":
359
+ app.launch(server_port=8181)
requirements.txt CHANGED
@@ -6,6 +6,7 @@ datasets
6
  lightning
7
  hydra-core
8
  faster-whisper
 
9
  tensorboard
10
  natsort
11
  einops
 
6
  lightning
7
  hydra-core
8
  faster-whisper
9
+ qwen-asr
10
  tensorboard
11
  natsort
12
  einops