Spaces:

artificialguybr
/

fish-s2-pro-zero

Running on Zero

App Files Files Community

Add Qwen-ASR (both sizes) for automatic transcription, and keep Whisper as an option in the dropdown.

by Impulse2000 - opened 13 days ago

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

+51

-19

Files changed (2) hide show

app.py +50 -19
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -84,29 +84,55 @@ def decode_codes_to_audio(merged_codes):
     return audio[0, 0]
-whisper_model = None
-def get_whisper_model():
-    global whisper_model
-    if whisper_model is None:
-        from faster_whisper import WhisperModel
-        whisper_model = WhisperModel("large-v3", device="cuda", compute_type="int8")
-    return whisper_model
 @spaces.GPU(duration=60)
-def transcribe_audio(audio_path):
     if audio_path is None:
         raise gr.Error("Please upload a reference audio file first.")
     try:
-        gr.Info("Transcribing audio with Whisper large-v3...")
-        model = get_whisper_model()
-        segments, info = model.transcribe(audio_path, beam_size=5, vad_filter=True)
-        text = " ".join(seg.text.strip() for seg in segments).strip()
         if not text:
-            raise gr.Error("Whisper could not detect any speech in the audio.")
-        gr.Info(f"Detected language: {info.language} ({info.language_probability:.0%} confidence)")
         return text
     except gr.Error:
         raise
@@ -240,7 +266,12 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
                     "The model will clone that voice for synthesis. Language is inferred automatically."
                 )
                 ref_audio = gr.Audio(label="Reference Audio", type="filepath")
-                transcribe_btn = gr.Button("🎤 Auto-transcribe with Whisper", variant="secondary", size="sm")
                 ref_text = gr.Textbox(
                     label="Reference Audio Transcription",
                     placeholder="Exact transcription of the reference audio, or click Auto-transcribe above...",
@@ -314,7 +345,7 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
     transcribe_btn.click(
         fn=transcribe_audio,
-        inputs=[ref_audio],
         outputs=[ref_text],
     )
@@ -325,4 +356,4 @@ with gr.Blocks(title="Fish Audio S2 Pro") as app:
     )
 if __name__ == "__main__":
-    app.launch()

     return audio[0, 0]
+ASR_MODELS = {
+    "Qwen3-ASR-1.7B — larger, more accurate": ("qwen", "Qwen/Qwen3-ASR-1.7B"),
+    "Qwen3-ASR-0.6B — smaller, faster": ("qwen", "Qwen/Qwen3-ASR-0.6B"),
+    "Whisper large-v3 (faster-whisper)": ("whisper", "large-v3"),
+}
+DEFAULT_ASR = "Qwen3-ASR-1.7B — larger, more accurate"
+asr_models = {}
+def get_asr_model(label):
+    if label not in asr_models:
+        backend, model_id = ASR_MODELS[label]
+        if backend == "qwen":
+            from qwen_asr import Qwen3ASRModel
+            asr_models[label] = Qwen3ASRModel.from_pretrained(
+                model_id,
+                dtype=torch.bfloat16,
+                device_map="cuda:0" if torch.cuda.is_available() else "cpu",
+                max_inference_batch_size=32,
+                max_new_tokens=256,
+            )
+        else:
+            from faster_whisper import WhisperModel
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            asr_models[label] = WhisperModel(model_id, device=device, compute_type="int8")
+    return asr_models[label]
 @spaces.GPU(duration=60)
+def transcribe_audio(audio_path, asr_label):
     if audio_path is None:
         raise gr.Error("Please upload a reference audio file first.")
     try:
+        gr.Info(f"Transcribing audio with {asr_label}...")
+        backend, _ = ASR_MODELS[asr_label]
+        model = get_asr_model(asr_label)
+        if backend == "qwen":
+            result = model.transcribe(audio=audio_path, language=None)[0]
+            text = (result.text or "").strip()
+            detected_language = result.language
+        else:
+            segments, info = model.transcribe(audio_path, beam_size=5, vad_filter=True)
+            text = " ".join(seg.text.strip() for seg in segments).strip()
+            detected_language = info.language
         if not text:
+            raise gr.Error("No speech could be detected in the audio.")
+        gr.Info(f"Detected language: {detected_language}")
         return text
     except gr.Error:
         raise
                     "The model will clone that voice for synthesis. Language is inferred automatically."
                 )
                 ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+                asr_model_selector = gr.Radio(
+                    choices=list(ASR_MODELS.keys()),
+                    value=DEFAULT_ASR,
+                    label="ASR Model",
+                )
+                transcribe_btn = gr.Button("🎤 Auto-transcribe", variant="secondary", size="sm")
                 ref_text = gr.Textbox(
                     label="Reference Audio Transcription",
                     placeholder="Exact transcription of the reference audio, or click Auto-transcribe above...",
     transcribe_btn.click(
         fn=transcribe_audio,
+        inputs=[ref_audio, asr_model_selector],
         outputs=[ref_text],
     )
     )
 if __name__ == "__main__":
+    app.launch(server_port=8181)

requirements.txt CHANGED Viewed

@@ -6,6 +6,7 @@ datasets
 lightning
 hydra-core
 faster-whisper
 tensorboard
 natsort
 einops

 lightning
 hydra-core
 faster-whisper
+qwen-asr
 tensorboard
 natsort
 einops