"""Gradio demo: transcribe uploaded audio with the Qwen3-ASR ONNX CPU model."""

import json
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

import gradio as gr
from huggingface_hub import snapshot_download

REPO_ID = "Daumee/Qwen3-ASR-0.6B-ONNX-CPU"

# Maps the UI "Mode" choice to the value passed via --language.
# None means the flag is omitted entirely (auto-detect).
LANGUAGE_MAP = {
    "English": "English",
    "Chinese": "Chinese",
    "Bilingual": None,  # auto-detect
}

# Resolved once at import time; every request reuses this directory.
MODEL_DIR = snapshot_download(repo_id=REPO_ID)


def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str:
    """Convert uploaded audio to mono 16 kHz WAV. No trimming, no denoising.

    Args:
        input_path: Path of the uploaded audio file.
        progress: Optional Gradio progress tracker to update.

    Returns:
        Path of ``normalized.wav`` inside a freshly created temporary
        directory. The caller owns that directory and must delete it.

    Raises:
        gr.Error: If ffmpeg is not on PATH or exits with a non-zero status.
    """
    if progress:
        progress(0.15, desc="Preparing audio...")

    if shutil.which("ffmpeg") is None:
        raise gr.Error("ffmpeg is not installed.")

    out_dir = Path(tempfile.mkdtemp())
    out_path = out_dir / "normalized.wav"

    cmd = [
        "ffmpeg",
        "-y",
        "-i", input_path,
        "-ac", "1",
        "-ar", "16000",
        "-vn",
        str(out_path),
    ]

    try:
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError as e:
        # No path is returned on failure, so the caller cannot clean up;
        # remove the directory here instead of leaking it.
        shutil.rmtree(out_dir, ignore_errors=True)
        raise gr.Error("Failed to process the uploaded audio file.") from e

    return str(out_path)


def _last_json_object(stdout: str) -> dict | None:
    """Return the last line of *stdout* that parses as a JSON object, if any."""
    for line in reversed(stdout.splitlines()):
        line = line.strip()
        if not line:
            continue
        try:
            candidate = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A bare scalar or list is valid JSON but not a result payload;
        # keep scanning rather than giving up on the first parseable line.
        if isinstance(candidate, dict):
            return candidate
    return None


def run_onnx_asr(audio_path: str, mode: str, progress: gr.Progress | None = None) -> dict:
    """Run the repo's ``onnx_inference.py`` on *audio_path* and return its result.

    Args:
        audio_path: Path of the audio file handed to the inference script.
        mode: A key of ``LANGUAGE_MAP``.
        progress: Optional Gradio progress tracker to update.

    Returns:
        The last JSON object printed by the script. If no line of its stdout
        is a JSON object, ``{"text": <stripped stdout>, "language": None}``.

    Raises:
        gr.Error: If *mode* is unknown, the script is missing from the model
            directory, or the script exits with a non-zero status.
    """
    if mode not in LANGUAGE_MAP:
        raise gr.Error("Invalid mode selected.")

    language = LANGUAGE_MAP[mode]

    script_path = Path(MODEL_DIR) / "onnx_inference.py"
    if not script_path.exists():
        raise gr.Error("onnx_inference.py was not found in the downloaded model repo.")

    # Use the interpreter running this app so the script sees the same
    # installed packages; a bare "python" may be absent or a different one.
    cmd = [sys.executable or "python", str(script_path), audio_path, "--json"]
    if language is not None:
        cmd.extend(["--language", language])

    if progress:
        progress(0.45, desc="Running transcription...")

    try:
        proc = subprocess.run(
            cmd,
            cwd=MODEL_DIR,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        stderr = (e.stderr or "").strip()
        stdout = (e.stdout or "").strip()
        detail = stderr or stdout or "Unknown ASR error."
        # Cap the message length shown in the UI.
        raise gr.Error(detail[:1500]) from e

    stdout = (proc.stdout or "").strip()
    parsed = _last_json_object(stdout)
    if parsed is None:
        # No JSON payload found: treat the whole output as the transcript.
        return {
            "text": stdout,
            "language": None,
        }
    return parsed


def make_txt_file(text: str, original_audio_path: str) -> str:
    """Write *text* to a UTF-8 ``.txt`` file named after the uploaded audio.

    The file lives in its own new temporary directory and is not deleted
    here, because its path is returned for the download component.

    Args:
        text: Transcript to write.
        original_audio_path: Path whose stem becomes the file name; falls
            back to ``transcript`` when the stem is empty.

    Returns:
        Path of the written text file.
    """
    out_dir = Path(tempfile.mkdtemp())
    stem = Path(original_audio_path).stem or "transcript"
    out_path = out_dir / f"{stem}.txt"
    out_path.write_text(text, encoding="utf-8")
    return str(out_path)


# NOTE: the gr.Progress() default is intentional; a progress tracker is
# declared to Gradio through a default argument of this type.
def transcribe(audio_file: str, mode: str, progress=gr.Progress()):
    """Click handler: normalize the upload, transcribe it, build the outputs.

    Args:
        audio_file: Path of the uploaded audio file (falsy if none was given).
        mode: Selected entry of the "Mode" dropdown.
        progress: Gradio progress tracker.

    Returns:
        Tuple ``(transcript text, transcript file path, info string)``.

    Raises:
        gr.Error: If no file was uploaded, or normalization/inference fails.
    """
    if not audio_file:
        raise gr.Error("Please upload an audio file.")

    progress(0.05, desc="Starting...")

    normalized_path = None
    try:
        normalized_path = normalize_audio(audio_file, progress=progress)
        result = run_onnx_asr(normalized_path, mode=mode, progress=progress)

        # Accept either key name for the transcript and for the language.
        text = (result.get("text") or result.get("transcript") or "").strip()
        txt_file = make_txt_file(text, audio_file)

        detected_language = result.get("language") or result.get("detected_language")
        info = f"Mode: {mode}"
        if detected_language:
            info += f"\nDetected language: {detected_language}"

        progress(1.0, desc="Done")
        return text, txt_file, info
    finally:
        if normalized_path:
            # normalize_audio puts the WAV in a directory created just for
            # it, so remove the whole directory, not only the file.
            # Best-effort: cleanup problems must not mask the real outcome.
            shutil.rmtree(os.path.dirname(normalized_path), ignore_errors=True)


with gr.Blocks(title="Qwen3 ASR ONNX CPU") as demo:
    gr.Markdown("# Qwen3 ASR ONNX CPU")
    gr.Markdown(
        "Upload audio, choose a mode, transcribe with Qwen3-ASR ONNX on CPU, and download the transcript."
    )

    with gr.Row():
        audio = gr.Audio(
            sources=["upload"],
            type="filepath",
            label="Upload audio file",
        )
        mode = gr.Dropdown(
            choices=["English", "Chinese", "Bilingual"],
            value="Bilingual",
            label="Mode",
            info="Bilingual means auto-detect.",
        )

    transcribe_btn = gr.Button("Transcribe")

    transcript = gr.Textbox(label="Transcript", lines=14)
    download_file = gr.File(label="Download transcript")
    metadata = gr.Textbox(label="Info", lines=2, interactive=False)

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio, mode],
        outputs=[transcript, download_file, metadata],
    )

if __name__ == "__main__":
    demo.launch()