Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| import shutil | |
| import subprocess | |
| import tempfile | |
| from pathlib import Path | |
| import gradio as gr | |
| from huggingface_hub import snapshot_download | |
# Hugging Face Hub repository that is downloaded at start-up; the app expects
# it to contain `onnx_inference.py` next to the model files.
REPO_ID = "Daumee/Qwen3-ASR-0.6B-ONNX-CPU"

# Maps the mode shown in the UI to the value passed to the inference script
# via `--language`. A value of None means the flag is omitted entirely.
LANGUAGE_MAP = dict(
    English="English",
    Chinese="Chinese",
    Bilingual=None,  # auto-detect
)

# Runs at import time: fetches the repository snapshot and keeps the returned
# location, which is later used both as the script directory and as its cwd.
MODEL_DIR = snapshot_download(repo_id=REPO_ID)
def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str:
    """Convert an audio file to a mono 16 kHz WAV. No trimming, no denoising.

    Args:
        input_path: Path of the audio file to convert.
        progress: Optional Gradio progress tracker. When given, it is moved to
            15% before the conversion starts.

    Returns:
        Path of the converted file, ``normalized.wav``, inside a directory
        freshly created with ``tempfile.mkdtemp``. The caller owns that
        directory and is responsible for removing it.

    Raises:
        gr.Error: If ffmpeg is not found on ``PATH``, cannot be started, or
            exits with a non-zero status.
    """
    # Test for None explicitly instead of relying on the truth value of the
    # tracker object, so a supplied tracker is never skipped by accident.
    if progress is not None:
        progress(0.15, desc="Preparing audio...")

    if shutil.which("ffmpeg") is None:
        raise gr.Error("ffmpeg is not installed.")

    out_dir = Path(tempfile.mkdtemp())
    out_path = out_dir / "normalized.wav"

    cmd = [
        "ffmpeg",
        "-y",  # overwrite the output without prompting
        "-i", input_path,
        "-ac", "1",  # single channel (mono)
        "-ar", "16000",  # 16 kHz sample rate
        "-vn",  # ignore any video stream
        str(out_path),
    ]

    try:
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # The caller never learns about out_dir when we raise, so it has to be
        # removed here or it would be leaked on every failed conversion.
        shutil.rmtree(out_dir, ignore_errors=True)
        raise gr.Error("Failed to process the uploaded audio file.") from e

    return str(out_path)
def _parse_asr_stdout(stdout: str) -> dict | None:
    """Return the JSON object contained in *stdout*, or None if there is none."""
    text = stdout.strip()
    if not text:
        return None

    # Try the whole output first (covers pretty-printed, multi-line JSON),
    # then every line starting from the end, since log lines may precede the
    # actual result.
    for candidate in (text, *reversed(text.splitlines())):
        candidate = candidate.strip()
        if not candidate:
            continue
        try:
            value = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # A bare number or string is valid JSON too; only an object is a result.
        if isinstance(value, dict):
            return value
    return None


def run_onnx_asr(audio_path: str, mode: str, progress: gr.Progress | None = None) -> dict:
    """Run the repository's ``onnx_inference.py`` on an audio file.

    The script is started as a subprocess with ``MODEL_DIR`` as working
    directory, the audio path as positional argument and ``--json``; a
    ``--language`` flag is appended unless the selected mode maps to None.

    Args:
        audio_path: Path of the audio file handed to the script.
        mode: One of the keys of ``LANGUAGE_MAP``.
        progress: Optional Gradio progress tracker. When given, it is moved to
            45% right before the subprocess is started.

    Returns:
        The JSON object printed by the script. If no JSON object can be found
        in its standard output, ``{"text": <raw stdout>, "language": None}``.

    Raises:
        gr.Error: If *mode* is unknown, the script is missing, or the script
            exits with a non-zero status (the message is the end of its
            stderr, or stdout when stderr is empty).
    """
    import sys  # local import: only needed here to locate the interpreter

    if mode not in LANGUAGE_MAP:
        raise gr.Error("Invalid mode selected.")
    language = LANGUAGE_MAP[mode]

    # NOTE(review): this executes a script taken from the downloaded model
    # repository; the repository content is trusted as code.
    script_path = Path(MODEL_DIR) / "onnx_inference.py"
    if not script_path.exists():
        raise gr.Error("onnx_inference.py was not found in the downloaded model repo.")

    # Use the interpreter that runs this app so the script sees the same
    # installed packages; a bare "python" is resolved through PATH and may be
    # a different installation (or missing).
    cmd = [sys.executable or "python", str(script_path), audio_path, "--json"]
    if language is not None:
        cmd.extend(["--language", language])

    if progress is not None:
        progress(0.45, desc="Running transcription...")

    try:
        proc = subprocess.run(
            cmd,
            cwd=MODEL_DIR,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        stderr = (e.stderr or "").strip()
        stdout = (e.stdout or "").strip()
        detail = stderr or stdout or "Unknown ASR error."
        # Keep the tail: the actual error is reported at the end of a
        # traceback, after any warnings and stack frames.
        raise gr.Error(detail[-1500:]) from e

    raw_output = (proc.stdout or "").strip()
    parsed = _parse_asr_stdout(raw_output)
    if parsed is None:
        # No structured result: treat everything the script printed as text.
        return {
            "text": raw_output,
            "language": None,
        }
    return parsed
def make_txt_file(text: str, original_audio_path: str) -> str:
    """Write *text* to a UTF-8 ``.txt`` file named after the source audio.

    The file is placed in a new directory created with ``tempfile.mkdtemp``
    and takes the stem of *original_audio_path* as its name, falling back to
    ``transcript`` when that stem is empty.

    Args:
        text: Content to write.
        original_audio_path: Path whose stem supplies the file name.

    Returns:
        Path of the written file as a string.
    """
    target_dir = Path(tempfile.mkdtemp())

    base_name = Path(original_audio_path).stem
    if not base_name:
        base_name = "transcript"

    target = target_dir / (base_name + ".txt")
    target.write_text(text, encoding="utf-8")
    return str(target)
def transcribe(audio_file: str, mode: str, progress=gr.Progress()):
    """Transcribe an uploaded audio file and prepare the transcript download.

    Steps: normalize the audio, run the ASR script, write the transcript to a
    text file, and build a short info string. The normalized audio and its
    temporary directory are removed afterwards, whether or not a step failed.

    Args:
        audio_file: Path of the uploaded audio file.
        mode: Selected mode; must be a key of ``LANGUAGE_MAP``.
        progress: Progress tracker. The ``gr.Progress()`` default is kept
            as-is because it is how the handler asks Gradio for a tracker.

    Returns:
        A tuple ``(text, txt_file, info)``: the transcript, the path of the
        transcript file, and a string with the mode and, when reported, the
        detected language.

    Raises:
        gr.Error: If no file was provided, or if normalization or
            transcription fails.
    """
    if not audio_file:
        raise gr.Error("Please upload an audio file.")

    progress(0.05, desc="Starting...")

    normalized_path = None
    try:
        normalized_path = normalize_audio(audio_file, progress=progress)
        result = run_onnx_asr(normalized_path, mode=mode, progress=progress)

        # The script's output schema is not fixed here, so accept either key.
        text = (result.get("text") or result.get("transcript") or "").strip()
        txt_file = make_txt_file(text, audio_file)

        detected_language = result.get("language") or result.get("detected_language")
        info = f"Mode: {mode}"
        if detected_language:
            info += f"\nDetected language: {detected_language}"

        progress(1.0, desc="Done")
        return text, txt_file, info
    finally:
        if normalized_path:
            wav_path = Path(normalized_path)
            try:
                wav_path.unlink(missing_ok=True)
                # normalize_audio puts the WAV in its own mkdtemp directory;
                # removing only the file would leak one directory per request.
                # rmdir succeeds only on an empty directory, so nothing else
                # can be deleted by mistake.
                wav_path.parent.rmdir()
            except OSError:
                # Cleanup is best-effort and must not mask the real outcome.
                pass
# NOTE(review): indentation was lost in the source this was recovered from.
# The nesting below assumes only the two inputs share the row and everything
# else is stacked underneath — confirm against the deployed layout.
demo = gr.Blocks(title="Qwen3 ASR ONNX CPU")

with demo:
    # Page header and short usage description.
    gr.Markdown("# Qwen3 ASR ONNX CPU")
    gr.Markdown(
        "Upload audio, choose a mode, transcribe with Qwen3-ASR ONNX on CPU, and download the transcript."
    )

    # Inputs: the audio upload (passed to the handler as a file path) and the
    # language mode.
    with gr.Row():
        audio = gr.Audio(
            label="Upload audio file",
            type="filepath",
            sources=["upload"],
        )
        mode = gr.Dropdown(
            label="Mode",
            info="Bilingual means auto-detect.",
            choices=["English", "Chinese", "Bilingual"],
            value="Bilingual",
        )

    transcribe_btn = gr.Button("Transcribe")

    # Outputs, in the order `transcribe` returns them.
    transcript = gr.Textbox(label="Transcript", lines=14)
    download_file = gr.File(label="Download transcript")
    metadata = gr.Textbox(label="Info", lines=2, interactive=False)

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio, mode],
        outputs=[transcript, download_file, metadata],
    )

# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()