Spaces:

David-Chew-HL
/

Transcriber

Sleeping

App Files Files Community

David-Chew-HL committed on Apr 19

Commit

93122f4

verified ·

1 Parent(s): 4091bcd

Update app.py

Browse files

Files changed (1) hide show

app.py +139 -65

app.py CHANGED Viewed

@@ -1,42 +1,33 @@
 import os
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
 import gradio as gr
-import torch
-from qwen_asr import Qwen3ASRModel
-MODEL_NAME = "Qwen/Qwen3-ASR-0.6B"
-LANG_MAP = {
     "English": "English",
     "Chinese": "Chinese",
-    "Bilingual": None,  # let Qwen auto-detect
 }
-device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
-dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-model = Qwen3ASRModel.from_pretrained(
-    MODEL_NAME,
-    dtype=dtype,
-    device_map=device_map,
-    max_inference_batch_size=1
-)
 def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str:
-    """
-    Convert uploaded audio to mono 16k WAV.
-    No silence trimming. No noise reduction.
-    """
     if progress:
         progress(0.15, desc="Preparing audio...")
     if shutil.which("ffmpeg") is None:
-        raise gr.Error("ffmpeg is not installed in this environment.")
     out_dir = Path(tempfile.mkdtemp())
     out_path = out_dir / "normalized.wav"
@@ -45,12 +36,11 @@ def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str
         "ffmpeg",
         "-y",
         "-i", input_path,
-        "-ac", "1",         # mono
-        "-ar", "16000",     # 16 kHz
         "-vn",
         str(out_path),
     ]
     try:
         subprocess.run(
             cmd,
@@ -58,51 +48,138 @@ def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
         )
-    except subprocess.CalledProcessError:
-        raise gr.Error("Failed to process the uploaded audio file.")
     return str(out_path)
-def make_output_txt(text: str, original_audio_path: str) -> str:
     out_dir = Path(tempfile.mkdtemp())
     stem = Path(original_audio_path).stem or "transcript"
-    txt_path = out_dir / f"{stem}.txt"
-    txt_path.write_text(text, encoding="utf-8")
-    return str(txt_path)
-def transcribe(audio_path: str, mode: str, progress=gr.Progress()):
-    if not audio_path:
         raise gr.Error("Please upload an audio file.")
-    if mode not in LANG_MAP:
-        raise gr.Error("Invalid mode selected.")
     progress(0.05, desc="Starting...")
-    normalized_path = None
     try:
-        normalized_path = normalize_audio(audio_path, progress=progress)
-        progress(0.45, desc="Running transcription...")
-        language = LANG_MAP[mode]
-        result = model.transcribe(
-            audio=normalized_path,
-            language=language,
-        )[0]
-        text = (result.text or "").strip()
-        txt_path = make_output_txt(text, audio_path)
-        detected_language = getattr(result, "language", None)
         info = f"Mode: {mode}"
         if detected_language:
             info += f"\nDetected language: {detected_language}"
         progress(1.0, desc="Done")
-        return text, txt_path, info
     finally:
         if normalized_path and os.path.exists(normalized_path):
@@ -112,10 +189,10 @@ def transcribe(audio_path: str, mode: str, progress=gr.Progress()):
                 pass
-with gr.Blocks(title="Qwen3 ASR Transcriber") as demo:
-    gr.Markdown("# Qwen3 ASR Transcriber")
     gr.Markdown(
-        "Upload audio, choose a mode, transcribe it, and download the transcript as a text file."
     )
     with gr.Row():
@@ -128,31 +205,28 @@ with gr.Blocks(title="Qwen3 ASR Transcriber") as demo:
             choices=["English", "Chinese", "Bilingual"],
             value="Bilingual",
             label="Mode",
-            info="Bilingual means Qwen auto-detects mixed English + Mandarin audio.",
         )
-    transcribe_btn = gr.Button("Transcribe")
-    transcript = gr.Textbox(
-        label="Transcript",
-        lines=14,
     )
-    transcript_file = gr.File(
-        label="Download transcript",
-    )
-    metadata = gr.Textbox(
-        label="Info",
-        lines=2,
-        interactive=False,
-    )
     transcribe_btn.click(
         fn=transcribe,
-        inputs=[audio, mode],
-        outputs=[transcript, transcript_file, metadata],
     )
 if __name__ == "__main__":
-    demo.launch()

+import json
 import os
+import re
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
 import gradio as gr
+from huggingface_hub import snapshot_download
+REPO_ID = "Daumee/Qwen3-ASR-0.6B-ONNX-CPU"
+LANGUAGE_MAP = {
     "English": "English",
     "Chinese": "Chinese",
+    "Bilingual": None,  # auto-detect
 }
+# Download the ONNX repo into the Space at startup.
+MODEL_DIR = snapshot_download(repo_id=REPO_ID)
 def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str:
+    """Convert uploaded audio to mono 16 kHz WAV. No trimming, no denoising."""
     if progress:
         progress(0.15, desc="Preparing audio...")
     if shutil.which("ffmpeg") is None:
+        raise gr.Error("ffmpeg is not installed.")
     out_dir = Path(tempfile.mkdtemp())
     out_path = out_dir / "normalized.wav"
         "ffmpeg",
         "-y",
         "-i", input_path,
+        "-ac", "1",
+        "-ar", "16000",
         "-vn",
         str(out_path),
     ]
     try:
         subprocess.run(
             cmd,
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
         )
+    except subprocess.CalledProcessError as e:
+        raise gr.Error("Failed to process the uploaded audio file.") from e
     return str(out_path)
def paragraphize_text(text: str, max_chars: int = 180, max_sentences: int = 3) -> str:
    """Group a transcript into paragraphs without changing its wording.

    The text is split into sentences, which are then packed greedily into
    paragraphs separated by a blank line.

    Args:
        text: Transcript to format. ``None`` or whitespace-only input gives "".
        max_chars: Soft limit on paragraph length. A single sentence longer
            than this is kept whole rather than cut mid-sentence.
        max_sentences: Maximum number of sentences per paragraph.

    Returns:
        The paragraphs joined by blank lines.
    """
    text = (text or "").strip()
    if not text:
        return ""

    # Pieces ending in one of these are joined without a space, so CJK text
    # does not gain whitespace it never had.
    fullwidth_punct = "。！？，；"

    # ASCII terminators only end a sentence when followed by whitespace (so
    # "3.14" stays intact). Full-width terminators are normally NOT followed
    # by whitespace, so split right after them -- unless more terminators or a
    # closing quote/bracket follow, which belong to the same sentence.
    sentence_break = (
        r"(?<=[.!?])\s+"
        r"|(?<=[。！？])(?![。！？”’」』）】》])\s*"
    )
    sentences = [s.strip() for s in re.split(sentence_break, text) if s.strip()]

    # Fallback: if no sentence punctuation exists, split by commas / semicolons.
    if len(sentences) <= 1:
        chunks = [c.strip() for c in re.split(r"(?<=[,，;；])\s*", text) if c.strip()]
        if len(chunks) > 1:
            sentences = chunks

    paragraphs = []
    current = ""
    count = 0
    for sentence in sentences:
        glue = "" if not current or current[-1] in fullwidth_punct else " "
        candidate = current + glue + sentence
        if current and (len(candidate) > max_chars or count >= max_sentences):
            paragraphs.append(current)
            current, count = sentence, 1
        else:
            current, count = candidate, count + 1
    if current:
        paragraphs.append(current)

    return "\n\n".join(paragraphs)
def _parse_asr_stdout(stdout: str) -> dict | None:
    """Return the JSON object found in the ASR script's stdout, or None."""
    stdout = (stdout or "").strip()
    if not stdout:
        return None
    # Try the whole output first (covers pretty-printed, multi-line JSON),
    # then individual lines from last to first (covers log lines before it).
    for candidate in (stdout, *reversed(stdout.splitlines())):
        candidate = candidate.strip()
        if not candidate:
            continue
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Bare numbers, strings, lists etc. are valid JSON but not a result
        # object; keep looking instead of stopping on them.
        if isinstance(parsed, dict):
            return parsed
    return None


def run_onnx_asr(audio_path: str, mode: str, progress: gr.Progress | None = None) -> dict:
    """Run the model repo's ``onnx_inference.py`` on an audio file.

    The script is executed in a subprocess with ``MODEL_DIR`` as the working
    directory, passing ``--json`` and, when the selected mode maps to a
    language, ``--language <name>``.

    Args:
        audio_path: Path of the audio file handed to the script.
        mode: A key of ``LANGUAGE_MAP``.
        progress: Optional Gradio progress callback.

    Returns:
        The JSON object printed by the script. If no JSON object can be found
        in its stdout, ``{"text": <raw stdout>, "language": None}``.

    Raises:
        gr.Error: If ``mode`` is unknown, the script is missing from the
            downloaded repo, or the script exits with a non-zero status.
    """
    # Function-scope import keeps this block self-contained.
    import sys

    if mode not in LANGUAGE_MAP:
        raise gr.Error("Invalid mode selected.")
    language = LANGUAGE_MAP[mode]

    script_path = Path(MODEL_DIR) / "onnx_inference.py"
    if not script_path.exists():
        raise gr.Error("onnx_inference.py was not found in the downloaded model repo.")

    # Use the interpreter running this app so the child process sees the same
    # installed packages; a bare "python" may resolve to a different one.
    cmd = [sys.executable or "python", str(script_path), audio_path, "--json"]
    if language is not None:
        cmd.extend(["--language", language])

    if progress:
        progress(0.45, desc="Running transcription...")

    try:
        proc = subprocess.run(
            cmd,
            cwd=MODEL_DIR,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        stderr = (e.stderr or "").strip()
        stdout = (e.stdout or "").strip()
        detail = stderr or stdout or "Unknown ASR error."
        # Keep the tail: the exception message is at the end of a traceback.
        raise gr.Error(detail[-1500:]) from e

    parsed = _parse_asr_stdout(proc.stdout)
    if parsed is None:
        # Fallback: return raw text if the script prints plain text instead.
        return {
            "text": (proc.stdout or "").strip(),
            "language": None,
        }
    return parsed
def make_txt_file(text: str, original_audio_path: str, suffix: str) -> str:
    """Write *text* as UTF-8 into a fresh temporary directory.

    The file is named ``<audio stem>_<suffix>.txt``; when the audio path has
    no stem, ``transcript`` is used instead. Returns the file path as a string.
    """
    tmp_dir = tempfile.mkdtemp()
    base_name = Path(original_audio_path).stem
    if not base_name:
        base_name = "transcript"
    target = Path(tmp_dir) / f"{base_name}_{suffix}.txt"
    target.write_text(text, encoding="utf-8")
    return str(target)
+def transcribe(audio_file: str, mode: str, paragraphing: bool, progress=gr.Progress()):
+    if not audio_file:
         raise gr.Error("Please upload an audio file.")
     progress(0.05, desc="Starting...")
+    normalized_path = None
     try:
+        normalized_path = normalize_audio(audio_file, progress=progress)
+        result = run_onnx_asr(normalized_path, mode=mode, progress=progress)
+        raw_text = (result.get("text") or result.get("transcript") or "").strip()
+        if not raw_text:
+            raw_text = ""
+        final_text = paragraphize_text(raw_text) if paragraphing else raw_text
+        raw_txt = make_txt_file(raw_text, audio_file, "raw")
+        final_txt = make_txt_file(final_text, audio_file, "paragraphs" if paragraphing else "transcript")
+        detected_language = result.get("language") or result.get("detected_language")
         info = f"Mode: {mode}"
         if detected_language:
             info += f"\nDetected language: {detected_language}"
         progress(1.0, desc="Done")
+        return raw_text, final_text, final_txt, info
     finally:
         if normalized_path and os.path.exists(normalized_path):
                 pass
+with gr.Blocks(title="Qwen3 ASR ONNX CPU") as demo:
+    gr.Markdown("# Qwen3 ASR ONNX CPU")
     gr.Markdown(
+        "Upload audio, choose a mode, transcribe with Qwen3-ASR ONNX on CPU, and download the transcript."
     )
     with gr.Row():
             choices=["English", "Chinese", "Bilingual"],
             value="Bilingual",
             label="Mode",
+            info="Bilingual means auto-detect.",
         )
+    paragraphing = gr.Checkbox(
+        value=True,
+        label="Auto paragraphing",
+        info="Preserves wording and only inserts paragraph breaks.",
     )
+    transcribe_btn = gr.Button("Transcribe")
+    raw_transcript = gr.Textbox(label="Raw transcript", lines=10)
+    formatted_transcript = gr.Textbox(label="Formatted transcript", lines=14)
+    download_file = gr.File(label="Download transcript")
+    metadata = gr.Textbox(label="Info", lines=2, interactive=False)
     transcribe_btn.click(
         fn=transcribe,
+        inputs=[audio, mode, paragraphing],
+        outputs=[raw_transcript, formatted_transcript, download_file, metadata],
     )
 if __name__ == "__main__":
+    demo.launch()