David-Chew-HL committed on
Commit
93122f4
·
verified ·
1 Parent(s): 4091bcd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -65
app.py CHANGED
@@ -1,42 +1,33 @@
 
1
  import os
 
2
  import shutil
3
  import subprocess
4
  import tempfile
5
  from pathlib import Path
6
 
7
  import gradio as gr
8
- import torch
9
- from qwen_asr import Qwen3ASRModel
10
 
11
- MODEL_NAME = "Qwen/Qwen3-ASR-0.6B"
12
 
13
- LANG_MAP = {
14
  "English": "English",
15
  "Chinese": "Chinese",
16
- "Bilingual": None, # let Qwen auto-detect
17
  }
18
 
19
- device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
20
- dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
21
-
22
- model = Qwen3ASRModel.from_pretrained(
23
- MODEL_NAME,
24
- dtype=dtype,
25
- device_map=device_map,
26
- max_inference_batch_size=1
27
- )
28
 
29
 
30
  def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str:
31
- """
32
- Convert uploaded audio to mono 16k WAV.
33
- No silence trimming. No noise reduction.
34
- """
35
  if progress:
36
  progress(0.15, desc="Preparing audio...")
37
 
38
  if shutil.which("ffmpeg") is None:
39
- raise gr.Error("ffmpeg is not installed in this environment.")
40
 
41
  out_dir = Path(tempfile.mkdtemp())
42
  out_path = out_dir / "normalized.wav"
@@ -45,12 +36,11 @@ def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str
45
  "ffmpeg",
46
  "-y",
47
  "-i", input_path,
48
- "-ac", "1", # mono
49
- "-ar", "16000", # 16 kHz
50
  "-vn",
51
  str(out_path),
52
  ]
53
-
54
  try:
55
  subprocess.run(
56
  cmd,
@@ -58,51 +48,138 @@ def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str
58
  stdout=subprocess.DEVNULL,
59
  stderr=subprocess.DEVNULL,
60
  )
61
- except subprocess.CalledProcessError:
62
- raise gr.Error("Failed to process the uploaded audio file.")
63
 
64
  return str(out_path)
65
 
66
 
67
- def make_output_txt(text: str, original_audio_path: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  out_dir = Path(tempfile.mkdtemp())
69
  stem = Path(original_audio_path).stem or "transcript"
70
- txt_path = out_dir / f"{stem}.txt"
71
- txt_path.write_text(text, encoding="utf-8")
72
- return str(txt_path)
73
 
74
 
75
- def transcribe(audio_path: str, mode: str, progress=gr.Progress()):
76
- if not audio_path:
77
  raise gr.Error("Please upload an audio file.")
78
 
79
- if mode not in LANG_MAP:
80
- raise gr.Error("Invalid mode selected.")
81
-
82
  progress(0.05, desc="Starting...")
83
- normalized_path = None
84
 
 
85
  try:
86
- normalized_path = normalize_audio(audio_path, progress=progress)
 
87
 
88
- progress(0.45, desc="Running transcription...")
89
- language = LANG_MAP[mode]
 
90
 
91
- result = model.transcribe(
92
- audio=normalized_path,
93
- language=language,
94
- )[0]
95
 
96
- text = (result.text or "").strip()
97
- txt_path = make_output_txt(text, audio_path)
98
 
99
- detected_language = getattr(result, "language", None)
100
  info = f"Mode: {mode}"
101
  if detected_language:
102
  info += f"\nDetected language: {detected_language}"
103
 
104
  progress(1.0, desc="Done")
105
- return text, txt_path, info
106
 
107
  finally:
108
  if normalized_path and os.path.exists(normalized_path):
@@ -112,10 +189,10 @@ def transcribe(audio_path: str, mode: str, progress=gr.Progress()):
112
  pass
113
 
114
 
115
- with gr.Blocks(title="Qwen3 ASR Transcriber") as demo:
116
- gr.Markdown("# Qwen3 ASR Transcriber")
117
  gr.Markdown(
118
- "Upload audio, choose a mode, transcribe it, and download the transcript as a text file."
119
  )
120
 
121
  with gr.Row():
@@ -128,31 +205,28 @@ with gr.Blocks(title="Qwen3 ASR Transcriber") as demo:
128
  choices=["English", "Chinese", "Bilingual"],
129
  value="Bilingual",
130
  label="Mode",
131
- info="Bilingual means Qwen auto-detects mixed English + Mandarin audio.",
132
  )
133
 
134
- transcribe_btn = gr.Button("Transcribe")
135
-
136
- transcript = gr.Textbox(
137
- label="Transcript",
138
- lines=14,
139
  )
140
 
141
- transcript_file = gr.File(
142
- label="Download transcript",
143
- )
144
 
145
- metadata = gr.Textbox(
146
- label="Info",
147
- lines=2,
148
- interactive=False,
149
- )
150
 
151
  transcribe_btn.click(
152
  fn=transcribe,
153
- inputs=[audio, mode],
154
- outputs=[transcript, transcript_file, metadata],
155
  )
156
 
157
  if __name__ == "__main__":
158
- demo.launch()
 
 
1
+ import json
2
  import os
3
+ import re
4
  import shutil
5
  import subprocess
6
  import tempfile
7
  from pathlib import Path
8
 
9
  import gradio as gr
10
+ from huggingface_hub import snapshot_download
 
11
 
12
+ REPO_ID = "Daumee/Qwen3-ASR-0.6B-ONNX-CPU"
13
 
14
+ LANGUAGE_MAP = {
15
  "English": "English",
16
  "Chinese": "Chinese",
17
+ "Bilingual": None, # auto-detect
18
  }
19
 
20
+ # Download the ONNX repo into the Space at startup.
21
+ MODEL_DIR = snapshot_download(repo_id=REPO_ID)
 
 
 
 
 
 
 
22
 
23
 
24
  def normalize_audio(input_path: str, progress: gr.Progress | None = None) -> str:
25
+ """Convert uploaded audio to mono 16 kHz WAV. No trimming, no denoising."""
 
 
 
26
  if progress:
27
  progress(0.15, desc="Preparing audio...")
28
 
29
  if shutil.which("ffmpeg") is None:
30
+ raise gr.Error("ffmpeg is not installed.")
31
 
32
  out_dir = Path(tempfile.mkdtemp())
33
  out_path = out_dir / "normalized.wav"
 
36
  "ffmpeg",
37
  "-y",
38
  "-i", input_path,
39
+ "-ac", "1",
40
+ "-ar", "16000",
41
  "-vn",
42
  str(out_path),
43
  ]
 
44
  try:
45
  subprocess.run(
46
  cmd,
 
48
  stdout=subprocess.DEVNULL,
49
  stderr=subprocess.DEVNULL,
50
  )
51
+ except subprocess.CalledProcessError as e:
52
+ raise gr.Error("Failed to process the uploaded audio file.") from e
53
 
54
  return str(out_path)
55
 
56
 
57
# Sentence boundaries. ASCII terminators only count when followed by
# whitespace (so "3.14" and "e.g.x" stay intact); CJK terminators (ideographic
# full stop, full-width ! and ?) count even with no whitespace after them,
# because Chinese text is normally written without spaces.
_SENTENCE_BREAK = re.compile(r"(?<=[.!?])\s+|(?<=[\u3002\uff01\uff1f])\s*")

# Clause boundaries used as a fallback when no sentence boundary is found.
# Same whitespace rule: "1,000" must not be split, full-width comma/semicolon
# may be split with no following space.
_CLAUSE_BREAK = re.compile(r"(?<=[,;])\s+|(?<=[\uff0c\uff1b])\s*")

# Pieces ending in one of these are re-joined WITHOUT a space, so no ASCII
# space is injected into running Chinese text.
_CJK_BREAKS = "\u3002\uff01\uff1f\uff0c\uff1b"


def paragraphize_text(text: str, max_chars: int = 180, max_sentences: int = 3) -> str:
    """Group a transcript into paragraphs without changing its wording.

    The text is split into sentences (or, if it contains at most one
    sentence, into comma/semicolon-delimited clauses). Consecutive pieces are
    then packed into paragraphs; a new paragraph starts when adding the next
    piece would exceed ``max_chars`` characters or the current paragraph
    already holds ``max_sentences`` pieces. A single piece longer than
    ``max_chars`` is kept whole in its own paragraph.

    Args:
        text: Transcript text. ``None`` or whitespace-only yields ``""``.
        max_chars: Soft upper bound on paragraph length, in characters.
        max_sentences: Maximum number of pieces per paragraph.

    Returns:
        The paragraphs joined by a blank line (``"\\n\\n"``).
    """
    text = (text or "").strip()
    if not text:
        return ""

    sentences = [s.strip() for s in _SENTENCE_BREAK.split(text) if s.strip()]

    # Fallback: unpunctuated (or single-sentence) text is broken at clause
    # boundaries instead, but only if that actually yields several pieces.
    if len(sentences) <= 1:
        clauses = [c.strip() for c in _CLAUSE_BREAK.split(text) if c.strip()]
        if len(clauses) > 1:
            sentences = clauses

    paragraphs = []
    current = ""  # text of the paragraph being built
    count = 0     # number of pieces already in ``current``

    for piece in sentences:
        # Re-join with a space only where the previous piece ended in ASCII
        # punctuation; after CJK punctuation the pieces are simply adjacent.
        separator = " " if current and current[-1] not in _CJK_BREAKS else ""
        candidate = current + separator + piece
        if current and (len(candidate) > max_chars or count >= max_sentences):
            paragraphs.append(current)
            current, count = piece, 1
        else:
            current, count = candidate, count + 1

    if current:
        paragraphs.append(current)

    return "\n\n".join(paragraphs)
92
+
93
+
94
def _parse_asr_stdout(stdout: str) -> dict:
    """Extract the result dict from the inference script's stdout.

    Tries, in order: the whole output as one JSON document (covers
    pretty-printed, multi-line JSON), then each line from last to first,
    returning the first one that decodes to a JSON object. If nothing decodes
    to a dict, the raw output is returned as the transcript text.
    """
    stdout = (stdout or "").strip()

    try:
        whole = json.loads(stdout)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return whole

    for line in reversed(stdout.splitlines()):
        line = line.strip()
        if not line:
            continue
        try:
            candidate = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A bare number/string/list on a log line is valid JSON but is not
        # the result payload; keep scanning earlier lines.
        if isinstance(candidate, dict):
            return candidate

    # Fallback: the script printed plain text instead of JSON.
    return {"text": stdout, "language": None}


def run_onnx_asr(audio_path: str, mode: str, progress: gr.Progress | None = None) -> dict:
    """Run the downloaded repo's ``onnx_inference.py`` on an audio file.

    The script is executed as a subprocess with ``MODEL_DIR`` as its working
    directory, and its stdout is parsed for a JSON object.

    Args:
        audio_path: Path of the audio file handed to the script.
        mode: A key of ``LANGUAGE_MAP``. When the mapped language is ``None``
            no ``--language`` argument is passed.
        progress: Optional Gradio progress callback.

    Returns:
        The JSON object printed by the script, or
        ``{"text": <raw stdout>, "language": None}`` when no JSON object is
        found in the output.

    Raises:
        gr.Error: If ``mode`` is unknown, the script is missing from the
            model directory, the process cannot be started, or it exits with
            a non-zero status (message is the first 1500 characters of its
            stderr/stdout).
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import sys

    if mode not in LANGUAGE_MAP:
        raise gr.Error("Invalid mode selected.")

    language = LANGUAGE_MAP[mode]

    script_path = Path(MODEL_DIR) / "onnx_inference.py"
    if not script_path.exists():
        raise gr.Error("onnx_inference.py was not found in the downloaded model repo.")

    # Use the interpreter running this app rather than whatever "python"
    # resolves to on PATH, so the child sees the same installed packages.
    # NOTE(review): the --json / --language flags are what this app passes;
    # confirm them against the script shipped in the model repo.
    cmd = [sys.executable or "python", str(script_path), audio_path, "--json"]
    if language is not None:
        cmd.extend(["--language", language])

    if progress:
        progress(0.45, desc="Running transcription...")

    try:
        proc = subprocess.run(
            cmd,
            cwd=MODEL_DIR,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        stderr = (e.stderr or "").strip()
        stdout = (e.stdout or "").strip()
        detail = stderr or stdout or "Unknown ASR error."
        # Cap the message length shown to the user.
        raise gr.Error(detail[:1500]) from e
    except OSError as e:
        # The interpreter itself could not be launched (missing, not
        # executable, ...); surface it as a UI error, not a raw traceback.
        raise gr.Error(f"Could not start the ASR process: {e}") from e

    return _parse_asr_stdout(proc.stdout)
146
+
147
+
148
def make_txt_file(text: str, original_audio_path: str, suffix: str) -> str:
    """Write ``text`` to a UTF-8 ``.txt`` file in a fresh temporary directory.

    The file is named ``<audio stem>_<suffix>.txt``; when the audio path has
    no stem, ``transcript`` is used in its place.

    Returns:
        The path of the written file, as a string.
    """
    folder = Path(tempfile.mkdtemp())
    base_name = Path(original_audio_path).stem
    if not base_name:
        base_name = "transcript"
    target = folder / f"{base_name}_{suffix}.txt"
    target.write_text(text, encoding="utf-8")
    return str(target)
154
 
155
 
156
+ def transcribe(audio_file: str, mode: str, paragraphing: bool, progress=gr.Progress()):
157
+ if not audio_file:
158
  raise gr.Error("Please upload an audio file.")
159
 
 
 
 
160
  progress(0.05, desc="Starting...")
 
161
 
162
+ normalized_path = None
163
  try:
164
+ normalized_path = normalize_audio(audio_file, progress=progress)
165
+ result = run_onnx_asr(normalized_path, mode=mode, progress=progress)
166
 
167
+ raw_text = (result.get("text") or result.get("transcript") or "").strip()
168
+ if not raw_text:
169
+ raw_text = ""
170
 
171
+ final_text = paragraphize_text(raw_text) if paragraphing else raw_text
 
 
 
172
 
173
+ raw_txt = make_txt_file(raw_text, audio_file, "raw")
174
+ final_txt = make_txt_file(final_text, audio_file, "paragraphs" if paragraphing else "transcript")
175
 
176
+ detected_language = result.get("language") or result.get("detected_language")
177
  info = f"Mode: {mode}"
178
  if detected_language:
179
  info += f"\nDetected language: {detected_language}"
180
 
181
  progress(1.0, desc="Done")
182
+ return raw_text, final_text, final_txt, info
183
 
184
  finally:
185
  if normalized_path and os.path.exists(normalized_path):
 
189
  pass
190
 
191
 
192
+ with gr.Blocks(title="Qwen3 ASR ONNX CPU") as demo:
193
+ gr.Markdown("# Qwen3 ASR ONNX CPU")
194
  gr.Markdown(
195
+ "Upload audio, choose a mode, transcribe with Qwen3-ASR ONNX on CPU, and download the transcript."
196
  )
197
 
198
  with gr.Row():
 
205
  choices=["English", "Chinese", "Bilingual"],
206
  value="Bilingual",
207
  label="Mode",
208
+ info="Bilingual means auto-detect.",
209
  )
210
 
211
+ paragraphing = gr.Checkbox(
212
+ value=True,
213
+ label="Auto paragraphing",
214
+ info="Preserves wording and only inserts paragraph breaks.",
 
215
  )
216
 
217
+ transcribe_btn = gr.Button("Transcribe")
 
 
218
 
219
+ raw_transcript = gr.Textbox(label="Raw transcript", lines=10)
220
+ formatted_transcript = gr.Textbox(label="Formatted transcript", lines=14)
221
+ download_file = gr.File(label="Download transcript")
222
+ metadata = gr.Textbox(label="Info", lines=2, interactive=False)
 
223
 
224
  transcribe_btn.click(
225
  fn=transcribe,
226
+ inputs=[audio, mode, paragraphing],
227
+ outputs=[raw_transcript, formatted_transcript, download_file, metadata],
228
  )
229
 
230
  if __name__ == "__main__":
231
+ demo.launch()
232
+