Spaces:

DeepLearning101
/

Speech-Quality-Inspection_Meta-Denoiser

Paused

App Files Files Community

DeepLearning101 committed on May 4, 2025

Commit

4baf7c2

verified ·

1 Parent(s): 6dbc581

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -6

app.py CHANGED Viewed

@@ -1,3 +1,29 @@
 import os
 import time
 import json
@@ -7,23 +33,60 @@ import torchaudio
 import numpy as np
 from denoiser.demucs import Demucs
 from pydub import AudioSegment
 modelpath = './denoiser/master64.th'
 def transcribe(file_upload, microphone):
     # Pre-commit version of the denoising entry point; the lines marked "-"
     # are the ones this commit removed.
     # Prefer the microphone recording and fall back to the uploaded file.
     file = microphone if microphone is not None else file_upload
     model = Demucs(hidden=64)
     state_dict = torch.load(modelpath, map_location='cpu')
     model.load_state_dict(state_dict)
-    demucs = model
     x, sr = torchaudio.load(file)
-    out = demucs(x[None])[0]
     # Scale down only when the peak magnitude exceeds 1; quieter output is
     # left unchanged because the divisor is floored at 1.
     out = out / max(out.abs().max().item(), 1)
-    torchaudio.save('enhanced.wav', out, sr)
-    enhanced = AudioSegment.from_wav('enhanced.wav') # Only the denoised audio needs its bitrate lowered before speech recognition
-    enhanced.export('enhanced.wav', format="wav", bitrate="256k")
-    return "enhanced.wav"
 demo = gr.Interface(
     fn=transcribe,
     inputs=[

+# import os
+# import time
+# import json
+# import gradio as gr
+# import torch
+# import torchaudio
+# import numpy as np
+# from denoiser.demucs import Demucs
+# from pydub import AudioSegment
+# modelpath = './denoiser/master64.th'
+# def transcribe(file_upload, microphone):
+#     file = microphone if microphone is not None else file_upload
+#     model = Demucs(hidden=64)
+#     state_dict = torch.load(modelpath, map_location='cpu')
+#     model.load_state_dict(state_dict)
+#     demucs = model
+#     x, sr = torchaudio.load(file)
+#     out = demucs(x[None])[0]
+#     out = out / max(out.abs().max().item(), 1)
+#     torchaudio.save('enhanced.wav', out, sr)
+#     enhanced = AudioSegment.from_wav('enhanced.wav') # 只有去完噪的需要降 bitrate 再做語音識別
+#     enhanced.export('enhanced.wav', format="wav", bitrate="256k")
+#     return "enhanced.wav"
 import os
 import time
 import json
 import numpy as np
 from denoiser.demucs import Demucs
 from pydub import AudioSegment
+import soundfile as sf
+import librosa
 modelpath = './denoiser/master64.th'
 # Holds the loaded denoiser so the checkpoint is read from disk once per
 # process instead of on every request.
 _DENOISER_CACHE = {}


 def _load_denoiser():
     """Return the Demucs denoiser in eval mode, loading its weights on first use."""
     if "model" not in _DENOISER_CACHE:
         model = Demucs(hidden=64)
         state_dict = torch.load(modelpath, map_location='cpu')
         model.load_state_dict(state_dict)
         _DENOISER_CACHE["model"] = model.eval()
     return _DENOISER_CACHE["model"]


 def _normalize_audio(path, out_path="enhanced.wav"):
     """Write the audio at *path* to *out_path* as a mono 16 kHz WAV; return *out_path*."""
     if path.lower().endswith(".mp3"):
         # MP3 input is decoded through pydub, which also resamples and downmixes.
         audio = AudioSegment.from_file(path)
         audio = audio.set_frame_rate(16000).set_channels(1)
         audio.export(out_path, format="wav")
         return out_path

     data, sr = sf.read(path)
     # Multi-channel input: average the channels down to mono.
     if data.ndim > 1:
         data = data.mean(axis=1)
     # Anything that is not already 16 kHz is resampled.
     if sr != 16000:
         data = librosa.resample(data, orig_sr=sr, target_sr=16000)
         sr = 16000
     sf.write(out_path, data, sr)
     return out_path


 def transcribe(file_upload, microphone):
     """Denoise an audio clip with Demucs and return the path of the enhanced MP3.

     Despite its name, this function performs no speech recognition in the
     code shown here; it only enhances the audio.

     Args:
         file_upload: Path of an uploaded audio file, or None.
         microphone: Path of a microphone recording, or None. Takes
             precedence over ``file_upload`` when both are given.

     Returns:
         The string ``"enhanced_final.mp3"``, the file the enhanced audio was
         exported to (relative to the working directory).

     Raises:
         ValueError: If neither input was provided.
     """
     file = microphone if microphone is not None else file_upload
     if not file:
         # Without this guard a submission with no audio crashed on
         # ``None.lower()`` with an opaque AttributeError.
         raise ValueError(
             "No audio input: upload a file or record from the microphone."
         )

     # Bring every input to the same format (mono, 16 kHz WAV) before it reaches
     # the model -- presumably what the master64 checkpoint expects; confirm.
     file = _normalize_audio(file)

     demucs = _load_denoiser()
     x, sr = torchaudio.load(file)
     x = x[0:1]  # keep only the first channel so the input is always mono
     with torch.no_grad():
         out = demucs(x[None])[0]
     # Scale down only when the peak magnitude exceeds 1 (divisor floored at 1).
     out = out / max(out.abs().max().item(), 1)
     torchaudio.save('enhanced_final.wav', out, sr)

     # Re-encode as MP3 for the front end; it is smaller than the WAV.
     enhanced = AudioSegment.from_wav('enhanced_final.wav')
     enhanced.export('enhanced_final.mp3', format="mp3", bitrate="256k")
     return "enhanced_final.mp3"
 demo = gr.Interface(
     fn=transcribe,
     inputs=[