Spaces:

elith
/

PHASR

Runtime error

oriki101 committed on May 10, 2024

Commit

02f5074

1 Parent(s): bd7b9b6

modify app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -13,7 +13,6 @@ TRAIN_CONFIG = base_dir / "config.yaml"
 NORM_CONFIG = base_dir / "feats_stats.npz"
 DEVICE = "cpu"
 RESAMPLING_RATE = 16000
-THRESHOLD = 5000000
 # モデル
 speech2text = Speech2Text(
@@ -33,6 +32,9 @@ def resample(audio: np.ndarray, original_sr: int) -> tuple[np.ndarray, int]:
     Returns:
         tuple[np.ndarray, int]: リサンプリングされた音声信号と目標のサンプルレート
     """
     # audioのサンプリングレートをoriginal_srから16kに調整する
     resampled_audio = librosa.resample(
         audio, orig_sr=original_sr, target_sr=RESAMPLING_RATE
@@ -57,9 +59,7 @@ def transcribe(input: tuple[int, np.ndarray]) -> str:
     sr = input[0]
     audio = input[1]
-    # リサンプリング(短すぎると、リサンプリングできないため、あまりに短いファイルはリサンプリングしない)
-    if len(audio) > THRESHOLD:
-        audio, _ = resample(audio, sr)
     # 認識
     nbests = speech2text(audio)
     text, *_ = nbests[0]

 NORM_CONFIG = base_dir / "feats_stats.npz"
 DEVICE = "cpu"
 RESAMPLING_RATE = 16000
 # モデル
 speech2text = Speech2Text(
     Returns:
         tuple[np.ndarray, int]: リサンプリングされた音声信号と目標のサンプルレート
     """
+    # int16なのでfloatに変換
+    if audio.dtype in [np.int16, np.int32]:
+        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
     # audioのサンプリングレートをoriginal_srから16kに調整する
     resampled_audio = librosa.resample(
         audio, orig_sr=original_sr, target_sr=RESAMPLING_RATE
     sr = input[0]
     audio = input[1]
+    audio, _ = resample(audio, sr)
     # 認識
     nbests = speech2text(audio)
     text, *_ = nbests[0]