Nguyen5 committed on
Commit
b2fa85d
·
1 Parent(s): e6bb64c
Files changed (1) hide show
  1. speech_io.py +58 -113
speech_io.py CHANGED
@@ -1,157 +1,102 @@
1
- """
2
- speech_io.py
3
-
4
- Sprachbasierte Ein-/Ausgabe:
5
- - Speech-to-Text (STT) mit Whisper (transformers.pipeline)
6
- - Text-to-Speech (TTS) mit MMS-TTS Deutsch
7
-
8
- Dieses File ist 100% stabil für HuggingFace Spaces.
9
- """
10
-
11
- from typing import Optional, Tuple
12
  import numpy as np
13
  import soundfile as sf
14
- from scipy.signal import butter, filtfilt
15
  from transformers import pipeline
16
 
17
- # Modelle
18
- ASR_MODEL_ID = "openai/whisper-small"
19
- TTS_MODEL_ID = "facebook/mms-tts-deu"
20
 
21
  _asr = None
22
  _tts = None
23
 
24
# ========================================================
# STT PIPELINE
# ========================================================

def get_asr_pipeline():
    """Return the module-wide speech-recognition pipeline.

    The pipeline is built on the first call, stored in the module-level
    ``_asr`` variable, and returned as-is on every later call.
    """
    global _asr
    if _asr is not None:
        return _asr

    print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
    asr_options = {
        "task": "automatic-speech-recognition",
        "model": ASR_MODEL_ID,
        "device": "cpu",
        # Marked as important by the original author.
        "return_timestamps": True,
        # Automatic chunking for long audio.
        "chunk_length_s": 30,
    }
    _asr = pipeline(**asr_options)
    return _asr
40
 
41
# ========================================================
# TTS PIPELINE
# ========================================================

def get_tts_pipeline():
    """Return the module-wide text-to-speech pipeline.

    The pipeline is built on the first call, stored in the module-level
    ``_tts`` variable, and returned as-is on every later call.
    """
    global _tts
    if _tts is not None:
        return _tts

    print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
    tts_options = {
        "task": "text-to-speech",
        "model": TTS_MODEL_ID,
    }
    _tts = pipeline(**tts_options)
    return _tts
54
-
55
# ========================================================
# AUDIO FILTER – noise reduction + high-pass
# ========================================================

def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
    """Apply a zero-phase Butterworth high-pass filter to ``data``.

    Args:
        data: Array-like of audio samples; filtering runs along the last axis.
        cutoff: Cut-off frequency, in the same unit as ``fs``.
        fs: Sampling rate of ``data``.
        order: Order of the Butterworth design.

    Returns:
        The filtered samples as a NumPy array. Input too short for
        ``filtfilt`` to pad is returned unfiltered instead of raising.

    Raises:
        ValueError: If ``cutoff`` is not strictly between 0 and ``fs / 2``
            (raised by ``scipy.signal.butter``).
    """
    samples = np.asarray(data)
    nyquist = 0.5 * fs
    b, a = butter(order, cutoff / nyquist, btype="high")

    # filtfilt pads both ends with 3 * max(len(a), len(b)) samples by default
    # and raises ValueError unless the input is strictly longer than that.
    pad_len = 3 * max(len(a), len(b))
    if samples.ndim == 0 or samples.shape[-1] <= pad_len:
        return samples

    return filtfilt(b, a, samples)
64
-
65
def apply_fade(audio, sr, duration_ms=10):
    """Return ``audio`` with a linear fade-in and fade-out applied.

    Args:
        audio: 1-D array-like of samples.
        sr: Sampling rate, used to convert ``duration_ms`` into samples.
        duration_ms: Length of each ramp in milliseconds.

    Returns:
        A new array with both ends faded; the input is never modified.
        When the ramp length is zero or negative, or the clip is too short
        to hold both ramps, ``audio`` itself is returned unchanged.
        Non-float input is converted to float64 so it can be scaled.
    """
    fade_samples = int(sr * duration_ms / 1000)

    # A zero-length ramp must be skipped explicitly: audio[-0:] is the whole
    # array, so multiplying it by an empty ramp raises a broadcast error.
    if fade_samples <= 0 or fade_samples * 2 >= len(audio):
        return audio

    # Work on a copy so the caller's buffer is not altered.
    faded = np.array(audio, copy=True)
    if not np.issubdtype(faded.dtype, np.floating):
        # In-place multiplication by a float ramp is not allowed on ints.
        faded = faded.astype(np.float64)

    faded[:fade_samples] *= np.linspace(0, 1, fade_samples)
    faded[-fade_samples:] *= np.linspace(1, 0, fade_samples)
    return faded
78
-
79
# ========================================================
# SPEECH-TO-TEXT (STT)
# ========================================================

def transcribe_audio(audio_path: str) -> str:
    """Transcribe the audio file at ``audio_path`` and return the text.

    Args:
        audio_path: Path to an audio file readable by ``soundfile``
            (per the original docstring, a WAV path coming from
            ``gr.Audio(type="filepath")``). ``None`` or an empty string
            means no recording was provided.

    Returns:
        The stripped transcription, or ``""`` when there is no input or the
        file contains no samples.
    """
    # Treat both None and "" as "no recording" instead of handing an empty
    # path to soundfile.
    if not audio_path:
        return ""

    data, sr = sf.read(audio_path)

    # Always mono: keep only the first channel.
    if data.ndim > 1:
        data = data[:, 0]

    # Nothing to transcribe in a zero-length recording.
    if len(data) == 0:
        return ""

    # Keep at most the first 30 seconds (the original author's choice to
    # avoid feeding Whisper more than 30 s).
    max_samples = sr * 30
    if len(data) > max_samples:
        data = data[:max_samples]

    asr = get_asr_pipeline()

    print(">>> Transkribiere Audio...")
    result = asr({"array": data, "sampling_rate": sr})

    text = result.get("text", "").strip()
    print("ASR:", text)
    return text
113
 
114
# ========================================================
# TEXT-TO-SPEECH (TTS)
# ========================================================

def synthesize_speech(text: str):
    """Synthesize ``text`` and return it as 16-bit PCM audio.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text
            produces no audio.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a 1-D ``np.int16``
        array, or ``None`` when there is no text or the model returned no
        samples.
    """
    if not text or not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    # Raw model output (the original author notes float32 in [-1, 1]).
    audio = np.array(out["audio"], dtype=np.float32)
    sr = out.get("sampling_rate", 16000)

    # Fall back to 16 kHz when the reported rate is missing or implausible.
    if sr is None or sr <= 0 or sr > 65535:
        sr = 16000

    # Force mono: drop singleton axes, then keep the first column.
    if audio.ndim > 1:
        audio = audio.squeeze()
        if audio.ndim > 1:
            audio = audio[:, 0]
    # squeeze() can collapse a single-sample output to 0-d.
    audio = np.atleast_1d(audio)

    # np.max below raises on an empty array; there is nothing to play anyway.
    if audio.size == 0:
        return None

    # Noise reduction is best-effort: on failure keep the unfiltered audio.
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    try:
        audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
    except Exception as exc:
        print(f">>> High-pass filter skipped: {exc}")

    # Peak-normalize; silence is left untouched to avoid dividing by zero.
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # Short fades at both ends to avoid an audible pop.
    audio = apply_fade(audio, sr)

    # Convert to int16 PCM, clipping to the representable range.
    audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)

    return (sr, audio_int16)
 
 
 
 
 
 
 
 
 
 
 
 
1
  import numpy as np
2
  import soundfile as sf
3
+ import librosa
4
  from transformers import pipeline
5
 
6
+ ASR_MODEL_ID = "openai/whisper-small" # multilingual
7
+ TTS_MODEL_ID = "facebook/mms-tts-deu" # can be swapped out if multilingual output is wanted
 
8
 
9
  _asr = None
10
  _tts = None
11
 
 
 
 
12
 
13
# ============================================
# LOAD AUDIO – normalize to 16 kHz mono
# ============================================
def load_audio_16k(path, target_sr=16000):
    """Load an audio file as mono float32 samples at ``target_sr``.

    Args:
        path: File to read with ``soundfile``.
        target_sr: Sampling rate of the returned audio. Defaults to 16000,
            which keeps the behavior the function name promises.

    Returns:
        ``(samples, target_sr)`` where ``samples`` is a 1-D ``np.float32``
        array.
    """
    audio, sr = sf.read(path)

    # Stereo -> mono by averaging the channels.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample only when needed. An empty file is skipped because there is
    # nothing to resample.
    if sr != target_sr and audio.size:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

    return audio.astype(np.float32), target_sr
29
+
30
+
31
# ============================================
# LOAD WHISPER PIPELINE (multilingual)
# ============================================
def get_asr_pipeline():
    """Return the module-wide speech-recognition pipeline.

    The pipeline is built on the first call, stored in the module-level
    ``_asr`` variable, and returned as-is on every later call.
    """
    global _asr
    if _asr is not None:
        return _asr

    asr_options = {
        "task": "automatic-speech-recognition",
        "model": ASR_MODEL_ID,
        "return_timestamps": False,
        "chunk_length_s": 30,
    }
    _asr = pipeline(**asr_options)
    return _asr
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
# ============================================
# MULTILINGUAL STT
# ============================================
def transcribe_audio(audio_path: str) -> str:
    """Transcribe the audio file at ``audio_path`` and return the text.

    Args:
        audio_path: Path to an audio file. ``None`` or an empty string means
            no recording was provided.

    Returns:
        The stripped transcription, or ``""`` when there is no input, the
        clip is shorter than 0.4 s, or the output is only the known junk
        character.
    """
    # Clips shorter than this are rejected; the original author notes that
    # Whisper produces garbage characters for very short audio.
    min_duration_s = 0.4

    # Treat both None and "" as "no recording" instead of handing an empty
    # path to the audio loader.
    if not audio_path:
        return ""

    audio, sr = load_audio_16k(audio_path)

    if len(audio) < sr * min_duration_s:
        return ""

    asr = get_asr_pipeline()

    # No language is set, so Whisper detects it on its own.
    result = asr(
        {"array": audio, "sampling_rate": sr},
        generate_kwargs={
            # Transcribe rather than translate: keep the spoken language.
            "task": "transcribe",
            # Intended to reduce hallucinations such as "ვვვ...".
            "temperature": 0.0,
        },
    )

    text = result.get("text", "").strip()

    # Edge case: output made up only of the junk character (or nothing at
    # all) is discarded.
    if set(text) <= {"ვ", " "}:
        return ""

    return text
77
 
78
+
79
# ============================================
# TEXT -> SPEECH (not multilingual yet)
# ============================================
def get_tts_pipeline():
    """Return the module-wide text-to-speech pipeline.

    The pipeline is built on the first call, stored in the module-level
    ``_tts`` variable, and returned as-is on every later call.
    """
    global _tts
    if _tts is not None:
        return _tts

    tts_options = {"task": "text-to-speech", "model": TTS_MODEL_ID}
    _tts = pipeline(**tts_options)
    return _tts
87
+
88
 
89
def synthesize_speech(text: str):
    """Synthesize ``text`` and return it as 16-bit PCM audio.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text
            produces no audio.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a 1-D ``np.int16``
        array, or ``None`` when there is no text or the model returned no
        samples.
    """
    # `not text` first, so that None does not crash on .strip().
    if not text or not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    audio = np.array(out["audio"], dtype=np.float32)
    sr = out.get("sampling_rate", 16000)

    # Flatten to mono: drop singleton axes, then keep the first column.
    # NOTE(review): assumes any remaining 2-D output is (samples, channels);
    # confirm against the TTS pipeline's actual output shape.
    if audio.ndim > 1:
        audio = audio.squeeze()
        if audio.ndim > 1:
            audio = audio[:, 0]
    # squeeze() can collapse a single-sample output to 0-d.
    audio = np.atleast_1d(audio)

    # np.max below raises on an empty array; there is nothing to play anyway.
    if audio.size == 0:
        return None

    # Peak-normalize; `or 1.0` leaves pure silence undivided.
    max_val = np.max(np.abs(audio)) or 1.0
    audio = audio / max_val

    return sr, (audio * 32767).astype(np.int16)