Spaces:

Nguyen5
/

chatbot1

Sleeping

App Files Files Community

chatbot1 / speech_io.py

Nguyen5

commit

80c3670 2 months ago

raw

history blame contribute delete

4.08 kB

	"""
	speech_io.py

	Sprachbasierte Ein-/Ausgabe:
	- Speech-to-Text (STT) mit Whisper (transformers.pipeline)
	- Text-to-Speech (TTS) mit MMS-TTS Deutsch

	Dieses File ist 100% stabil für HuggingFace Spaces.
	"""

	from typing import Optional, Tuple
	import numpy as np
	import soundfile as sf
	from scipy.signal import butter, filtfilt
	from transformers import pipeline

	# Models: identifiers handed to transformers.pipeline(model=...) below
	ASR_MODEL_ID = "openai/whisper-small"
	TTS_MODEL_ID = "facebook/mms-tts-deu"

	# Module-level caches for the pipelines; they stay None until
	# get_asr_pipeline() / get_tts_pipeline() build them on first use.
	_asr = None
	_tts = None

	# ========================================================
	# STT PIPELINE
	# ========================================================

	def get_asr_pipeline():
	    """Return the shared speech-recognition pipeline, creating it on first use.

	    The pipeline is built once for ``ASR_MODEL_ID`` and stored in the
	    module-level ``_asr`` variable; later calls return that same object.
	    """
	    global _asr

	    # Already built: hand back the cached instance.
	    if _asr is not None:
	        return _asr

	    print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
	    asr_options = {
	        "task": "automatic-speech-recognition",
	        "model": ASR_MODEL_ID,
	        "device": "cpu",
	        # Marked as important by the original author.
	        "return_timestamps": True,
	        # Automatic chunking for long audio.
	        "chunk_length_s": 30,
	    }
	    _asr = pipeline(**asr_options)
	    return _asr

	# ========================================================
	# TTS PIPELINE
	# ========================================================

	def get_tts_pipeline():
	    """Return the shared text-to-speech pipeline, creating it on first use.

	    The pipeline is built once for ``TTS_MODEL_ID`` and stored in the
	    module-level ``_tts`` variable; later calls return that same object.
	    """
	    global _tts

	    # Already built: hand back the cached instance.
	    if _tts is not None:
	        return _tts

	    print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
	    _tts = pipeline(task="text-to-speech", model=TTS_MODEL_ID)
	    return _tts

	# ========================================================
	# AUDIO FILTER – Noise Reduction + Highpass
	# ========================================================

	def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
	    """Remove low-frequency content from ``data`` with a Butterworth high-pass.

	    Args:
	        data: Signal samples to filter.
	        cutoff: Cut-off frequency, in the same unit as ``fs``.
	        fs: Sampling rate of ``data``.
	        order: Order of the Butterworth design.

	    Returns:
	        The signal after forward-backward filtering with ``filtfilt``.
	    """
	    # butter() expects the cut-off as a fraction of the Nyquist frequency.
	    numerator, denominator = butter(order, cutoff / (0.5 * fs), btype="high")
	    filtered = filtfilt(numerator, denominator, data)
	    return filtered

	def apply_fade(audio, sr, duration_ms=10):
	    """Apply a linear fade-in and fade-out to the edges of ``audio``.

	    Args:
	        audio: Array of samples; axis 0 is treated as the time axis.
	        sr: Sampling rate in Hz, used to convert ``duration_ms`` to samples.
	        duration_ms: Length of each fade in milliseconds.

	    Returns:
	        A faded copy of ``audio``; the caller's array is never modified.
	        Floating-point inputs keep their dtype, other dtypes are converted
	        to float64 so the fractional gains can be applied. When the fade
	        length is zero or negative, or two fades do not fit into the
	        signal, ``audio`` itself is returned unchanged.
	    """
	    fade_samples = int(sr * duration_ms / 1000)

	    # A zero-length fade must be skipped explicitly: audio[-0:] selects the
	    # whole array and cannot be multiplied by an empty ramp.
	    if fade_samples <= 0 or fade_samples * 2 >= len(audio):
	        return audio

	    # Work on a copy so the input is not mutated behind the caller's back.
	    faded = np.array(audio, copy=True)
	    if not np.issubdtype(faded.dtype, np.inexact):
	        # In-place scaling of integer samples by float gains would raise.
	        faded = faded.astype(np.float64)

	    # Extra trailing axes let the ramps broadcast over multi-channel data.
	    ramp_shape = (-1,) + (1,) * (faded.ndim - 1)
	    fade_in_curve = np.linspace(0, 1, fade_samples).reshape(ramp_shape)
	    fade_out_curve = np.linspace(1, 0, fade_samples).reshape(ramp_shape)

	    faded[:fade_samples] *= fade_in_curve
	    faded[-fade_samples:] *= fade_out_curve
	    return faded

	# ========================================================
	# SPEECH-TO-TEXT (STT)
	# ========================================================

	def transcribe_audio(audio_path: Optional[str]) -> str:
	    """Transcribe the first 30 seconds of an audio file to text.

	    Args:
	        audio_path: Path to the audio file to read. The original note says
	            this is a WAV file coming from ``gr.Audio(type="filepath")``;
	            ``None`` or an empty string means no recording was supplied.

	    Returns:
	        The recognised text with surrounding whitespace stripped, or ``""``
	        when there is no path or the file contains no samples.
	    """
	    # Covers None as well as "", which would otherwise reach sf.read().
	    if not audio_path:
	        return ""

	    # Read the file with soundfile.
	    data, sr = sf.read(audio_path)

	    # Always mono: keep only the first channel of multi-channel input.
	    if data.ndim > 1:
	        data = data[:, 0]

	    # Nothing to recognise; do not hand an empty array to the model.
	    if len(data) == 0:
	        return ""

	    # Avoid feeding Whisper more than 30 seconds: extra samples are dropped.
	    max_samples = sr * 30
	    if len(data) > max_samples:
	        data = data[:max_samples]

	    asr = get_asr_pipeline()

	    print(">>> Transkribiere Audio...")
	    result = asr(
	        {"array": data, "sampling_rate": sr},
	    )

	    text = result.get("text", "").strip()
	    print("ASR:", text)
	    return text

	# ========================================================
	# TEXT-TO-SPEECH (TTS)
	# ========================================================

	def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
	    """Synthesize ``text`` to speech and return it as 16-bit PCM samples.

	    The TTS output is reduced to one channel, high-pass filtered at 60 Hz,
	    peak-normalised, faded at both ends and converted to int16.

	    Args:
	        text: Text to speak.

	    Returns:
	        ``(sampling_rate, samples)`` where ``samples`` is a one-dimensional
	        ``np.int16`` array, or ``None`` when ``text`` is empty or blank or
	        the model produced no samples.
	    """
	    if not text or not text.strip():
	        return None

	    tts = get_tts_pipeline()
	    out = tts(text)

	    # Raw model audio as float32 (the original note states a [-1, 1] range).
	    audio = np.array(out["audio"], dtype=np.float32)
	    sr = out.get("sampling_rate", 16000)

	    # Fall back to 16 kHz when the reported rate is missing or outside
	    # (0, 65535]. NOTE(review): the reason for the upper bound is not
	    # visible in this file - confirm.
	    if sr is None or sr <= 0 or sr > 65535:
	        sr = 16000

	    # Force mono: drop singleton axes, then keep column 0 if still 2-D.
	    if audio.ndim > 1:
	        audio = audio.squeeze()
	        if audio.ndim > 1:
	            audio = audio[:, 0]
	    # squeeze() can leave a 0-d array, which len() in apply_fade rejects.
	    audio = np.atleast_1d(audio)

	    # np.max() raises on an empty array, so stop before post-processing.
	    if audio.size == 0:
	        return None

	    # Noise reduction is best effort: if the filter fails, continue with
	    # the unfiltered audio, but report the reason instead of hiding it.
	    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
	    try:
	        audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
	    except Exception as exc:
	        print(f">>> Highpass filter skipped: {exc}")

	    # Normalise to a peak of 1.0; silent audio is left as it is.
	    max_val = np.max(np.abs(audio))
	    if max_val > 0:
	        audio = audio / max_val

	    # Short fade at both ends against audible pops.
	    audio = apply_fade(audio, sr)

	    # Scale to the int16 range, clipping anything that would overflow.
	    audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)

	    return (sr, audio_int16)