Spaces:

Nguyen5
/

chatbot2

Runtime error

App Files Files Community

chatbot2 / speech_io.py

Nguyen5

commit

ed084d7 2 months ago

raw

history blame contribute delete

2.67 kB

	"""
	speech_io.py – local speech-to-text (STT) and text-to-speech (TTS) via transformers pipelines
	"""

	from typing import Optional, Tuple
	import numpy as np
	import soundfile as sf
	from scipy.signal import butter, filtfilt
	from transformers import pipeline

	# Model identifier passed to the speech-recognition pipeline (see get_asr_pipeline).
	ASR_MODEL_ID = "openai/whisper-small"
	# Model identifier passed to the text-to-speech pipeline (see get_tts_pipeline).
	TTS_MODEL_ID = "facebook/mms-tts-deu"

	# Module-level cache slots for the two pipelines; they stay None until the
	# corresponding get_*_pipeline() function is called for the first time.
	_asr = None
	_tts = None


	def get_asr_pipeline():
	    """Return the shared speech-recognition pipeline, building it on first use.

	    The pipeline for ``ASR_MODEL_ID`` is created once and stored in the
	    module-level ``_asr`` variable; every later call returns that same object.
	    """
	    global _asr
	    if _asr is not None:
	        return _asr

	    print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
	    asr_options = {
	        "task": "automatic-speech-recognition",
	        "model": ASR_MODEL_ID,
	        "device": "cpu",
	        "return_timestamps": True,
	        "chunk_length_s": 30,
	    }
	    _asr = pipeline(**asr_options)
	    return _asr


	def get_tts_pipeline():
	    """Return the shared text-to-speech pipeline, building it on first use.

	    The pipeline for ``TTS_MODEL_ID`` is created once and stored in the
	    module-level ``_tts`` variable; every later call returns that same object.
	    """
	    global _tts
	    if _tts is not None:
	        return _tts

	    print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
	    tts_options = {
	        "task": "text-to-speech",
	        "model": TTS_MODEL_ID,
	    }
	    _tts = pipeline(**tts_options)
	    return _tts


	def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
	    """Remove low-frequency content with a zero-phase Butterworth high-pass.

	    The filter is designed with ``scipy.signal.butter`` and applied forwards
	    and backwards with ``scipy.signal.filtfilt`` along the last axis.

	    Args:
	        data: Samples to filter (array-like).
	        cutoff: Corner frequency, in the same unit as ``fs``.
	        fs: Sampling rate of ``data``.
	        order: Order of the Butterworth design.

	    Returns:
	        The filtered samples as a floating-point array with the shape of
	        ``data``. Inputs that are empty or too short for ``filtfilt``'s edge
	        padding are returned unfiltered (as a floating-point copy) instead
	        of raising.

	    Raises:
	        ValueError: If ``cutoff`` is not strictly between 0 and ``fs / 2``
	            (raised by ``butter``).
	    """
	    samples = np.asarray(data)
	    nyquist = 0.5 * fs
	    b, a = butter(order, cutoff / nyquist, btype="high")

	    # By default filtfilt pads each edge with 3 * max(len(a), len(b)) samples
	    # and rejects any input that is not strictly longer than that padding.
	    pad_len = 3 * max(len(a), len(b))
	    if samples.ndim == 0 or samples.shape[-1] <= pad_len:
	        return samples.astype(np.result_type(samples.dtype, np.float64))

	    return filtfilt(b, a, samples)


	def apply_fade(audio, sr, duration_ms=10):
	    """Apply a linear fade-in and fade-out to the edges of ``audio``.

	    The ramps are multiplied into ``audio`` in place and the same array
	    object is returned.

	    Args:
	        audio: Samples to fade; expected to be a 1-D floating-point NumPy
	            array, since the ramps are applied with in-place multiplication.
	        sr: Sampling rate, used to convert ``duration_ms`` into a sample count.
	        duration_ms: Length of each ramp in milliseconds.

	    Returns:
	        ``audio`` itself. It is left untouched when the ramp length rounds to
	        zero (or is negative), or when the two ramps together would cover the
	        whole signal.
	    """
	    fade_samples = int(sr * duration_ms / 1000)
	    # A zero-length ramp has to be skipped explicitly: ``audio[-0:]`` selects
	    # the whole array, which cannot be multiplied by an empty curve.
	    if fade_samples <= 0 or fade_samples * 2 >= len(audio):
	        return audio

	    audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
	    audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
	    return audio


	def transcribe_audio(audio_path: Optional[str]) -> str:
	    """Transcribe an audio file to text with the shared ASR pipeline.

	    Only the first channel of a multi-channel file is used, and at most the
	    first 30 seconds of audio are passed to the model.

	    Args:
	        audio_path: Path of the audio file to read. ``None`` or an empty
	            string means "no recording".

	    Returns:
	        The recognised text with surrounding whitespace removed, or ``""``
	        when there is no recording or the file contains no samples.
	    """
	    # Treat an empty path like a missing one instead of handing it to sf.read.
	    if not audio_path:
	        return ""

	    data, sr = sf.read(audio_path)
	    if data.ndim > 1:
	        # Multi-channel data: keep the first channel only.
	        data = data[:, 0]

	    max_samples = sr * 30
	    data = data[:max_samples]
	    if len(data) == 0:
	        # Nothing to recognise; skip loading and running the model.
	        return ""

	    asr = get_asr_pipeline()
	    print(">>> Transkribiere Audio...")
	    result = asr({"array": data, "sampling_rate": sr})
	    # Guard against a missing or None "text" entry before stripping.
	    text = (result.get("text") or "").strip()
	    print("ASR:", text)
	    return text


	def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
	    """Synthesize ``text`` with the shared TTS pipeline as 16-bit PCM audio.

	    Processing steps: run the model, reduce the result to a single channel,
	    apply a 60 Hz high-pass filter (best effort), normalise to full scale,
	    fade the edges, and convert to ``int16``.

	    Args:
	        text: Text to speak.

	    Returns:
	        ``(sampling_rate, samples)`` where ``samples`` is a 1-D ``int16``
	        array, or ``None`` when ``text`` is empty/blank or the model
	        produced no samples.
	    """
	    if not text or not text.strip():
	        return None

	    tts = get_tts_pipeline()
	    out = tts(text)
	    # np.array copies, so the in-place edits below never touch the model output.
	    audio = np.array(out["audio"], dtype=np.float32)

	    sr = out.get("sampling_rate", 16000)
	    # NOTE(review): rates above 65535 are replaced by 16000, which would change
	    # playback speed if a model really reported such a rate - confirm intent.
	    if sr is None or sr <= 0 or sr > 65535:
	        sr = 16000

	    # Drop singleton axes, e.g. (1, N) -> (N,), but never go below 1-D.
	    audio = np.atleast_1d(np.squeeze(audio))
	    while audio.ndim > 1:
	        # Still multi-channel: treat the shorter axis as the channel axis so
	        # both (channels, samples) and (samples, channels) layouts yield the
	        # first channel rather than the first sample of every channel.
	        if audio.shape[0] <= audio.shape[-1]:
	            audio = audio[0]
	        else:
	            audio = audio[..., 0]

	    if audio.size == 0:
	        # No samples to play; the peak computation below would fail on this.
	        return None

	    try:
	        audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
	    except Exception as exc:
	        # Filtering is cosmetic: keep the unfiltered audio, but report why the
	        # filter was skipped instead of hiding the failure.
	        print(f">>> Hochpassfilter übersprungen: {exc}")

	    peak = np.max(np.abs(audio))
	    if peak > 0:
	        audio = audio / peak

	    audio = apply_fade(audio, sr)
	    audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
	    return (sr, audio_int16)