Spaces:
Paused
Paused
| import torch | |
| from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
| import librosa | |
| import numpy as np | |
| from pydub import AudioSegment | |
| from config import SPEECH_MODEL, TTS_MODEL | |
# Initialize the speech-to-text models once at import time so every call
# to speech_to_text() reuses them — from_pretrained() is slow (may download
# checkpoint files on first run).
processor = Wav2Vec2Processor.from_pretrained(SPEECH_MODEL)
model = Wav2Vec2ForCTC.from_pretrained(SPEECH_MODEL)
def speech_to_text(audio_file):
    """Transcribe an audio file to text with the module-level Wav2Vec2 model.

    Parameters
    ----------
    audio_file : str or file-like
        Anything accepted by ``librosa.load`` (path, file object, ...).

    Returns
    -------
    str
        The decoded transcription.
    """
    # Resample to 16 kHz mono — the rate the Wav2Vec2 processor is told
    # to expect below.
    audio_input, _ = librosa.load(audio_file, sr=16000)
    input_values = processor(
        audio_input, return_tensors="pt", sampling_rate=16000
    ).input_values
    # Inference only: no_grad() avoids building an autograd graph,
    # saving memory and time (the original forward pass tracked gradients
    # for no reason).
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy CTC decoding: most likely token per frame, then collapse.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
# Lazily-initialized Silero TTS model. torch.hub.load() may download and
# unpack model weights, so doing it on every text_to_speech() call (as the
# original did) is very expensive — cache it after the first call.
_tts_model = None


def text_to_speech(text):
    """Synthesize ``text`` to speech and return it as a pydub AudioSegment.

    Parameters
    ----------
    text : str
        The text to synthesize.

    Returns
    -------
    pydub.AudioSegment
        Mono, 16-bit, 48 kHz audio.
    """
    global _tts_model
    if _tts_model is None:
        loaded = torch.hub.load(
            'snakers4/silero-models', 'silero_tts', model_name=TTS_MODEL
        )
        # Silero hub entrypoints typically return a (model, example_text)
        # tuple, in which case calling apply_tts on the raw return would
        # fail — unwrap defensively so both shapes work.
        _tts_model = loaded[0] if isinstance(loaded, tuple) else loaded
    audio = _tts_model.apply_tts(text=text, speaker='en_0', sample_rate=48000)
    # Convert the audio tensor to a numpy array of float samples.
    audio_np = audio.numpy()
    # Clip to [-1, 1] first so any out-of-range samples saturate instead of
    # wrapping around (integer overflow) when cast to int16 below.
    audio_np = np.clip(audio_np, -1.0, 1.0)
    # Normalize the audio to 16-bit PCM range.
    audio_np = (audio_np * 32767).astype(np.int16)
    # Create an AudioSegment directly from the raw PCM bytes.
    audio_segment = AudioSegment(
        audio_np.tobytes(),
        frame_rate=48000,   # must match the sample_rate passed to apply_tts
        sample_width=2,     # int16 -> 2 bytes per sample
        channels=1,
    )
    return audio_segment
def process_audio_chunk(chunk):
    """Transcribe a raw audio chunk (16-bit PCM bytes) to text.

    Parameters
    ----------
    chunk : bytes
        Raw little-endian int16 PCM samples; assumed mono at 16 kHz (the
        rate the Wav2Vec2 model expects) — TODO confirm with the caller.

    Returns
    -------
    str
        The decoded transcription.
    """
    audio_np = np.frombuffer(chunk, dtype=np.int16)
    # Scale int16 samples into floats in [-1, 1).
    audio_float = audio_np.astype(np.float32) / 32768.0
    # BUG FIX: the original passed this array to speech_to_text(), but that
    # function calls librosa.load(audio_file, ...) and therefore expects a
    # file path / file object — handing it an ndarray would raise. Run the
    # samples through the processor/model directly instead.
    input_values = processor(
        audio_float, return_tensors="pt", sampling_rate=16000
    ).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]