MeetingUnderstandingDemo / translation.py
Vlad Bastina
first commit
599e594
from google.cloud import speech
import wave
from pydub import AudioSegment
def get_audio_properties(file_path):
    """Read the basic stream parameters from a WAV file header.

    Args:
        file_path: Location of the WAV file to inspect.

    Returns:
        A ``(sample_rate, channels)`` tuple: the frame rate reported by the
        header and the number of audio channels.
    """
    with wave.open(file_path, "rb") as reader:
        # Both values come straight from the header; no frames are decoded.
        return reader.getframerate(), reader.getnchannels()
def convert_to_mono(input_path, output_path):
    """Down-mix a WAV file to a single audio channel.

    The source file is loaded with pydub, reduced to one channel and written
    back out as WAV, so the recognizer later sees a single stream.

    Args:
        input_path: WAV file to read.
        output_path: Destination path for the single-channel WAV file.
    """
    AudioSegment.from_wav(input_path).set_channels(1).export(
        output_path, format="wav"
    )
def split_audio(file_path, chunk_length_ms=60000):  # 60 sec chunks
    """Cut a WAV file into consecutive fixed-length pieces.

    Each piece is exported as ``chunk_<index>.wav`` using a relative path,
    i.e. into the current working directory. The last piece may be shorter
    than ``chunk_length_ms``.

    Args:
        file_path: WAV file to split.
        chunk_length_ms: Length of each piece in milliseconds.

    Returns:
        The list of exported file names, in playback order.
    """
    recording = AudioSegment.from_wav(file_path)
    written = []
    piece_starts = range(0, len(recording), chunk_length_ms)
    for index, start_ms in enumerate(piece_starts):
        target = f"chunk_{index}.wav"
        piece = recording[start_ms:start_ms + chunk_length_ms]
        piece.export(target, format="wav")
        written.append(target)
    return written
def transcribe_audio(file_path):
    """Run speech-to-text on a WAV file and return the recognized words.

    The whole file is sent inline to the Google Cloud Speech synchronous
    ``recognize`` call, configured for US English and LINEAR16 encoding. The
    sample rate and channel count are taken from the file itself.

    NOTE(review): the synchronous API is documented as limited to roughly one
    minute of audio; longer recordings presumably need to be split first or
    sent through the long-running API -- confirm against current quotas.

    Args:
        file_path: Path of the WAV file to transcribe (assumed to be 16-bit
            PCM, matching the LINEAR16 setting -- TODO confirm).

    Returns:
        The top transcript of every result, joined with ``". "``.
    """
    stt_client = speech.SpeechClient()

    # The request must describe the stream exactly as the file stores it.
    sample_rate, channels = get_audio_properties(file_path)

    with open(file_path, "rb") as wav_stream:
        payload = wav_stream.read()

    request_audio = speech.RecognitionAudio(content=payload)
    request_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code="en-US",
        audio_channel_count=channels,
        # Per-channel recognition is only requested for multi-channel input.
        enable_separate_recognition_per_channel=(channels > 1),
    )

    response = stt_client.recognize(config=request_config, audio=request_audio)

    # Keep only the first alternative of each result.
    best_guesses = []
    for result in response.results:
        best_guesses.append(result.alternatives[0].transcript)
    return ". ".join(best_guesses)
def get_transcription_from_sound(file_path: str) -> str:
    """Down-mix a WAV file to mono and return its transcription.

    The intermediate mono file is written into a private temporary directory
    rather than a fixed ``audio_mono.wav`` in the current working directory.
    That way overlapping calls cannot overwrite each other's scratch file,
    nothing is left behind afterwards, and the working directory does not
    need to be writable.

    Args:
        file_path: Path of the WAV file to transcribe.

    Returns:
        The transcript produced by ``transcribe_audio`` for the mono audio.
    """
    # Imported here so the function does not depend on module-level imports
    # that the file does not currently have.
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        mono_path = os.path.join(scratch_dir, "audio_mono.wav")
        convert_to_mono(file_path, mono_path)
        # The transcript is fully computed before the directory is removed.
        return transcribe_audio(mono_path)
if __name__ == "__main__":
    # Manual smoke test: down-mix a sample recording, transcribe it and show
    # the result on stdout.
    source_wav = "jackhammer.wav"
    mono_wav = "audio_mono.wav"
    convert_to_mono(source_wav, mono_wav)
    print(transcribe_audio(mono_wav))