Spaces:

VladB46
/

MeetingUnderstandingDemo

Running

MeetingUnderstandingDemo / translation.py

Vlad Bastina

first commit

599e594 about 1 year ago

2.83 kB

	import os
	import tempfile
	import wave

	from google.cloud import speech
	from pydub import AudioSegment

	def get_audio_properties(file_path):
	    """
	    Read a WAV file's header and report its sampling parameters.

	    Returns a ``(sample_rate, channels)`` tuple: the frame rate and the
	    number of channels reported by the ``wave`` module.
	    """
	    with wave.open(file_path, "rb") as reader:
	        return reader.getframerate(), reader.getnchannels()

	def convert_to_mono(input_path, output_path):
	    """Downmix the WAV file at ``input_path`` to a single channel and
	    write the result to ``output_path`` as WAV."""
	    AudioSegment.from_wav(input_path).set_channels(1).export(output_path, format="wav")

	def split_audio(file_path, chunk_length_ms=60000):
	    """Cut a WAV file into consecutive pieces of at most ``chunk_length_ms``.

	    Each piece is exported as WAV under the relative file name
	    ``chunk_<index>.wav``. The default piece length is 60000 ms (60 seconds).

	    Returns the list of exported file names, in order.
	    """
	    recording = AudioSegment.from_wav(file_path)

	    exported = []
	    for index, start in enumerate(range(0, len(recording), chunk_length_ms)):
	        name = f"chunk_{index}.wav"
	        recording[start:start + chunk_length_ms].export(name, format="wav")
	        exported.append(name)

	    return exported

	def transcribe_audio(file_path):
	    """Run Google Cloud Speech-to-Text on a WAV file and return the text.

	    The whole file is sent inline in one synchronous ``recognize`` request,
	    declared as LINEAR16 audio in ``en-US``. The sample rate and channel
	    count are read from the WAV header; when the file has more than one
	    channel, per-channel recognition is enabled.

	    Args:
	        file_path: Path to the WAV file to transcribe.

	    Returns:
	        The first alternative of every recognition result, joined with
	        ". ". An empty string when the response has no usable results.

	    NOTE(review): synchronous recognition is documented as intended for
	    short audio (roughly one minute); longer recordings presumably need to
	    be split or sent through the long-running API. Confirm against the
	    Speech-to-Text quotas documentation.
	    """
	    # Initialize the speech client
	    client = speech.SpeechClient()

	    # The request must declare the same sample rate / channel count as the file
	    sample_rate, channels = get_audio_properties(file_path)

	    # Read the raw file bytes; they are uploaded inline with the request
	    with open(file_path, "rb") as audio_file:
	        audio_content = audio_file.read()

	    audio = speech.RecognitionAudio(content=audio_content)
	    config = speech.RecognitionConfig(
	        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
	        sample_rate_hertz=sample_rate,
	        language_code="en-US",
	        audio_channel_count=channels,
	        enable_separate_recognition_per_channel=(channels > 1),
	    )

	    # Call the Google Cloud Speech API for recognition
	    response = client.recognize(config=config, audio=audio)

	    # Keep the top hypothesis of each result. A result without any
	    # alternative is skipped instead of raising IndexError.
	    transcripts = [
	        result.alternatives[0].transcript
	        for result in response.results
	        if result.alternatives
	    ]

	    return ". ".join(transcripts)

	def get_transcription_from_sound(file_path: str) -> str:
	    """Convert the audio to a single channel and transcribe it.

	    The mono copy is written into a fresh temporary directory rather than
	    a fixed file name in the working directory, so overlapping calls cannot
	    overwrite each other's intermediate file, and the copy is removed once
	    transcription finishes (or fails).

	    Args:
	        file_path: Path to the source WAV file.

	    Returns:
	        The transcript produced by ``transcribe_audio`` for the mono copy.
	    """
	    with tempfile.TemporaryDirectory() as work_dir:
	        output_path = os.path.join(work_dir, "audio_mono.wav")
	        convert_to_mono(file_path, output_path)

	        # Transcribe before leaving the block: the directory (and the mono
	        # file in it) is deleted on exit.
	        return transcribe_audio(output_path)

	if __name__ == "__main__":
	    # Manual smoke run: transcribe a hard-coded sample path and print the
	    # text. Goes through the same helper that callers use instead of
	    # repeating its convert-then-transcribe steps here.
	    file_path = "jackhammer.wav"
	    print(get_transcription_from_sound(file_path))