Spaces:

ArtCloud
/

Callytics

Sleeping

Zubik Aliaksandr

Initial Space snapshot (Docker SDK, Gradio UI, port 7860)

d60c606 17 days ago

20.9 kB

	# Standard library imports
	import os
	import re
	import json
	from io import TextIOWrapper
	from typing import Annotated, Optional, Tuple, List, Dict

	# Related third party imports
	import torch
	import faster_whisper
	from pydub import AudioSegment
	from deepmultilingualpunctuation import PunctuationModel

	# Local imports
	from src.audio.utils import TokenizerUtils


	class AudioProcessor:
	    """
	    Handle common audio processing tasks such as conversion, trimming,
	    merging, splitting and simple audio transformations.

	    Every transformation re-reads ``audio_path`` through pydub and writes its
	    result into ``temp_dir`` under a fixed file name, so calling the same
	    method twice overwrites the previous output.

	    Parameters
	    ----------
	    audio_path : str
	        Path to the audio file to process.
	    temp_dir : str, optional
	        Directory for storing temporary files. Defaults to ".temp".

	    Attributes
	    ----------
	    audio_path : str
	        Path to the input audio file.
	    temp_dir : str
	        Path to the temporary directory for processed files.
	    mono_audio_path : Optional[str]
	        Path to the mono audio file; ``None`` until `convert_to_mono` ran.

	    Methods
	    -------
	    convert_to_mono()
	        Converts the audio file to mono.
	    get_duration()
	        Gets the duration of the audio file in seconds.
	    change_format(new_format)
	        Converts the audio file to a new format.
	    trim_audio(start_time, end_time)
	        Trims the audio file to the specified time range.
	    adjust_volume(change_in_db)
	        Adjusts the volume of the audio file.
	    get_channels()
	        Gets the number of audio channels.
	    fade_in_out(fade_in_duration, fade_out_duration)
	        Applies fade-in and fade-out effects to the audio.
	    merge_audio(other_audio_path)
	        Merges the current audio with another audio file.
	    split_audio(chunk_duration)
	        Splits the audio file into chunks of a specified duration.
	    create_manifest(manifest_path)
	        Creates a manifest file containing metadata about the audio.
	    """

	    def __init__(
	            self,
	            audio_path: Annotated[str, "Path to the audio file"],
	            temp_dir: Annotated[str, "Directory for temporary processed files"] = ".temp"
	    ) -> None:
	        """
	        Store the paths and make sure the temporary directory exists.

	        Raises
	        ------
	        TypeError
	            If `audio_path` or `temp_dir` is not a string.
	        """
	        if not isinstance(audio_path, str):
	            raise TypeError("Expected 'audio_path' to be a string.")
	        if not isinstance(temp_dir, str):
	            raise TypeError("Expected 'temp_dir' to be a string.")

	        self.audio_path = audio_path
	        self.temp_dir = temp_dir
	        self.mono_audio_path = None
	        os.makedirs(temp_dir, exist_ok=True)

	    def _load_audio(self):
	        """Read ``audio_path`` from disk and return it as a pydub ``AudioSegment``."""
	        return AudioSegment.from_file(self.audio_path)

	    def _export_to_temp(self, sound, file_name, audio_format="wav"):
	        """Write `sound` to ``temp_dir/file_name`` and return the resulting path."""
	        output_path = os.path.join(self.temp_dir, file_name)
	        sound.export(output_path, format=audio_format)
	        return output_path

	    @staticmethod
	    def _chunk_file_name(offset_ms, chunk_ms):
	        """
	        Build a collision-free file name for the chunk starting at `offset_ms`.

	        Whole-second chunk sizes keep the historical ``chunk_<start second>.wav``
	        name. Any other size is named by its millisecond offset, because
	        integer-dividing the offset by 1000 would map several chunks to the
	        same name (0.5 s chunks at 0 ms and 500 ms would both be ``chunk_0``).
	        """
	        if chunk_ms % 1000 == 0:
	            return f"chunk_{offset_ms // 1000}.wav"
	        return f"chunk_{offset_ms}ms.wav"

	    def convert_to_mono(self) -> Annotated[str, "Path to the mono audio file"]:
	        """
	        Convert the audio file to mono.

	        The result is written to ``mono_file.wav`` inside ``temp_dir`` and the
	        path is also remembered in ``mono_audio_path``.

	        Returns
	        -------
	        str
	            Path to the mono audio file.

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> mono_path = processor.convert_to_mono()
	        >>> isinstance(mono_path, str)
	        True
	        """
	        mono_sound = self._load_audio().set_channels(1)
	        self.mono_audio_path = self._export_to_temp(mono_sound, "mono_file.wav")
	        return self.mono_audio_path

	    def get_duration(self) -> Annotated[float, "Audio duration in seconds"]:
	        """
	        Get the duration of the audio file.

	        Returns
	        -------
	        float
	            Duration of the audio in seconds.

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> duration = processor.get_duration()
	        >>> isinstance(duration, float)
	        True
	        """
	        # len() of a pydub segment is its length in milliseconds.
	        return len(self._load_audio()) / 1000.0

	    def change_format(
	            self, new_format: Annotated[str, "New audio format"]
	    ) -> Annotated[str, "Path to converted audio file"]:
	        """
	        Convert the audio file to a new format.

	        Parameters
	        ----------
	        new_format : str
	            Desired format for the output audio file; also used as the
	            extension of ``converted_file.<new_format>``.

	        Returns
	        -------
	        str
	            Path to the converted audio file.

	        Raises
	        ------
	        TypeError
	            If `new_format` is not a string.

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> converted_path = processor.change_format("mp3")
	        >>> isinstance(converted_path, str)
	        True
	        """
	        if not isinstance(new_format, str):
	            raise TypeError("Expected 'new_format' to be a string.")

	        return self._export_to_temp(
	            self._load_audio(), f"converted_file.{new_format}", new_format
	        )

	    def trim_audio(
	            self, start_time: Annotated[float, "Start time in seconds"],
	            end_time: Annotated[float, "End time in seconds"]
	    ) -> Annotated[str, "Path to trimmed audio file"]:
	        """
	        Trim the audio file to the specified time range.

	        Parameters
	        ----------
	        start_time : float
	            Start time in seconds.
	        end_time : float
	            End time in seconds.

	        Returns
	        -------
	        str
	            Path to the trimmed audio file (``trimmed_file.wav``).

	        Raises
	        ------
	        TypeError
	            If `start_time` or `end_time` is not a number.

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> trimmed_path = processor.trim_audio(0.0, 10.0)
	        >>> isinstance(trimmed_path, str)
	        True
	        """
	        if not isinstance(start_time, (int, float)):
	            raise TypeError("Expected 'start_time' to be a float or int.")
	        if not isinstance(end_time, (int, float)):
	            raise TypeError("Expected 'end_time' to be a float or int.")

	        # Segments are sliced in milliseconds, hence the conversion.
	        trimmed_audio = self._load_audio()[start_time * 1000:end_time * 1000]
	        return self._export_to_temp(trimmed_audio, "trimmed_file.wav")

	    def adjust_volume(
	            self, change_in_db: Annotated[float, "Volume change in dB"]
	    ) -> Annotated[str, "Path to volume-adjusted audio file"]:
	        """
	        Adjust the volume of the audio file.

	        Parameters
	        ----------
	        change_in_db : float
	            Volume change in decibels.

	        Returns
	        -------
	        str
	            Path to the volume-adjusted audio file (``adjusted_volume.wav``).

	        Raises
	        ------
	        TypeError
	            If `change_in_db` is not a number.

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> adjusted_path = processor.adjust_volume(5.0)
	        >>> isinstance(adjusted_path, str)
	        True
	        """
	        if not isinstance(change_in_db, (int, float)):
	            raise TypeError("Expected 'change_in_db' to be a float or int.")

	        # Adding a number to a pydub segment applies that gain in dB.
	        adjusted_audio = self._load_audio() + change_in_db
	        return self._export_to_temp(adjusted_audio, "adjusted_volume.wav")

	    def get_channels(self) -> Annotated[int, "Number of channels"]:
	        """
	        Get the number of audio channels.

	        Returns
	        -------
	        int
	            Number of audio channels.

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> channels = processor.get_channels()
	        >>> isinstance(channels, int)
	        True
	        """
	        return self._load_audio().channels

	    def fade_in_out(
	            self, fade_in_duration: Annotated[float, "Fade-in duration in seconds"],
	            fade_out_duration: Annotated[float, "Fade-out duration in seconds"]
	    ) -> Annotated[str, "Path to faded audio file"]:
	        """
	        Apply fade-in and fade-out effects to the audio file.

	        Parameters
	        ----------
	        fade_in_duration : float
	            Duration of the fade-in effect in seconds.
	        fade_out_duration : float
	            Duration of the fade-out effect in seconds.

	        Returns
	        -------
	        str
	            Path to the faded audio file (``faded_audio.wav``).

	        Raises
	        ------
	        TypeError
	            If either duration is not a number.

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> faded_path = processor.fade_in_out(1.0, 2.0)
	        >>> isinstance(faded_path, str)
	        True
	        """
	        if not isinstance(fade_in_duration, (int, float)):
	            raise TypeError("Expected 'fade_in_duration' to be a float or int.")
	        if not isinstance(fade_out_duration, (int, float)):
	            raise TypeError("Expected 'fade_out_duration' to be a float or int.")

	        faded_audio = (
	            self._load_audio()
	            .fade_in(fade_in_duration * 1000)
	            .fade_out(fade_out_duration * 1000)
	        )
	        return self._export_to_temp(faded_audio, "faded_audio.wav")

	    def merge_audio(
	            self, other_audio_path: Annotated[str, "Path to other audio file"]
	    ) -> Annotated[str, "Path to merged audio file"]:
	        """
	        Merge the current audio file with another audio file.

	        The other file is appended after the current one.

	        Parameters
	        ----------
	        other_audio_path : str
	            Path to the other audio file.

	        Returns
	        -------
	        str
	            Path to the merged audio file (``merged_audio.wav``).

	        Raises
	        ------
	        TypeError
	            If `other_audio_path` is not a string.

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> merged_path = processor.merge_audio("other_example.wav")
	        >>> isinstance(merged_path, str)
	        True
	        """
	        if not isinstance(other_audio_path, str):
	            raise TypeError("Expected 'other_audio_path' to be a string.")

	        # Adding two pydub segments concatenates them.
	        merged_audio = self._load_audio() + AudioSegment.from_file(other_audio_path)
	        return self._export_to_temp(merged_audio, "merged_audio.wav")

	    def split_audio(
	            self, chunk_duration: Annotated[float, "Chunk duration in seconds"]
	    ) -> Annotated[List[str], "Paths to audio chunks"]:
	        """
	        Split the audio file into chunks of the specified duration.

	        The last chunk may be shorter than `chunk_duration`. Chunk files are
	        named ``chunk_<start second>.wav`` for whole-second chunk sizes and
	        ``chunk_<start millisecond>ms.wav`` otherwise, so no two chunks of one
	        split share a file name.

	        Parameters
	        ----------
	        chunk_duration : float
	            Duration of each chunk in seconds. Must be at least 0.001.

	        Returns
	        -------
	        List[str]
	            Paths to the generated audio chunks, in playback order.

	        Raises
	        ------
	        TypeError
	            If `chunk_duration` is not a number.
	        ValueError
	            If `chunk_duration` is shorter than one millisecond (including
	            zero and negative values).

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> chunks = processor.split_audio(10.0)
	        >>> isinstance(chunks, list)
	        True
	        """
	        if not isinstance(chunk_duration, (int, float)):
	            raise TypeError("Expected 'chunk_duration' to be a float or int.")

	        chunk_ms = int(chunk_duration * 1000)
	        if chunk_ms <= 0:
	            # A zero step would crash range(); a negative one would silently
	            # produce no chunks at all.
	            raise ValueError("Expected 'chunk_duration' to be at least 0.001 seconds.")

	        sound = self._load_audio()
	        return [
	            self._export_to_temp(
	                sound[offset_ms:offset_ms + chunk_ms],
	                self._chunk_file_name(offset_ms, chunk_ms),
	            )
	            for offset_ms in range(0, len(sound), chunk_ms)
	        ]

	    def create_manifest(
	            self,
	            manifest_path: Annotated[str, "Manifest file path"]
	    ) -> None:
	        """
	        Create a manifest file containing metadata about the audio file.

	        A single JSON object is written with the keys ``audio_filepath``,
	        ``offset``, ``duration``, ``label``, ``text``, ``rttm_filepath`` and
	        ``uem_filepath``.

	        Parameters
	        ----------
	        manifest_path : str
	            Path to the manifest file. An existing file is overwritten.

	        Raises
	        ------
	        TypeError
	            If `manifest_path` is neither a string nor a path-like object.

	        Examples
	        --------
	        >>> processor = AudioProcessor("example.wav")
	        >>> processor.create_manifest("manifest.json")
	        """
	        if not isinstance(manifest_path, (str, os.PathLike)):
	            raise TypeError("Expected 'manifest_path' to be a string.")

	        manifest_entry = {
	            "audio_filepath": self.audio_path,
	            "offset": 0,
	            "duration": self.get_duration(),
	            "label": "infer",
	            "text": "-",
	            "rttm_filepath": None,
	            "uem_filepath": None
	        }
	        with open(manifest_path, 'w', encoding='utf-8') as manifest_file:
	            json.dump(manifest_entry, manifest_file)


	class Transcriber:
	    """
	    A class for transcribing audio files using a pre-trained Whisper model.

	    When running on CUDA the model is released after every transcription to
	    free GPU memory; it is loaded again automatically on the next call, so
	    one instance can be reused for several files.

	    Parameters
	    ----------
	    model_name : str, optional
	        Name of the model to load. Defaults to 'large-v3'.
	    device : str, optional
	        Device to use for model inference ('cpu' or 'cuda'). Defaults to 'cpu'.
	    compute_type : str, optional
	        Data type for model computation ('int8', 'float16', etc.). Defaults to 'int8'.

	    Attributes
	    ----------
	    model : faster_whisper.WhisperModel
	        Loaded Whisper model for transcription. On CUDA this attribute is
	        removed at the end of `transcribe` and restored by the next call.
	    device : str
	        Device used for inference.

	    Methods
	    -------
	    transcribe(audio_path, language=None, suppress_numerals=False)
	        Transcribes the audio file into text.
	    """

	    def __init__(
	            self,
	            model_name: Annotated[str, "Name of the model to load"] = 'large-v3',
	            device: Annotated[str, "Device to use for model inference"] = 'cpu',
	            compute_type: Annotated[str, "Data type for model computation, e.g., 'int8' or 'float16'"] = 'int8'
	    ) -> None:
	        """
	        Validate the configuration and load the Whisper model.

	        Raises
	        ------
	        TypeError
	            If any argument is not a string.
	        """
	        if not isinstance(model_name, str):
	            raise TypeError("Expected 'model_name' to be of type str")
	        if not isinstance(device, str):
	            raise TypeError("Expected 'device' to be of type str")
	        if not isinstance(compute_type, str):
	            raise TypeError("Expected 'compute_type' to be of type str")

	        self.device = device
	        # Kept so the model can be rebuilt after `transcribe` released it.
	        self._model_name = model_name
	        self._compute_type = compute_type
	        self.model = self._load_model()

	    def _load_model(self):
	        """Create the Whisper model from the configuration given to ``__init__``."""
	        return faster_whisper.WhisperModel(
	            self._model_name, device=self.device, compute_type=self._compute_type
	        )

	    def transcribe(
	            self,
	            audio_path: Annotated[str, "Path to the audio file to transcribe"],
	            language: Annotated[Optional[str], "Language code for transcription, e.g., 'en' for English"] = None,
	            suppress_numerals: Annotated[bool, "Whether to suppress numerals in the transcription"] = False
	    ) -> Annotated[Tuple[str, dict], "Transcription text and additional information"]:
	        """
	        Transcribe an audio file into text.

	        The transcript and its metadata are also printed to standard output.

	        Parameters
	        ----------
	        audio_path : str
	            Path to the audio file.
	        language : str, optional
	            Language code for transcription (e.g., 'en' for English).
	        suppress_numerals : bool, optional
	            Whether to suppress numerals in the transcription. Defaults to False.

	        Returns
	        -------
	        Tuple[str, dict]
	            The transcribed text and additional transcription metadata.

	        Raises
	        ------
	        TypeError
	            If `audio_path` is not a string, `language` is neither ``None``
	            nor a string, or `suppress_numerals` is not a bool.

	        Examples
	        --------
	        >>> transcriber = Transcriber()
	        >>> text, info = transcriber.transcribe("example.wav")
	        >>> isinstance(text, str)
	        True
	        >>> isinstance(info, dict)
	        True
	        """
	        if not isinstance(audio_path, str):
	            raise TypeError("Expected 'audio_path' to be of type str")
	        if language is not None and not isinstance(language, str):
	            raise TypeError("Expected 'language' to be of type str if provided")
	        if not isinstance(suppress_numerals, bool):
	            raise TypeError("Expected 'suppress_numerals' to be of type bool")

	        # A previous CUDA run deleted the model to free GPU memory; bring it
	        # back instead of failing with AttributeError on the second call.
	        if not hasattr(self, "model"):
	            self.model = self._load_model()

	        audio_waveform = faster_whisper.decode_audio(audio_path)
	        suppress_tokens = [-1]
	        if suppress_numerals:
	            suppress_tokens = TokenizerUtils.find_numeral_symbol_tokens(
	                self.model.hf_tokenizer
	            )

	        transcript_segments, info = self.model.transcribe(
	            audio_waveform,
	            language=language,
	            suppress_tokens=suppress_tokens,
	            without_timestamps=True,
	            vad_filter=True,
	            log_progress=True,
	        )

	        # The segments must be consumed while the model is still alive, so the
	        # text is assembled before the model is released below.
	        transcript = ''.join(segment.text for segment in transcript_segments)
	        info = vars(info)

	        if self.device == 'cuda':
	            # Free GPU memory between runs; the next call reloads the model.
	            del self.model
	            torch.cuda.empty_cache()

	        print(transcript, info)

	        return transcript, info


	class PunctuationRestorer:
	    """
	    A class for restoring punctuation in transcribed text.

	    Only sentence-ending punctuation ('.', '?', '!') predicted by the model is
	    appended to words; other predicted labels are ignored.

	    Parameters
	    ----------
	    language : str, optional
	        Language for punctuation restoration. Defaults to 'en'.

	    Attributes
	    ----------
	    language : str
	        Language used for punctuation restoration.
	    punct_model : PunctuationModel
	        Model for predicting punctuation.
	    supported_languages : List[str]
	        List of languages supported by the model.

	    Methods
	    -------
	    restore_punctuation(word_speaker_mapping)
	        Restores punctuation in the provided text based on word mappings.
	    """

	    # Punctuation that may be appended to a word.
	    _ENDING_PUNCTS = ".?!"
	    # A word already ending in one of these is left alone (unless acronym).
	    _MODEL_PUNCTS = ".,;:!?"
	    # Dotted acronyms such as "U.S." or "e.g."; compiled once for all calls.
	    _ACRONYM_PATTERN = re.compile(r"\b(?:[a-zA-Z]\.){2,}")

	    def __init__(self, language: Annotated[str, "Language for punctuation restoration"] = 'en') -> None:
	        """Remember the language and load the punctuation model."""
	        self.language = language
	        self.punct_model = PunctuationModel(model="kredor/punctuate-all")
	        self.supported_languages = [
	            "en", "fr", "de", "es", "it", "nl", "pt", "bg", "pl", "cs", "sk", "sl",
	        ]

	    @classmethod
	    def _apply_label(cls, word, label):
	        """
	        Return `word` with the predicted sentence-ending `label` appended.

	        The word is returned unchanged when it is empty, when `label` is not
	        sentence-ending punctuation, or when the word already ends with
	        punctuation and is not a dotted acronym.
	        """
	        if not word or not label or label not in cls._ENDING_PUNCTS:
	            return word
	        if word[-1] in cls._MODEL_PUNCTS and not cls._ACRONYM_PATTERN.fullmatch(word):
	            return word

	        punctuated = word + label
	        if punctuated.endswith(".."):
	            # An acronym already ends with a period: keep exactly one instead
	            # of stripping them all, which would turn "U.S.A." into "U.S.A".
	            punctuated = punctuated.rstrip(".") + "."
	        return punctuated

	    def restore_punctuation(
	            self, word_speaker_mapping: Annotated[List[Dict], "List of word-speaker mappings"]
	    ) -> Annotated[List[Dict], "Word mappings with restored punctuation"]:
	        """
	        Restore punctuation for transcribed text.

	        The dictionaries are updated in place and the same list object is
	        returned. If ``language`` is not in ``supported_languages`` a notice
	        is printed and the input is returned untouched.

	        Parameters
	        ----------
	        word_speaker_mapping : List[Dict]
	            List of dictionaries containing word and speaker mappings. Each
	            dictionary must provide the word under the "text" key.

	        Returns
	        -------
	        List[Dict]
	            Updated list with punctuation restored.

	        Examples
	        --------
	        >>> restorer = PunctuationRestorer()
	        >>> mapping = [{"text": "hello"}, {"text": "world"}]
	        >>> result = restorer.restore_punctuation(mapping)
	        >>> isinstance(result, list)
	        True
	        >>> "text" in result[0]
	        True
	        """
	        if self.language not in self.supported_languages:
	            print(f"Punctuation restoration is not available for {self.language} language.")
	            return word_speaker_mapping

	        if not word_speaker_mapping:
	            # Nothing to punctuate; skip the model call entirely.
	            return word_speaker_mapping

	        words_list = [word_dict["text"] for word_dict in word_speaker_mapping]
	        labeled_words = self.punct_model.predict(words_list)

	        for word_dict, labeled_tuple in zip(word_speaker_mapping, labeled_words):
	            # labeled_tuple[1] is the punctuation label predicted for the word.
	            word_dict["text"] = self._apply_label(word_dict["text"], labeled_tuple[1])

	        return word_speaker_mapping


	if __name__ == "__main__":
	    # Manual smoke run: exercises every public helper of this module against
	    # "sample_audio.wav" and prints where each result was written.
	    demo_source = "sample_audio.wav"
	    demo_processor = AudioProcessor(demo_source)

	    print(f"Mono audio file saved at: {demo_processor.convert_to_mono()}")
	    print(f"Audio duration: {demo_processor.get_duration()} seconds")
	    print(f"Converted audio file saved at: {demo_processor.change_format('mp3')}")
	    print(f"Trimmed audio file saved at: {demo_processor.trim_audio(0.0, 10.0)}")
	    print(f"Volume adjusted audio file saved at: {demo_processor.adjust_volume(5.0)}")

	    extra_clip = "additional_audio.wav"
	    print(f"Merged audio file saved at: {demo_processor.merge_audio(extra_clip)}")

	    print(f"Audio chunks saved at: {demo_processor.split_audio(10.0)}")

	    manifest_target = "output_manifest.json"
	    demo_processor.create_manifest(manifest_target)
	    print(f"Manifest file saved at: {manifest_target}")

	    # Speech-to-text with the default model configuration.
	    demo_text, demo_info = Transcriber().transcribe(demo_source)
	    print(f"Transcribed Text: {demo_text}")
	    print(f"Transcription Info: {demo_info}")

	    # Punctuation restoration on a tiny hand-made word list.
	    sample_words = [
	        {"text": token}
	        for token in ("hello", "world", "this", "is", "a", "test")
	    ]
	    restored_words = PunctuationRestorer().restore_punctuation(sample_words)
	    print(f"Restored Mapping: {restored_words}")