Spaces:

ArtCloud
/

Callytics

Sleeping

Zubik Aliaksandr

Initial Space snapshot (Docker SDK, Gradio UI, port 7860)

d60c606 17 days ago

4.34 kB

	# Standard library imports
	import os
	from typing import Annotated, List, Dict

	# Related third-party imports
	import torch
	from faster_whisper import decode_audio
	from ctc_forced_aligner import (
	generate_emissions,
	get_alignments,
	get_spans,
	load_alignment_model,
	postprocess_results,
	preprocess_text,
	)


	class ForcedAligner:
	"""
	ForcedAligner is a class for aligning audio to a provided transcript using a pre-trained alignment model.

	Attributes
	----------
	device : str
	Device to run the model on ('cuda' for GPU or 'cpu').
	alignment_model : torch.nn.Module
	The pre-trained alignment model.
	alignment_tokenizer : Any
	Tokenizer for processing text in alignment.

	Methods
	-------
	align(audio_path, transcript, language, batch_size)
	Aligns audio with a transcript and returns word-level timing information.
	"""

	def __init__(self, device: Annotated[str, "Device for model ('cuda' or 'cpu')"] = None):
	"""
	Initialize the ForcedAligner with the specified device.

	Parameters
	----------
	device : str, optional
	Device for running the model, by default 'cuda' if available, otherwise 'cpu'.
	"""
	self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')

	self.alignment_model, self.alignment_tokenizer = load_alignment_model(
	self.device,
	dtype=torch.float16 if self.device == 'cuda' else torch.float32,
	)

	def align(
	self,
	audio_path: Annotated[str, "Path to the audio file"],
	transcript: Annotated[str, "Transcript of the audio content"],
	language: Annotated[str, "Language of the transcript"] = 'en',
	batch_size: Annotated[int, "Batch size for emission generation"] = 8,
	) -> Annotated[List[Dict[str, float]], "List of word alignment data with timestamps"]:
	"""
	Aligns audio with a transcript and returns word-level timing information.

	Parameters
	----------
	audio_path : str
	Path to the audio file.
	transcript : str
	Transcript text corresponding to the audio.
	language : str, optional
	Language code for the transcript, default is 'en' (English).
	batch_size : int, optional
	Batch size for generating emissions, by default 8.

	Returns
	-------
	List[Dict[str, float]]
	A list of dictionaries containing word timing information.

	Raises
	------
	FileNotFoundError
	If the specified audio file does not exist.

	Examples
	--------
	>>> aligner = ForcedAligner()
	>>> aligner.align("path/to/audio.wav", "hello world")
	[{'word': 'hello', 'start': 0.0, 'end': 0.5}, {'word': 'world', 'start': 0.6, 'end': 1.0}]
	"""
	if not os.path.exists(audio_path):
	raise FileNotFoundError(
	f"The audio file at path '{audio_path}' was not found."
	)

	speech_array = torch.from_numpy(decode_audio(audio_path))

	emissions, stride = generate_emissions(
	self.alignment_model,
	speech_array.to(self.alignment_model.dtype).to(self.alignment_model.device),
	batch_size=batch_size,
	)

	tokens_starred, text_starred = preprocess_text(
	transcript,
	romanize=True,
	language=language,
	)

	segments, scores, blank_token = get_alignments(
	emissions,
	tokens_starred,
	self.alignment_tokenizer,
	)

	spans = get_spans(tokens_starred, segments, blank_token)

	word_timestamps = postprocess_results(text_starred, spans, stride, scores)

	if self.device == 'cuda':
	del self.alignment_model
	torch.cuda.empty_cache()

	print(f"Word_Timestamps: {word_timestamps}")

	return word_timestamps


	if __name__ == "__main__":

	forced_aligner = ForcedAligner()
	try:
	path = "example_audio.wav"
	audio_transcript = "This is a test transcript."
	word_timestamp = forced_aligner.align(path, audio_transcript)
	print(word_timestamp)
	except FileNotFoundError as e:
	print(e)