Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / tools /speech_recognition_tool.py

FD900

Update tools/speech_recognition_tool.py

25b2219 verified 7 months ago

raw

history blame contribute delete

2.35 kB

	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, logging
	import torch
	import warnings
	from tools.base_tool import BaseTool

	class SpeechRecognitionTool(BaseTool):
	    """Speech-to-text tool built on a Hugging Face Whisper ASR pipeline.

	    The model and processor are loaded once in ``__init__``; ``transcribe``
	    then turns an audio file into plain text or a timestamp-annotated
	    transcript.
	    """

	    name = 'speech_to_text'
	    description = 'Transcribes speech from audio input.'

	    # Length (in seconds) of the window handed to the pipeline. Kept in one
	    # place because ``_parse_timed_chunks`` uses the same value when it
	    # re-bases timestamps that appear to restart.
	    _CHUNK_LENGTH_S = 30

	    def __init__(self):
	        """Load the Whisper checkpoint and build the ASR pipeline.

	        Uses CUDA with float16 when a GPU is available, otherwise CPU with
	        float32.
	        """
	        device = 'cuda' if torch.cuda.is_available() else 'cpu'
	        dtype = torch.float16 if device == 'cuda' else torch.float32
	        model_id = 'openai/whisper-large-v3-turbo'

	        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
	            model_id,
	            torch_dtype=dtype,
	            low_cpu_mem_usage=True,
	            use_safetensors=True,
	        ).to(device)

	        self.processor = AutoProcessor.from_pretrained(model_id)

	        # NOTE: both calls below change process-wide settings, not just this
	        # instance: transformers logging is limited to errors and every
	        # FutureWarning is silenced.
	        logging.set_verbosity_error()
	        warnings.filterwarnings("ignore", category=FutureWarning)

	        self.pipeline = pipeline(
	            "automatic-speech-recognition",
	            model=self.model,
	            tokenizer=self.processor.tokenizer,
	            feature_extractor=self.processor.feature_extractor,
	            torch_dtype=dtype,
	            device=device,
	            chunk_length_s=self._CHUNK_LENGTH_S,
	            return_timestamps=True,
	        )

	    def transcribe(self, audio_path: str, with_timestamps: bool = False) -> str:
	        """Transcribe the audio at ``audio_path``.

	        Args:
	            audio_path: Audio input forwarded unchanged to the ASR pipeline.
	            with_timestamps: When true, return one entry per non-empty chunk
	                in the form ``[start]\\ntext\\n[end]`` (seconds, two
	                decimals), entries separated by newlines. When false, return
	                only the stripped transcript text.

	        Returns:
	            The transcript as a single string; empty if nothing was
	            recognized.
	        """
	        result = self.pipeline(audio_path)

	        if not with_timestamps:
	            return result['text'].strip()

	        # A missing or empty chunk list yields an empty transcript instead
	        # of a KeyError.
	        chunks = result.get('chunks') or []
	        return "\n".join(
	            f"[{chunk['start']:.2f}]\n{chunk['text']}\n[{chunk['end']:.2f}]"
	            for chunk in self._parse_timed_chunks(chunks)
	        )

	    def _parse_timed_chunks(self, chunks):
	        """Normalize pipeline chunks into ``{"start", "end", "text"}`` dicts.

	        Each input chunk must provide ``'timestamp'`` (a ``(start, end)``
	        pair) and ``'text'``. Whenever a timestamp goes backwards relative
	        to the previous one, it is treated as a restart of the clock and the
	        running offset is advanced by one chunk length. Chunks whose text is
	        empty after stripping are dropped.

	        Args:
	            chunks: Iterable of chunk dicts as produced by the pipeline.

	        Returns:
	            A list of dicts with float ``start``/``end`` seconds and the
	            stripped ``text``.
	        """
	        max_chunk = float(self._CHUNK_LENGTH_S)
	        absolute_offset = 0.0
	        current_offset = 0.0  # end of the previous chunk, before re-basing
	        normalized = []

	        for c in chunks:
	            start, end = c['timestamp']

	            # Whisper decoding can leave a timestamp as None (typically the
	            # end of the last chunk when the audio is cut off). Comparing or
	            # formatting None would raise, so fall back to the nearest known
	            # time instead.
	            if start is None:
	                start = current_offset
	            if end is None:
	                end = start

	            if start < current_offset:
	                absolute_offset += max_chunk
	            start_time = absolute_offset + start

	            if end < start:
	                absolute_offset += max_chunk
	            end_time = absolute_offset + end
	            current_offset = end

	            text = c['text'].strip()
	            if text:
	                normalized.append({"start": start_time, "end": end_time, "text": text})

	        return normalized