# SussurroXRest/speech_models/speech_model_manager.py
# Author: LucaR84 — commit "code refactor" (182c2c2)
from abc import ABC, abstractmethod
from typing import Dict, Any, List
import numpy as np
class SpeechModelManager(ABC):
    """Abstract base class for speech transcription model backends.

    Concrete subclasses implement model loading plus file-based and
    streaming transcription; shared audio-chunking and segment-merging
    helpers live here.
    """
    def __init__(self, model_name: str, device: str):
        """
        Args:
            model_name: Identifier of the underlying model.
            device: Compute device string (e.g. "cpu", "cuda") — stored
                as-is for subclasses to use; not validated here.
        """
        self.model_name = model_name
        self.device = device
        # Subclasses are expected to set this to True once load_model() succeeds.
        self.is_loaded = False
    @abstractmethod
    def load_model(self) -> None:
        """Load the model into memory."""
        pass
    @abstractmethod
    def transcribe(self, audio_file_path: str, **kwargs) -> Dict[str, Any]:
        """Transcribe an audio file and return a result dictionary."""
        pass
    @abstractmethod
    async def transcribe_stream(self, audio_data: bytes, **kwargs) -> Dict[str, Any]:
        """Transcribe streaming audio data and return a result dictionary."""
        pass
    def _chunk_audio(self, audio: np.ndarray, sample_rate: int, chunk_duration: float) -> List[np.ndarray]:
        """Split ``audio`` into consecutive chunks of ``chunk_duration`` seconds.

        The final chunk may be shorter when the audio length is not an
        exact multiple of the chunk size. Chunks are views into ``audio``
        (numpy basic slicing), so no samples are copied.
        """
        chunk_size = int(sample_rate * chunk_duration)
        return [audio[i:i + chunk_size] for i in range(0, len(audio), chunk_size)]
    def _merge_segments(self, segments: List[List[Dict]], chunk_duration: float) -> List[Dict]:
        """Flatten per-chunk segment lists into one list with absolute timestamps.

        Each chunk's segments have their "start"/"end" shifted by the
        cumulative chunk offset so they refer to positions in the original
        (unchunked) audio.

        Fix: each segment dict is copied before shifting instead of being
        mutated in place, so callers holding references to the input (or
        calling this twice on the same data) see stable values.
        """
        merged_segments: List[Dict] = []
        for chunk_index, chunk_segments in enumerate(segments):
            # Chunks are fixed-length, so the absolute offset of chunk i
            # is simply i * chunk_duration.
            time_offset = chunk_index * chunk_duration
            for segment in chunk_segments:
                shifted = dict(segment)
                shifted["start"] = segment["start"] + time_offset
                shifted["end"] = segment["end"] + time_offset
                merged_segments.append(shifted)
        return merged_segments