Spaces:
Paused
Paused
| from abc import ABC, abstractmethod | |
| from typing import Dict, Any, List | |
| import numpy as np | |
class SpeechModelManager(ABC):
    """Abstract base class for speech transcription model backends.

    Concrete subclasses implement model loading and (file / streaming)
    transcription; the shared audio-chunking and segment-merging helpers
    live here.
    """

    def __init__(self, model_name: str, device: str):
        """
        Args:
            model_name: Identifier of the model to load.
            device: Compute device the model should run on
                (presumably e.g. "cpu" / "cuda" — confirm against callers).
        """
        self.model_name = model_name
        self.device = device
        # Flag for subclasses to flip once load_model() has completed.
        self.is_loaded = False

    @abstractmethod
    def load_model(self) -> None:
        """Load the model into memory. Implementations should set self.is_loaded."""
        ...

    @abstractmethod
    def transcribe(self, audio_file_path: str, **kwargs) -> Dict[str, Any]:
        """Transcribe an audio file and return the transcription result."""
        ...

    @abstractmethod
    async def transcribe_stream(self, audio_data: bytes, **kwargs) -> Dict[str, Any]:
        """Transcribe streaming audio data and return the transcription result."""
        ...

    def _chunk_audio(self, audio: np.ndarray, sample_rate: int, chunk_duration: float) -> List[np.ndarray]:
        """Split audio into consecutive chunks of at most chunk_duration seconds.

        The final chunk may be shorter when len(audio) is not an exact
        multiple of the chunk size; empty input yields an empty list.

        Args:
            audio: 1-D array of audio samples.
            sample_rate: Samples per second of *audio*.
            chunk_duration: Target chunk length in seconds.

        Returns:
            List of views into *audio*, in order.
        """
        chunk_size = int(sample_rate * chunk_duration)
        return [audio[i:i + chunk_size] for i in range(0, len(audio), chunk_size)]

    def _merge_segments(self, segments: List[List[Dict]], chunk_duration: float) -> List[Dict]:
        """Flatten per-chunk segment lists, shifting timestamps to absolute time.

        NOTE: segment dicts are mutated in place — their "start"/"end"
        values are offset by the chunk's position in the original audio.

        Args:
            segments: One list of segment dicts per audio chunk, in chunk order.
            chunk_duration: Duration (seconds) of each chunk, used as the
                timestamp offset between consecutive chunks.

        Returns:
            A single flat list of segments with absolute timestamps.
        """
        merged_segments: List[Dict] = []
        time_offset = 0.0
        for chunk_segments in segments:
            for segment in chunk_segments:
                segment["start"] += time_offset
                segment["end"] += time_offset
                merged_segments.append(segment)
            # Each chunk starts chunk_duration later than the previous one.
            time_offset += chunk_duration
        return merged_segments