from pathlib import Path
from typing import Callable, List

from openai import BadRequestError, OpenAI
from openai.types.audio import TranscriptionVerbose
from pyannote.pipeline import Pipeline
from pydantic import BaseModel
from pydub import AudioSegment


class TranscriptSegment(BaseModel):
    audio_file: str | Path
    speaker: str
    i: str
    start: float
    end: float
    transcript: TranscriptionVerbose


def get_transcripts(
    diarization: Pipeline,
    audio_segment: AudioSegment,
    openai_api_key: str,
    whisper_model: str,
    whisper_prompt: str,
    whisper_language: str | None,
    tmp_dir: Path,
    progress_callback: Callable[[int, int], None] | None = None,
) -> List[TranscriptSegment]:
    client = OpenAI(api_key=openai_api_key)
    transcripts = []

    # Count the speaker turns up front so the progress callback can report a total.
    total_segments = sum(1 for _ in diarization.speaker_diarization.itertracks())

    segment_index = 0
    for turn, i, speaker in diarization.speaker_diarization.itertracks(
        yield_label=True
    ):
        segment_index += 1
        if progress_callback:
            progress_callback(segment_index, total_segments)

        # pyannote reports turn boundaries in seconds; pydub slices in milliseconds.
        start = turn.start * 1000
        end = turn.end * 1000
        chunk = audio_segment[start:end]

        chunk_filename = tmp_dir.joinpath(f"segment-{start}.mp3")
        chunk.export(chunk_filename, format="mp3")

        params = {
            "model": whisper_model,
            "response_format": "verbose_json",
            "timestamp_granularities": ["segment"],
            "prompt": whisper_prompt,
        }
        if whisper_language:
            params["language"] = whisper_language

        with open(chunk_filename, "rb") as audio_chunk_file:
            try:
                transcript = client.audio.transcriptions.create(
                    file=audio_chunk_file, **params
                )
            except BadRequestError as e:
                # The API rejects clips below its minimum length; skip those turns.
                if "too short" in e.message.lower():
                    continue
                raise

        transcripts.append(
            TranscriptSegment(
                audio_file=chunk_filename,
                speaker=speaker,
                i=i,
                start=start,
                end=end,
                transcript=transcript,
            )
        )

    return transcripts
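
# A minimal usage sketch, not part of the module itself. It assumes the caller
# has already produced a diarization object exposing
# `.speaker_diarization.itertracks(yield_label=True)` (e.g. by applying a
# pyannote speaker-diarization pipeline to the audio); "interview.mp3" and the
# environment-variable lookup are placeholders.
if __name__ == "__main__":
    import os
    import tempfile

    # Hypothetical: obtain the diarization result however your pipeline does.
    diarization = ...  # e.g. a wrapper around a pyannote pipeline's output

    audio = AudioSegment.from_file("interview.mp3")  # placeholder input file

    with tempfile.TemporaryDirectory() as tmp:
        segments = get_transcripts(
            diarization=diarization,
            audio_segment=audio,
            openai_api_key=os.environ["OPENAI_API_KEY"],
            whisper_model="whisper-1",
            whisper_prompt="",
            whisper_language=None,
            tmp_dir=Path(tmp),
            progress_callback=lambda done, total: print(f"{done}/{total}"),
        )

    for seg in segments:
        print(seg.speaker, seg.start, seg.end, seg.transcript.text)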