# NOTE(review): the lines below were Hugging Face Spaces page residue
# ("Spaces: Sleeping Sleeping") left over from a web scrape, not code.
| from pathlib import Path | |
| from typing import Callable, List | |
| from openai import OpenAI | |
| from openai.types.audio import TranscriptionVerbose | |
| from pyannote.pipeline import Pipeline | |
| from pydantic import BaseModel | |
| from pydub import AudioSegment | |
| from openai import BadRequestError | |
class TranscriptSegment(BaseModel):
    """One diarized speaker turn paired with its Whisper transcription."""

    # Path of the exported MP3 chunk for this turn (set from tmp_dir in get_transcripts).
    audio_file: str | Path
    # Diarization speaker label yielded by itertracks(yield_label=True).
    speaker: str
    # Track identifier (second item yielded by itertracks); terse name kept for compatibility.
    i: str
    # Turn start in milliseconds (pyannote seconds * 1000 — see get_transcripts).
    start: float
    # Turn end in milliseconds.
    end: float
    # Verbose JSON response returned by the OpenAI audio transcription API.
    transcript: TranscriptionVerbose
def get_transcripts(
    diarization: Pipeline,
    audio_segment: AudioSegment,
    openai_api_key: str,
    whisper_model: str,
    whisper_prompt: str,
    whisper_language: str | None,
    tmp_dir: Path,
    progress_callback: Callable[[int, int], None] | None = None,
) -> List[TranscriptSegment]:
    """Transcribe each diarized speaker turn with the OpenAI Whisper API.

    For every (turn, track, speaker) yielded by the diarization result, the
    matching slice of *audio_segment* is exported as an MP3 under *tmp_dir*
    and sent to the transcription endpoint.

    Args:
        diarization: Pipeline result exposing ``speaker_diarization.itertracks``.
        audio_segment: Full recording; sliced per turn (pydub slices in ms).
        openai_api_key: API key used to construct the OpenAI client.
        whisper_model: Transcription model name passed to the API.
        whisper_prompt: Prompt forwarded to Whisper.
        whisper_language: Optional language code; omitted from the request
            when ``None`` or empty.
        tmp_dir: Directory that receives the per-turn MP3 chunks; assumed to
            exist already.
        progress_callback: Optional ``(current, total)`` hook invoked once per
            turn, before the turn is processed.

    Returns:
        One ``TranscriptSegment`` per successfully transcribed turn.  Turns
        the API rejects as too short are skipped.

    Raises:
        BadRequestError: Re-raised for any API rejection other than
            "audio too short".
    """
    client = OpenAI(api_key=openai_api_key)
    transcripts: List[TranscriptSegment] = []

    # itertracks() yields a fresh iterator per call, so counting here does
    # not consume the iteration below.
    total_segments = sum(1 for _ in diarization.speaker_diarization.itertracks())

    for segment_index, (turn, track, speaker) in enumerate(
        diarization.speaker_diarization.itertracks(yield_label=True), start=1
    ):
        if progress_callback:
            progress_callback(segment_index, total_segments)

        # pyannote reports seconds; pydub slices in milliseconds.
        start = turn.start * 1000
        end = turn.end * 1000
        chunk = audio_segment[slice(start, end)]

        chunk_filename = tmp_dir.joinpath(f"segment-{start}.mp3")
        chunk.export(chunk_filename, format="mp3")

        params = {
            "model": whisper_model,
            "response_format": "verbose_json",
            "timestamp_granularities": ["segment"],
            "prompt": whisper_prompt,
        }
        if whisper_language:
            params["language"] = whisper_language

        try:
            # Fix: the original leaked this file handle; close it
            # deterministically with a context manager.
            with open(chunk_filename, "rb") as audio_chunk_file:
                transcript = client.audio.transcriptions.create(
                    file=audio_chunk_file, **params
                )
        except BadRequestError as e:
            # Whisper rejects very short clips; skip those turns and
            # surface every other API error to the caller.
            if "too short" in e.message.lower():
                continue
            raise

        transcripts.append(
            TranscriptSegment(
                audio_file=chunk_filename,
                speaker=speaker,
                i=track,
                start=start,
                end=end,
                transcript=transcript,
            )
        )
    return transcripts