Spaces:
Sleeping
Sleeping
File size: 2,292 Bytes
dd5bcef 1333284 dd5bcef 1333284 dd5bcef 1333284 dd5bcef 1333284 dd5bcef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
from pathlib import Path
from typing import Callable, List
from openai import OpenAI
from openai.types.audio import TranscriptionVerbose
from pyannote.pipeline import Pipeline
from pydantic import BaseModel
from pydub import AudioSegment
from openai import BadRequestError
class TranscriptSegment(BaseModel):
    """One diarized speaker turn paired with its Whisper transcription."""

    # Path of the exported per-turn mp3 chunk on disk.
    audio_file: str | Path
    # Speaker label yielded by pyannote's itertracks(yield_label=True).
    speaker: str
    # Track identifier from itertracks() — presumably the track name; TODO confirm semantics.
    i: str
    # Turn boundaries in milliseconds (pyannote seconds * 1000 at the call site).
    start: float
    end: float
    # Full verbose_json response returned by the OpenAI transcription API.
    transcript: TranscriptionVerbose
def get_transcripts(
    diarization: Pipeline,
    audio_segment: AudioSegment,
    openai_api_key: str,
    whisper_model: str,
    whisper_prompt: str,
    whisper_language: str | None,
    tmp_dir: Path,
    progress_callback: Callable[[int, int], None] | None = None,
) -> List[TranscriptSegment]:
    """Transcribe each diarized speaker turn with OpenAI Whisper.

    For every track in the diarization result, the matching slice of
    ``audio_segment`` is exported to ``tmp_dir`` as mp3 and sent to the
    OpenAI transcription endpoint.

    Args:
        diarization: pyannote pipeline result exposing
            ``speaker_diarization.itertracks``.
        audio_segment: Full recording; sliced per turn (pydub slices in ms).
        openai_api_key: API key for the OpenAI client.
        whisper_model: Transcription model name.
        whisper_prompt: Prompt passed to Whisper for each chunk.
        whisper_language: Optional language hint; omitted from the request
            when falsy.
        tmp_dir: Directory receiving the exported per-turn mp3 files.
        progress_callback: Optional ``(current, total)`` hook invoked once
            per turn before it is processed.

    Returns:
        A TranscriptSegment per successfully transcribed turn. Turns the
        API rejects as too short are skipped silently.

    Raises:
        openai.BadRequestError: Re-raised for any API rejection other than
            the "too short" audio case.
    """
    client = OpenAI(api_key=openai_api_key)
    transcripts: List[TranscriptSegment] = []
    # Materialize the tracks once: the original iterated the diarization a
    # first time just to count segments for progress reporting.
    tracks = list(
        diarization.speaker_diarization.itertracks(yield_label=True)
    )
    total_segments = len(tracks)
    for segment_index, (turn, i, speaker) in enumerate(tracks, start=1):
        if progress_callback:
            progress_callback(segment_index, total_segments)
        # pyannote timestamps are seconds; pydub slicing expects milliseconds.
        start = turn.start * 1000
        end = turn.end * 1000
        chunk = audio_segment[slice(start, end)]
        chunk_filename = tmp_dir.joinpath(f"segment-{start}.mp3")
        chunk.export(chunk_filename, format="mp3")
        params = {
            "model": whisper_model,
            "response_format": "verbose_json",
            "timestamp_granularities": ["segment"],
            "prompt": whisper_prompt,
        }
        if whisper_language:
            params["language"] = whisper_language
        # Use a context manager: the original leaked the file handle, and the
        # `continue` below abandoned it on every skipped segment.
        with open(chunk_filename, "rb") as audio_chunk_file:
            try:
                transcript = client.audio.transcriptions.create(
                    file=audio_chunk_file, **params
                )
            except BadRequestError as e:
                # The API rejects clips below its minimum duration; skip them.
                if "too short" in e.message.lower():
                    continue
                raise
        transcripts.append(
            TranscriptSegment(
                audio_file=chunk_filename,
                speaker=speaker,
                i=i,
                start=start,
                end=end,
                transcript=transcript,
            )
        )
    return transcripts
|