| import subprocess, shutil, torch, os, tempfile |
| from transformers import pipeline |
| import imageio_ffmpeg as ffmpeg_helper |
| from logging_config import logger |
|
|
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
|
|
def ensure_ffmpeg() -> None:
    """Make sure an ``ffmpeg`` executable is reachable via PATH.

    If ffmpeg is already on PATH this is a no-op. Otherwise the binary
    bundled/downloaded by imageio-ffmpeg is located and its directory is
    prepended to ``PATH`` so later ``subprocess`` calls can find it.
    """
    # Fast path: a system ffmpeg already resolves — nothing to do.
    if shutil.which("ffmpeg") is not None:
        return
    # Fall back to the imageio-ffmpeg managed binary (auto-downloaded on
    # first use) and expose its directory through the process PATH.
    bundled = ffmpeg_helper.get_ffmpeg_exe()
    bin_dir = os.path.dirname(bundled)
    current_path = os.environ.get("PATH", "")
    os.environ["PATH"] = os.pathsep.join([bin_dir, current_path])
|
|
|
|
def to_wav(src: str) -> str:
    """Convert any audio/video file to the 16 kHz mono WAV Whisper expects.

    Args:
        src: Path to the input media file (any format ffmpeg can read).

    Returns:
        Path to a newly created temporary ``.wav`` file. The caller owns
        the file and is responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails to convert ``src``.
    """
    ensure_ffmpeg()
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp:
    # the file is created atomically, so no other process can claim the
    # path between name generation and ffmpeg writing to it.
    fd, wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # ffmpeg opens the path itself; keep only the name
    try:
        subprocess.run(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel",
                "error",
                "-i",
                src,
                "-ar",
                "16000",  # 16 kHz sample rate required by Whisper
                "-ac",
                "1",      # mono
                "-y",     # overwrite the pre-created temp file
                wav,
            ],
            check=True,
        )
    except Exception:
        # Don't leak the temp file when conversion fails.
        with suppress(OSError):
            os.remove(wav)
        raise
    return wav
|
|
|
|
def run_whisper_transcription(src: str):
    """Run openai/whisper-small via the HF ASR pipeline on a media file.

    Args:
        src: Path to any audio/video file ffmpeg can decode.

    Returns:
        List of dicts with keys ``text`` (stripped transcript chunk),
        ``start`` and ``end`` (chunk timestamps in seconds; ``end`` may be
        None for the final chunk, as emitted by the HF pipeline).
    """
    wav = to_wav(src)
    try:
        asr = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            device=0 if DEVICE == "cuda" else -1,
            return_timestamps=True,
            # Long-form transcription: 30 s windows with 5 s stride overlap.
            chunk_length_s=30,
            stride_length_s=5,
            generate_kwargs={"task": "transcribe", "language": "en"},
        )
        logger.info("Starting Whisper …")
        result = asr(wav)
        segments = [
            {
                "text": c["text"].strip(),
                "start": c["timestamp"][0],
                "end": c["timestamp"][1],
            }
            for c in result["chunks"]
            if c["text"].strip()  # drop empty/whitespace-only chunks
        ]
    finally:
        # to_wav hands ownership of the temp file to us — remove it even
        # when transcription fails, so repeated calls don't leak files.
        try:
            os.remove(wav)
        except OSError:
            pass
    logger.info("Transcribed %d segments", len(segments))
    return segments
|
|