| import subprocess, shutil, torch, os, tempfile |
| from transformers import pipeline |
| import imageio_ffmpeg as ffmpeg_helper |
| from logging_config import logger |
|
|
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
|
|
def ensure_ffmpeg() -> None:
    """Make sure an ``ffmpeg`` executable is reachable via PATH.

    If ffmpeg is already on PATH this is a no-op. Otherwise the binary
    bundled/downloaded by imageio-ffmpeg is located and its directory is
    prepended to ``PATH`` so later ``subprocess`` calls can find it.
    """
    # Fast path: a system ffmpeg already resolves — nothing to do.
    if shutil.which("ffmpeg") is not None:
        return
    # Fall back to the imageio-ffmpeg managed binary (auto-downloaded on
    # first use) and expose its directory through the process PATH.
    bundled = ffmpeg_helper.get_ffmpeg_exe()
    bin_dir = os.path.dirname(bundled)
    current_path = os.environ.get("PATH", "")
    os.environ["PATH"] = os.pathsep.join([bin_dir, current_path])
|
|
|
|
def to_wav(src: str) -> str:
    """Convert any audio/video file to the 16 kHz mono WAV Whisper expects.

    Args:
        src: Path to the input media file (any format ffmpeg can read).

    Returns:
        Path to a newly created temporary ``.wav`` file. The caller owns
        the file and is responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails to convert ``src``.
    """
    ensure_ffmpeg()
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp:
    # the file is created atomically, so no other process can claim the
    # path between name generation and ffmpeg writing to it.
    fd, wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # ffmpeg opens the path itself; keep only the name
    try:
        subprocess.run(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel",
                "error",
                "-i",
                src,
                "-ar",
                "16000",  # 16 kHz sample rate required by Whisper
                "-ac",
                "1",      # mono
                "-y",     # overwrite the pre-created temp file
                wav,
            ],
            check=True,
        )
    except Exception:
        # Don't leak the temp file when conversion fails.
        with suppress(OSError):
            os.remove(wav)
        raise
    return wav
|
|
|
|
def run_whisper_transcription(src: str):
    """Run openai/whisper-small via the HF ASR pipeline on a media file.

    Args:
        src: Path to any audio/video file ffmpeg can decode.

    Returns:
        List of dicts with keys ``text`` (stripped transcript chunk),
        ``start`` and ``end`` (chunk timestamps in seconds; ``end`` may be
        None for the final chunk, as emitted by the HF pipeline).
    """
    wav = to_wav(src)
    try:
        asr = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            device=0 if DEVICE == "cuda" else -1,
            return_timestamps=True,
            # Long-form transcription: 30 s windows with 5 s stride overlap.
            chunk_length_s=30,
            stride_length_s=5,
            generate_kwargs={"task": "transcribe", "language": "en"},
        )
        logger.info("Starting Whisper …")
        result = asr(wav)
        segments = [
            {
                "text": c["text"].strip(),
                "start": c["timestamp"][0],
                "end": c["timestamp"][1],
            }
            for c in result["chunks"]
            if c["text"].strip()  # drop empty/whitespace-only chunks
        ]
    finally:
        # to_wav hands ownership of the temp file to us — remove it even
        # when transcription fails, so repeated calls don't leak files.
        try:
            os.remove(wav)
        except OSError:
            pass
    logger.info("Transcribed %d segments", len(segments))
    return segments
|
|