Spaces:

AIgoose
/

hyperframes-video-studio

Configuration error

hyperframes-video-studio / scripts /gen_captions.py

AIGoose

feat: Bright Studio style — Space Grotesk 100px top, captions, trim, music, navy border rule

b10b75c 13 days ago

3.43 kB

	#!/usr/bin/env python3
	"""gen_captions.py - Whisper -> SRT for burned-in captions via HF Inference API"""
	import os, sys, base64, requests, subprocess, argparse

	def transcribe_hf(audio_path, hf_token, *, language="id",
	                  model="openai/whisper-large-v3", timeout=300):
	    """Transcribe an audio file with Whisper through the HF Inference API.

	    The file is read fully, base64-encoded and posted as JSON to the
	    ``hf-inference`` router endpoint of the chosen model, requesting
	    word-level timestamps.

	    Args:
	        audio_path: Path of the audio file to upload.
	        hf_token: Hugging Face access token, sent as a bearer credential.
	        language: Value sent as ``parameters.language`` (default ``"id"``).
	        model: Model repository id appended to the router URL.
	        timeout: Seconds to wait for the HTTP response.

	    Returns:
	        The ``"chunks"`` value of the JSON response, or an empty list when
	        the response has no such key.

	    Raises:
	        requests.HTTPError: If the API answers with an error status.
	    """
	    with open(audio_path, "rb") as f:
	        audio_b64 = base64.b64encode(f.read()).decode()

	    headers = {"Authorization": f"Bearer {hf_token}", "Content-Type": "application/json"}
	    body = {
	        "inputs": audio_b64,
	        "parameters": {"language": language, "return_timestamps": "word"},
	    }
	    url = f"https://router.huggingface.co/hf-inference/models/{model}"

	    # Show only the model name (without the owner prefix) in the progress line.
	    model_name = model.rsplit("/", 1)[-1]
	    print(f"Transcribing via HF Inference API ({model_name})...")

	    r = requests.post(url, headers=headers, json=body, timeout=timeout)
	    r.raise_for_status()
	    chunks = r.json().get("chunks", [])
	    print(f"Got {len(chunks)} word chunks")
	    return chunks

	def _srt_timestamp(seconds):
	    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
	    # Round to whole milliseconds first so the carry propagates into the
	    # seconds/minutes/hours fields: 59.9996 s must become 00:01:00,000,
	    # not 00:00:60,000.
	    total_ms = max(0, int(round(seconds * 1000)))
	    hours, rem = divmod(total_ms, 3_600_000)
	    minutes, rem = divmod(rem, 60_000)
	    secs, millis = divmod(rem, 1000)
	    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


	def chunks_to_srt(chunks, max_chars=38, gap=0.7):
	    """Group word-level chunks into caption cues and render them as SRT text.

	    Words are appended to the current cue until adding the next word would
	    make the cue text longer than ``max_chars`` or the silence between the
	    previous word's end and the next word's start exceeds ``gap`` seconds;
	    that word then starts a new cue. A single word longer than ``max_chars``
	    still gets a cue of its own.

	    Args:
	        chunks: Iterable of dicts with a ``"text"`` string and a
	            ``"timestamp"`` pair ``(start, end)`` in seconds. Either
	            timestamp may be ``None``: a missing start falls back to the
	            previous word's end (0 for the first word) and a missing end
	            to ``start + 0.3``. Chunks whose text is empty or whitespace
	            are ignored.
	        max_chars: Maximum number of characters per cue line.
	        gap: Pause length in seconds that forces a new cue.

	    Returns:
	        The SRT document as a string (cues numbered from 1 and separated by
	        a blank line), or ``""`` when there is nothing to caption.
	    """
	    cues = []  # finished cues as (start, end, text)
	    words = []  # words of the cue currently being built
	    cue_start = cue_end = None
	    prev_end = 0.0

	    for chunk in chunks:
	        word = (chunk.get("text") or "").strip()
	        if not word:
	            # An empty word would only add a double space to the caption.
	            continue

	        stamp = chunk.get("timestamp") or (None, None)
	        start = stamp[0] if stamp[0] is not None else prev_end
	        end = stamp[1] if stamp[1] is not None else start + 0.3
	        prev_end = end

	        if words:
	            too_long = len(" ".join(words)) + 1 + len(word) > max_chars
	            paused = start - cue_end > gap
	            if too_long or paused:
	                cues.append((cue_start, cue_end, " ".join(words)))
	                words = []

	        if not words:
	            cue_start = start
	        words.append(word)
	        cue_end = end

	    if words:
	        cues.append((cue_start, cue_end, " ".join(words)))

	    return "\n".join(
	        f"{number}\n{_srt_timestamp(begin)} --> {_srt_timestamp(finish)}\n{text}\n"
	        for number, (begin, finish, text) in enumerate(cues, start=1)
	    )

	def _write_srt(path, content):
	    """Write *content* to *path* as UTF-8, creating the parent directory if any."""
	    parent = os.path.dirname(path)
	    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames
	    # such as "captions.srt".
	    if parent:
	        os.makedirs(parent, exist_ok=True)
	    with open(path, "w", encoding="utf-8") as f:
	        f.write(content)


	def main():
	    """Command-line entry point: extract audio, transcribe it, write an SRT.

	    Options:
	        --input: Media file handed to ffmpeg (required).
	        --srt-out: Destination SRT path
	            (default ``project/assets/captions.srt``).

	    The audio track is extracted with ffmpeg to ``/tmp/caption_audio.wav``
	    (``-vn -ar 16000 -ac 1``). When the ``HF_TOKEN`` environment variable is
	    set, the audio is transcribed and the resulting captions are written.
	    If the token is missing, or transcription or writing fails, a fixed
	    two-cue fallback SRT is written instead.

	    Raises:
	        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
	            status; its captured stderr is echoed first.
	    """
	    p = argparse.ArgumentParser()
	    p.add_argument("--input", required=True)
	    p.add_argument("--srt-out", default="project/assets/captions.srt")
	    args = p.parse_args()

	    hf_token = os.environ.get("HF_TOKEN", "")
	    audio_path = "/tmp/caption_audio.wav"
	    ffmpeg_cmd = [
	        "ffmpeg", "-y", "-i", args.input, "-vn", "-ar", "16000", "-ac", "1",
	        "-b:a", "32k", audio_path,
	    ]
	    try:
	        subprocess.run(ffmpeg_cmd, capture_output=True, check=True)
	    except subprocess.CalledProcessError as err:
	        # The output was captured, so without this the reason for the ffmpeg
	        # failure would never reach the user.
	        sys.stderr.write((err.stderr or b"").decode("utf-8", errors="replace"))
	        raise
	    print(f"Audio extracted: {os.path.getsize(audio_path)/1024:.0f} KB")

	    if hf_token:
	        # Best effort: any failure here falls through to the fallback SRT.
	        try:
	            chunks = transcribe_hf(audio_path, hf_token)
	            srt = chunks_to_srt(chunks)
	            _write_srt(args.srt_out, srt)
	            print(f"SRT written: {args.srt_out} ({len(srt.splitlines())} lines)")
	            return
	        except Exception as e:
	            print(f"HF transcription failed: {e} — using fallback SRT")

	    fallback = "1\n00:00:00,500 --> 00:00:03,200\nTraining yang langsung bisa dipraktek\n\n2\n00:00:03,500 --> 00:00:06,000\nHasilnya nyata dan bisa dipakai tim\n\n"
	    _write_srt(args.srt_out, fallback)
	    print(f"Fallback SRT written: {args.srt_out}")

	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()