Spaces:
Configuration error
Configuration error
File size: 3,433 Bytes
b10b75c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | #!/usr/bin/env python3
"""gen_captions.py - Whisper -> SRT for burned-in captions via HF Inference API"""
import os, sys, base64, requests, subprocess, argparse
def transcribe_hf(audio_path, hf_token):
    """Transcribe an audio file with whisper-large-v3 on the HF Inference API.

    The file at ``audio_path`` is read in full, base64-encoded and POSTed as
    JSON, requesting Indonesian (``"id"``) with word-level timestamps.

    Args:
        audio_path: Path of the audio file to upload.
        hf_token: Hugging Face token sent as a Bearer ``Authorization`` header.

    Returns:
        The value of the ``"chunks"`` key of the JSON response, or an empty
        list when the key is absent. Presumably each item is a dict with
        ``"text"`` and ``"timestamp"`` keys, as ``chunks_to_srt`` expects -
        confirm against the API response.

    Raises:
        Whatever ``requests`` raises for transport errors, plus the error
        from ``raise_for_status()`` on a non-success HTTP status.
    """
    with open(audio_path, "rb") as audio_file:
        raw_audio = audio_file.read()
    encoded_audio = base64.b64encode(raw_audio).decode()

    endpoint = "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3"
    request_headers = {
        "Authorization": f"Bearer {hf_token}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": encoded_audio,
        "parameters": {"language": "id", "return_timestamps": "word"},
    }

    print("Transcribing via HF Inference API (whisper-large-v3)...")
    # Generous timeout: the whole clip is uploaded and transcribed in one call.
    response = requests.post(endpoint, headers=request_headers, json=payload, timeout=300)
    response.raise_for_status()

    word_chunks = response.json().get("chunks", [])
    print(f"Got {len(word_chunks)} word chunks")
    return word_chunks
def _format_srt_time(t):
    """Render a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is converted to whole milliseconds first so that rounding
    carries into the seconds, minutes and hours fields: 59.9996 s becomes
    ``00:01:00,000`` rather than the invalid ``00:00:60,000``.
    Negative values are clamped to zero.
    """
    total_ms = max(0, round(t * 1000))
    hours, rest = divmod(total_ms, 3_600_000)
    minutes, rest = divmod(rest, 60_000)
    seconds, millis = divmod(rest, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


def chunks_to_srt(chunks, max_chars=38, gap=0.7):
    """Group word-level chunks into caption lines and render them as SRT.

    Words are accumulated into one caption until adding the next word would
    make the line longer than ``max_chars`` or the next word starts more
    than ``gap`` seconds after the previous word ended; then a new caption
    begins. A single word longer than ``max_chars`` still gets its own line.

    Args:
        chunks: Iterable of dicts with a ``"text"`` string and a
            ``"timestamp"`` pair ``[start, end]`` in seconds. A missing or
            ``None`` start is treated as 0; a missing or ``None`` end is
            treated as ``start + 0.3``. Chunks whose text is empty after
            stripping are ignored.
        max_chars: Maximum caption length in characters.
        gap: Pause (seconds) between consecutive words that forces a break.

    Returns:
        The SRT document as a string: numbered cues starting at 1, separated
        by a blank line. Empty string when there are no words.
    """
    cues = []  # finished captions as (start, end, text)
    words, line_len = [], 0
    line_start = line_end = None

    for chunk in chunks:
        word = (chunk.get("text") or "").strip()
        if not word:
            # Blank chunks would only add stray spaces to the caption.
            continue

        stamp = chunk.get("timestamp") or [0, 0]
        start = stamp[0] if stamp[0] is not None else 0
        # The API may leave the final end time open; assume a short word.
        end = stamp[1] if stamp[1] is not None else start + 0.3

        if words:
            too_long = line_len + 1 + len(word) > max_chars
            long_pause = start - line_end > gap
            if too_long or long_pause:
                cues.append((line_start, line_end, " ".join(words)))
                words, line_len = [], 0

        if not words:
            line_start = start
            line_len = len(word)
        else:
            line_len += 1 + len(word)  # +1 for the joining space
        words.append(word)
        line_end = end

    if words:
        cues.append((line_start, line_end, " ".join(words)))

    return "\n".join(
        f"{number}\n{_format_srt_time(start)} --> {_format_srt_time(end)}\n{text}\n"
        for number, (start, end, text) in enumerate(cues, 1)
    )
def _write_srt(path, content):
    """Write ``content`` to ``path`` as UTF-8, creating the parent directory.

    A bare filename has an empty dirname, and ``os.makedirs("")`` raises
    ``FileNotFoundError``, so the directory is only created when there is one.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as srt_file:
        srt_file.write(content)


def main():
    """Extract audio from ``--input``, transcribe it and write an SRT file.

    Steps:
      1. ffmpeg converts the input to 16 kHz mono audio at
         ``/tmp/caption_audio.wav`` (a failing ffmpeg aborts the script).
      2. If the ``HF_TOKEN`` environment variable is set, the audio is
         transcribed via ``transcribe_hf`` and rendered by ``chunks_to_srt``.
      3. If no token is set, or transcription fails for any reason, a fixed
         two-cue fallback SRT is written instead.

    The result goes to ``--srt-out`` (default
    ``project/assets/captions.srt``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--srt-out", default="project/assets/captions.srt")
    args = parser.parse_args()

    hf_token = os.environ.get("HF_TOKEN", "")
    # NOTE(review): fixed, shared temp path - concurrent runs would overwrite
    # each other's audio. Kept as-is in case other pipeline steps read it.
    audio_path = "/tmp/caption_audio.wav"
    subprocess.run(
        ["ffmpeg", "-y", "-i", args.input, "-vn", "-ar", "16000", "-ac", "1",
         "-b:a", "32k", audio_path],
        capture_output=True,
        check=True,
    )
    print(f"Audio extracted: {os.path.getsize(audio_path)/1024:.0f} KB")

    if hf_token:
        try:
            chunks = transcribe_hf(audio_path, hf_token)
            srt = chunks_to_srt(chunks)
        except Exception as e:
            # Deliberate best-effort: any transcription problem degrades to
            # the canned captions below instead of failing the pipeline.
            print(f"HF transcription failed: {e} — using fallback SRT")
        else:
            # Written outside the try so a file-system error is not
            # misreported as a transcription failure.
            _write_srt(args.srt_out, srt)
            print(f"SRT written: {args.srt_out} ({len(srt.splitlines())} lines)")
            return
    else:
        print("HF_TOKEN not set — using fallback SRT")

    fallback = "1\n00:00:00,500 --> 00:00:03,200\nTraining yang langsung bisa dipraktek\n\n2\n00:00:03,500 --> 00:00:06,000\nHasilnya nyata dan bisa dipakai tim\n\n"
    _write_srt(args.srt_out, fallback)
    print(f"Fallback SRT written: {args.srt_out}")
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|