File size: 3,433 Bytes
b10b75c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python3
"""gen_captions.py - Whisper -> SRT for burned-in captions via HF Inference API"""
import os, sys, base64, requests, subprocess, argparse

def transcribe_hf(audio_path, hf_token):
    """Transcribe an audio file with whisper-large-v3 on the HF Inference API.

    The whole file is read into memory, base64-encoded and POSTed as a JSON
    body that requests language ``"id"`` and word-level timestamps.

    Args:
        audio_path: Path of the audio file to upload.
        hf_token: Hugging Face access token, sent as a Bearer token.

    Returns:
        The ``"chunks"`` list from the JSON response, or an empty list when
        the response has no such key.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on a 4xx/5xx response.
    """
    endpoint = "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3"

    with open(audio_path, "rb") as audio_file:
        raw_audio = audio_file.read()
    encoded_audio = base64.b64encode(raw_audio).decode()

    request_headers = {
        "Authorization": f"Bearer {hf_token}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": encoded_audio,
        "parameters": {"language": "id", "return_timestamps": "word"},
    }

    print("Transcribing via HF Inference API (whisper-large-v3)...")
    # Generous timeout: the request blocks until the transcription is done.
    response = requests.post(endpoint, headers=request_headers, json=payload, timeout=300)
    response.raise_for_status()

    word_chunks = response.json().get("chunks", [])
    print(f"Got {len(word_chunks)} word chunks")
    return word_chunks

def _srt_timestamp(seconds):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
    # Round to whole milliseconds *before* splitting into fields; formatting
    # the seconds field on its own can round 59.9996 up to "60.000" and
    # produce an invalid timestamp such as 00:00:60,000.
    total_ms = max(0, round(seconds * 1000))
    hours, rest = divmod(total_ms, 3_600_000)
    minutes, rest = divmod(rest, 60_000)
    secs, millis = divmod(rest, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def chunks_to_srt(chunks, max_chars=38, gap=0.7):
    """Group word-level chunks into caption lines and render them as SRT text.

    Words are appended to the current caption until adding the next word
    would make the line longer than ``max_chars`` characters, or until the
    silence between the previous word's end and the next word's start
    exceeds ``gap`` seconds. A single word longer than ``max_chars`` still
    gets a caption of its own.

    Args:
        chunks: Iterable of dicts with a ``"text"`` string and a
            ``"timestamp"`` pair ``(start, end)`` in seconds. Either value
            (or the whole pair) may be missing/``None``: a missing start
            falls back to the previous word's end (0 at the very beginning),
            a missing end to ``start + 0.3``.
        max_chars: Maximum caption length in characters.
        gap: Pause in seconds that forces a new caption.

    Returns:
        The SRT document as a string: numbered cues separated by a blank
        line, or ``""`` when there is nothing to caption.
    """
    cues = []  # finished captions as (start, end, text)
    words = []  # words of the caption currently being built
    cue_start = cue_end = None
    last_end = 0

    for chunk in chunks:
        word = (chunk.get("text") or "").strip()
        if not word:
            # Whitespace-only chunks would otherwise leave double spaces.
            continue

        stamp = chunk.get("timestamp") or (None, None)
        start = stamp[0] if stamp[0] is not None else last_end
        end = stamp[1] if stamp[1] is not None else start + 0.3
        last_end = end

        if words:
            too_long = len(" ".join(words)) + 1 + len(word) > max_chars
            long_pause = start - cue_end > gap
            if too_long or long_pause:
                cues.append((cue_start, cue_end, " ".join(words)))
                words = []

        if not words:
            cue_start = start
        words.append(word)
        cue_end = end

    if words:
        cues.append((cue_start, cue_end, " ".join(words)))

    return "\n".join(
        f"{number}\n{_srt_timestamp(begin)} --> {_srt_timestamp(finish)}\n{text}\n"
        for number, (begin, finish, text) in enumerate(cues, start=1)
    )

def _write_srt(path, content):
    """Write *content* to *path* as UTF-8, creating the parent directory if needed."""
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)


def main():
    """Command-line entry point: extract audio, transcribe it, write an SRT file.

    Arguments:
        --input    Media file handed to ffmpeg (required).
        --srt-out  Destination SRT path (default project/assets/captions.srt).

    The audio track is converted by ffmpeg to a 16 kHz mono file at
    /tmp/caption_audio.wav. When the HF_TOKEN environment variable is set,
    the audio is transcribed through ``transcribe_hf`` and rendered with
    ``chunks_to_srt``. If HF_TOKEN is unset or transcription fails, a fixed
    two-cue fallback SRT is written instead.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        OSError: if the output file cannot be written.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--input", required=True)
    p.add_argument("--srt-out", default="project/assets/captions.srt")
    args = p.parse_args()
    hf_token = os.environ.get("HF_TOKEN", "")
    audio_path = "/tmp/caption_audio.wav"
    try:
        subprocess.run(["ffmpeg", "-y", "-i", args.input, "-vn", "-ar", "16000", "-ac", "1",
                        "-b:a", "32k", audio_path], capture_output=True, check=True)
    except subprocess.CalledProcessError as err:
        # ffmpeg's diagnostics are captured, so surface them before failing;
        # the exception message alone only carries the exit status.
        sys.stderr.write((err.stderr or b"").decode("utf-8", errors="replace"))
        raise
    print(f"Audio extracted: {os.path.getsize(audio_path)/1024:.0f} KB")

    srt = None
    if hf_token:
        # Best-effort: any transcription problem falls through to the
        # fallback captions. Writing the file is kept outside this try so
        # that I/O errors are not misreported as transcription failures.
        try:
            srt = chunks_to_srt(transcribe_hf(audio_path, hf_token))
        except Exception as e:
            print(f"HF transcription failed: {e} — using fallback SRT")

    if srt is not None:
        _write_srt(args.srt_out, srt)
        print(f"SRT written: {args.srt_out} ({len(srt.splitlines())} lines)")
        return

    fallback = "1\n00:00:00,500 --> 00:00:03,200\nTraining yang langsung bisa dipraktek\n\n2\n00:00:03,500 --> 00:00:06,000\nHasilnya nyata dan bisa dipakai tim\n\n"
    _write_srt(args.srt_out, fallback)
    print(f"Fallback SRT written: {args.srt_out}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()