File size: 4,625 Bytes
cce83a6
00a1efc
 
cce83a6
00a1efc
 
 
eb04a70
cc8bbf8
696104d
00a1efc
04f489d
696104d
 
 
 
 
 
 
 
cce83a6
696104d
 
 
 
 
 
 
 
 
 
 
 
cce83a6
696104d
cce83a6
696104d
 
 
 
 
 
749833a
696104d
cce83a6
696104d
 
749833a
cc8bbf8
696104d
 
 
 
 
 
 
 
 
 
749833a
696104d
cce83a6
696104d
749833a
696104d
749833a
cce83a6
 
749833a
 
cce83a6
 
 
 
 
 
696104d
 
 
 
749833a
696104d
 
 
cce83a6
696104d
 
 
 
ef2a1d2
cce83a6
dccaaef
696104d
 
 
 
 
 
 
 
 
 
 
 
 
cce83a6
 
 
04f489d
696104d
cce83a6
 
 
 
696104d
 
f888a3d
696104d
749833a
696104d
cce83a6
 
04f489d
696104d
 
 
00a1efc
cce83a6
 
 
28609fc
cce83a6
749833a
ef2a1d2
cce83a6
 
 
00a1efc
04f489d
cce83a6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# app.py
import os
import tempfile
import uuid
from pathlib import Path
import gradio as gr
import ffmpeg
from faster_whisper import WhisperModel

# -------- Helper functions --------
def _format_timestamp(seconds: float) -> str:
    ms = int(round(seconds * 1000))
    hours = ms // 3600000
    ms_rem = ms % 3600000
    minutes = ms_rem // 60000
    ms_rem = ms_rem % 60000
    secs = ms_rem // 1000
    millis = ms_rem % 1000
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def segments_to_srt(segments: list) -> str:
    """Render transcription segments as SubRip (.srt) text.

    Args:
        segments: Sequence of mappings with ``"start"`` and ``"end"`` offsets
            (passed to ``_format_timestamp``) and a ``"text"`` string.

    Returns:
        The cue blocks joined by blank lines, or an empty string when no
        segment contains any text.
    """
    blocks = []
    for seg in segments:
        # Newlines inside a cue are flattened so each cue is a single line.
        text = seg["text"].replace("\n", " ").strip()
        if not text:
            continue
        # Number cues by emission order: using the raw segment index would
        # leave gaps in the sequence whenever an empty segment is skipped.
        cue_number = len(blocks) + 1
        start_ts = _format_timestamp(seg["start"])
        end_ts = _format_timestamp(seg["end"])
        blocks.append(f"{cue_number}\n{start_ts} --> {end_ts}\n{text}\n")
    return "\n".join(blocks)


# -------- Config --------
# Model identifier and device handed to WhisperModel below.
MODEL_NAME = "Systran/faster-whisper-small"  # good for HF CPU
DEVICE = "cpu"
# Directory that receives the generated .srt files; created at import time.
OUTPUT_DIR = Path("outputs/subtitles")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# The model is loaded once when this module is imported and the resulting
# module-level `model` object is reused by every transcription call.
print(f"Loading model {MODEL_NAME} on {DEVICE} ...")
model = WhisperModel(MODEL_NAME, device=DEVICE)
print("Model loaded.")


# -------- Core functions --------
def extract_audio(input_path: str, out_path: str) -> None:
    """Extract the audio of a media file as a mono 16 kHz 16-bit PCM WAV.

    Args:
        input_path: Path of the source video or audio file.
        out_path: Destination WAV path; an existing file is overwritten.

    Raises:
        RuntimeError: If ffmpeg fails. The message carries ffmpeg's stderr
            output when it was captured, and the original ``ffmpeg.Error``
            is attached as the cause.
    """
    stream = (
        ffmpeg
        .input(input_path)
        .output(out_path, format="wav", acodec="pcm_s16le", ac=1, ar="16000")
        .overwrite_output()
    )
    try:
        stream.run(quiet=True)
    except ffmpeg.Error as e:
        stderr = getattr(e, "stderr", None)
        # ffmpeg's output is not guaranteed to be valid UTF-8; replace bad
        # bytes so a decode failure never hides the real ffmpeg error.
        msg = stderr.decode("utf-8", errors="replace") if stderr else str(e)
        raise RuntimeError(f"ffmpeg error: {msg}") from e


def transcribe_file_to_srt(file_obj, language: str = "en"):
    """Transcribe an uploaded media file and save the result as an .srt file.

    Args:
        file_obj: Upload object exposing a ``name`` attribute that holds the
            file path (presumably what the Gradio File input passes in;
            verify against the Gradio version in use).
        language: Language code forwarded to ``model.transcribe``.

    Returns:
        A ``(srt_path, status_message)`` tuple, where ``srt_path`` is the
        path of the written subtitle file as a string.

    Raises:
        ValueError: If ``file_obj`` is ``None`` (nothing was uploaded).
        RuntimeError: Propagated from ``extract_audio`` when ffmpeg fails.
    """
    if file_obj is None:
        raise ValueError("No file was provided for transcription.")

    source_name = file_obj.name

    # The scratch directory (copied input + extracted WAV) is removed when the
    # block exits, including on errors, so requests do not pile up temp files.
    with tempfile.TemporaryDirectory(prefix="subgen_") as tmp:
        tmp_dir = Path(tmp)

        # Handle Hugging Face NamedString / Path
        input_path = Path(source_name)
        if not input_path.exists():
            input_path = tmp_dir / Path(source_name).name
            if hasattr(file_obj, "read_bytes"):
                input_path.write_bytes(file_obj.read_bytes())
            else:
                # NOTE(review): this re-opens the same path that was just
                # found missing, so it will raise FileNotFoundError.
                with open(source_name, "rb") as src, open(input_path, "wb") as dst:
                    dst.write(src.read())

        # Extract audio and transcribe
        audio_path = tmp_dir / "audio.wav"
        extract_audio(str(input_path), str(audio_path))
        segments, _ = model.transcribe(str(audio_path), language=language)
        # Materialize the segments while the temp files still exist, in case
        # the returned iterable is evaluated lazily.
        segs = [{"start": s.start, "end": s.end, "text": s.text} for s in segments]

    srt_text = segments_to_srt(segs)

    # Save .srt file
    output_path = OUTPUT_DIR / f"{Path(source_name).stem}.srt"
    output_path.write_text(srt_text, encoding="utf-8")
    return str(output_path), "✅ Subtitles generated successfully!"


# -------- Gradio UI --------
with gr.Blocks(title="AI Subtitle Generator") as demo:
    # Name of the active theme ("light" or "dark"), held in a gr.State.
    theme_state = gr.State("light")

    def toggle_theme(current):
        """Return the opposite theme name of ``current``."""
        return "dark" if current == "light" else "light"

    def apply_theme(theme_mode):
        """Return an update whose value is a <style> rule for ``theme_mode``."""
        if theme_mode == "dark":
            bg = "linear-gradient(135deg, #0f2027, #203a43, #2c5364)"
            color = "#ffffff"
        else:
            bg = "linear-gradient(135deg, #fdfbfb, #ebedee)"
            color = "#000000"
        return gr.update(
            value=f"<style>body {{ background: {bg}; color: {color}; }}</style>"
        )

    gr.HTML("<h1 style='text-align:center;'>🎬 AI Subtitle Generator</h1>")
    gr.HTML(
        "<p style='text-align:center;'>Upload a video or audio file to generate English <b>.srt</b> subtitles.</p>"
    )

    # Empty HTML component that receives the <style> rule from apply_theme.
    style_box = gr.HTML("")
    theme_btn = gr.Button("🌙 Toggle Light/Dark Mode")

    with gr.Row():
        input_file = gr.File(label="Upload video/audio file")
        output_file = gr.File(label="Download .srt file")

    status_box = gr.Textbox(label="Status", interactive=False)

    def on_click(file):
        """Transcribe the uploaded file; return ``(srt_path, status_message)``."""
        if file is None:
            # transcribe_file_to_srt cannot handle a missing upload, so report
            # the problem in the status box instead of raising.
            return None, "Please upload a video or audio file first."
        return transcribe_file_to_srt(file)

    # Flip the stored theme first, then render the matching <style> rule.
    theme_btn.click(
        toggle_theme, inputs=[theme_state], outputs=[theme_state]
    ).then(apply_theme, inputs=[theme_state], outputs=[style_box])

    generate_btn = gr.Button("Generate Subtitles")
    generate_btn.click(on_click, inputs=[input_file], outputs=[output_file, status_box])

    gr.HTML(
        "<p style='text-align:center;font-size:14px;opacity:0.7;'>Powered by Faster-Whisper + Gradio UI</p>"
    )

# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()