File size: 16,249 Bytes
773be53
 
 
36c1271
773be53
a7fcab4
773be53
a7fcab4
 
 
 
773be53
 
 
 
 
 
bc14a9e
 
 
a7fcab4
 
 
 
773be53
a7fcab4
 
 
 
 
773be53
a7fcab4
36c1271
a7fcab4
773be53
 
36c1271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773be53
 
36c1271
 
 
a7fcab4
 
 
36c1271
a7fcab4
 
 
 
 
36c1271
 
a7fcab4
 
 
 
 
 
 
36c1271
 
a7fcab4
 
 
 
 
 
36c1271
 
a7fcab4
 
 
36c1271
 
a7fcab4
 
 
36c1271
a7fcab4
 
 
 
773be53
36c1271
773be53
a7fcab4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773be53
a7fcab4
773be53
a7fcab4
773be53
a7fcab4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36c1271
773be53
a7fcab4
 
 
36c1271
 
773be53
a7fcab4
 
36c1271
773be53
 
a7fcab4
 
 
773be53
36c1271
773be53
a7fcab4
36c1271
 
 
a7fcab4
36c1271
a7fcab4
36c1271
 
773be53
 
36c1271
a7fcab4
 
 
 
773be53
a7fcab4
 
773be53
36c1271
a7fcab4
36c1271
 
a7fcab4
36c1271
a7fcab4
773be53
 
a7fcab4
 
 
773be53
a7fcab4
 
773be53
a7fcab4
773be53
a7fcab4
 
 
 
 
 
 
 
 
773be53
 
a7fcab4
773be53
 
a7fcab4
 
 
773be53
4a04d86
773be53
a7fcab4
 
 
 
 
 
773be53
a7fcab4
773be53
 
 
a7fcab4
 
 
 
 
773be53
a7fcab4
 
36c1271
a7fcab4
 
 
 
773be53
36c1271
 
a7fcab4
36c1271
a7fcab4
773be53
a7fcab4
36c1271
4a04d86
a7fcab4
773be53
a7fcab4
 
 
 
 
 
773be53
a7fcab4
773be53
a7fcab4
 
 
 
773be53
36c1271
 
a7fcab4
36c1271
a7fcab4
36c1271
a7fcab4
 
 
 
773be53
a7fcab4
773be53
36c1271
 
a7fcab4
 
 
 
 
36c1271
a7fcab4
36c1271
 
a7fcab4
 
 
 
 
 
 
 
36c1271
 
a7fcab4
36c1271
 
 
a7fcab4
36c1271
 
a7fcab4
36c1271
 
a7fcab4
 
36c1271
 
a7fcab4
 
36c1271
a7fcab4
 
 
773be53
36c1271
a7fcab4
 
 
36c1271
 
a7fcab4
36c1271
 
a7fcab4
36c1271
 
a7fcab4
 
 
 
36c1271
 
a7fcab4
 
 
 
 
36c1271
a7fcab4
 
 
36c1271
773be53
a7fcab4
773be53
4a04d86
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
import gradio as gr
import tempfile
import os
import json
import time
from pathlib import Path

# ── faster-whisper (Python 3.13 compatible, pre-built wheel, no build step) ───
from faster_whisper import WhisperModel

# ── TTS: gTTS ─────────────────────────────────────────────────────────────────
try:
    from gtts import gTTS
    GTTS_AVAILABLE = True
except ImportError:
    # gTTS is optional; when missing, the TTS tab reports it and eSpeak
    # remains usable.
    GTTS_AVAILABLE = False

# ── Device β€” HF free tier is CPU only ─────────────────────────────────────────
DEVICE  = "cpu"
COMPUTE = "int8"   # int8 quantisation: half the RAM, same accuracy, faster on CPU
print(f"[INFO] device={DEVICE}  compute={COMPUTE}")

# ── Model cache ───────────────────────────────────────────────────────────────
# Maps checkpoint name -> loaded model so each checkpoint is loaded only once
# per process (see load_model below).
_model_cache: dict[str, WhisperModel] = {}

def load_model(name: str) -> WhisperModel:
    """Return the cached WhisperModel for `name`, loading it on first use."""
    model = _model_cache.get(name)
    if model is None:
        print(f"[INFO] Loading faster-whisper '{name}'...")
        model = WhisperModel(name, device=DEVICE, compute_type=COMPUTE)
        _model_cache[name] = model
    return model

# ── Constants ─────────────────────────────────────────────────────────────────
# Checkpoint names accepted by faster-whisper (larger = slower but better).
WHISPER_MODELS = ["tiny", "base", "small", "medium"]

# UI label -> Whisper language code; None means "let the model auto-detect".
LANGUAGES = {
    "Auto Detect": None,
    "English": "en",    "Spanish": "es",  "French": "fr",
    "German": "de",     "Italian": "it",  "Portuguese": "pt",
    "Russian": "ru",    "Japanese": "ja", "Chinese": "zh",
    "Arabic": "ar",     "Hindi": "hi",    "Korean": "ko",
    "Dutch": "nl",      "Polish": "pl",   "Turkish": "tr",
    "Swedish": "sv",    "Danish": "da",   "Finnish": "fi",
}

# UI label -> gTTS language/locale tag (passed straight to gTTS(lang=...)).
GTTS_LANGS = {
    "English (US)": "en",   "English (UK)": "en-gb",
    "Spanish": "es",        "French": "fr",
    "German": "de",         "Italian": "it",
    "Portuguese": "pt",     "Russian": "ru",
    "Japanese": "ja",       "Chinese": "zh-CN",
    "Arabic": "ar",         "Hindi": "hi",
    "Korean": "ko",
}

# Voice identifiers forwarded to the `espeak -v` command-line flag.
ESPEAK_VOICES = ["en", "en-us", "en-gb", "es", "fr", "de", "it", "pt", "ru", "zh", "ar", "hi"]


# ──────────────────────────────────────────────────────────────────────────────
#  Helpers
# ──────────────────────────────────────────────────────────────────────────────

def fmt_ts(s: float) -> str:
    """Format a time offset in seconds as zero-padded `HH:MM:SS.mmm`."""
    total_minutes, seconds = divmod(s, 60)
    hours, minutes = divmod(int(total_minutes), 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:06.3f}"


def build_srt(segments) -> str:
    """Render transcription segments as an SRT subtitle document."""
    out = []
    for idx, seg in enumerate(segments, 1):
        out.append(str(idx))
        out.append(f"{fmt_ts(seg.start)} --> {fmt_ts(seg.end)}")
        out.append(seg.text.strip())
        out.append("")  # blank separator line required between cues
    return "\n".join(out)


def build_vtt(segments) -> str:
    """Render transcription segments as a WebVTT document."""
    out = ["WEBVTT", ""]  # mandatory WebVTT header + blank line
    for seg in segments:
        out.append(f"{fmt_ts(seg.start)} --> {fmt_ts(seg.end)}")
        out.append(seg.text.strip())
        out.append("")
    return "\n".join(out)


def segs_to_list(segments):
    """Drain the (possibly lazy) segment iterable into a concrete list."""
    return [*segments]


# ──────────────────────────────────────────────────────────────────────────────
#  Speech β†’ Text
# ──────────────────────────────────────────────────────────────────────────────

def _format_transcript(segments, full_text, detected, output_format, word_timestamps):
    """Render the transcription in the requested output format.

    Returns (body, extension); `extension` names the download-file suffix.
    """
    if output_format == "Plain Text":
        return full_text, "txt"
    if output_format == "SRT Subtitles":
        return build_srt(segments), "srt"
    if output_format == "VTT Subtitles":
        return build_vtt(segments), "vtt"
    # Any other value falls through to JSON (the UI's fourth radio option).
    body = json.dumps({
        "text": full_text,
        "language": detected,
        "segments": [
            {
                "id": i,
                "start": round(s.start, 3),
                "end": round(s.end, 3),
                "text": s.text.strip(),
                # word entries only exist when word timestamps were requested
                **({"words": [{"word": w.word, "start": round(w.start, 3), "end": round(w.end, 3)}
                              for w in s.words]} if word_timestamps and s.words else {}),
            }
            for i, s in enumerate(segments)
        ],
    }, indent=2, ensure_ascii=False)
    return body, "json"


def transcribe_audio(
    audio_input, model_name, task, language,
    output_format, beam_size, temperature, word_timestamps,
):
    """Transcribe (or translate) an audio file with faster-whisper.

    Parameters mirror the Gradio widgets on the STT tab: a file path from the
    Audio component, a WHISPER_MODELS name, "transcribe"/"translate", a
    LANGUAGES display label, an output-format label, and decoding knobs.

    Returns (body_text, download_file_path, status_line). When no audio is
    supplied, returns a user-facing prompt and two Nones.
    """
    if audio_input is None:
        return "Please upload or record audio first.", None, None

    t0    = time.time()
    model = load_model(model_name)
    lang  = LANGUAGES.get(language)          # None β†’ auto-detect

    # faster-whisper returns a lazy generator of segments plus detection info.
    segments_gen, info = model.transcribe(
        audio_input,
        task           = task,
        language       = lang,
        beam_size      = int(beam_size),
        temperature    = float(temperature),
        word_timestamps= bool(word_timestamps),
        vad_filter     = True,               # skip silence automatically
    )
    # Materialise once: the segment list is consumed several times below.
    segments = segs_to_list(segments_gen)
    elapsed  = time.time() - t0

    full_text = " ".join(s.text.strip() for s in segments)
    detected  = info.language if not lang else language

    body, ext = _format_transcript(segments, full_text, detected,
                                   output_format, word_timestamps)

    # Context manager guarantees the handle is closed even if write() raises
    # (previously the handle leaked on a write error). delete=False so Gradio
    # can serve the file after this function returns.
    with tempfile.NamedTemporaryFile(
        suffix=f".{ext}", delete=False, mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write(body)

    status = (f"Done in {elapsed:.1f}s | model={model_name} | task={task} "
              f"| detected={detected} | device={DEVICE.upper()}")
    return body, tmp.name, status


# ──────────────────────────────────────────────────────────────────────────────
#  Text β†’ Speech
# ──────────────────────────────────────────────────────────────────────────────

def tts_gtts(text, lang_name, slow):
    """Synthesize `text` to an MP3 with Google TTS.

    Returns (mp3_path, status) on success, (None, error_message) on failure.
    """
    if not GTTS_AVAILABLE:
        return None, "gTTS not available"
    # Unknown display names fall back to plain English.
    lang = GTTS_LANGS.get(lang_name, "en")
    try:
        speech = gTTS(text=text, lang=lang, slow=slow)
        out = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        speech.save(out.name)
    except Exception as e:
        # gTTS is an online service; network errors etc. surface here.
        return None, f"gTTS error: {e}"
    return out.name, f"gTTS OK  lang={lang}"


def tts_espeak(text, voice, speed, pitch):
    """Synthesize `text` to a WAV file with the offline eSpeak engine.

    Parameters
    ----------
    text  : str — text to speak, passed to eSpeak verbatim
    voice : str — eSpeak voice id (see ESPEAK_VOICES)
    speed : int — speaking rate in words per minute
    pitch : int — pitch setting, 0-99

    Returns (wav_path, status) on success, (None, error_message) on failure.
    """
    out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    out.close()  # eSpeak writes the file itself; we only need the path

    # Argument list + shell=False: the user's text is never interpreted by a
    # shell. The previous os.system() string only escaped double quotes, so
    # backticks / $() / ; in the input could execute arbitrary commands.
    cmd = ["espeak", "-v", str(voice), "-s", str(speed), "-p", str(pitch),
           "-w", out.name, text]
    try:
        result = subprocess.run(cmd, capture_output=True, check=False)
    except FileNotFoundError:
        # Binary not installed on the host at all.
        return None, "eSpeak failed β€” ensure packages.txt contains 'espeak'"
    if result.returncode != 0:
        return None, "eSpeak failed β€” ensure packages.txt contains 'espeak'"
    return out.name, f"eSpeak OK  voice={voice}"


def synthesize_speech(text, engine, gtts_lang, gtts_slow, ev, es, ep):
    """Route a TTS request to the engine picked in the UI.

    Dispatch is by substring: any engine label containing "gTTS" goes to
    Google TTS, everything else to eSpeak. Returns (audio_path, status).
    """
    # Guard: whitespace-only input produces no audio.
    if not text.strip():
        return None, "Please enter some text."
    use_google = "gTTS" in engine
    if use_google:
        return tts_gtts(text, gtts_lang, gtts_slow)
    return tts_espeak(text, ev, int(es), int(ep))


# ──────────────────────────────────────────────────────────────────────────────
#  Model info
# ──────────────────────────────────────────────────────────────────────────────

def show_model_info(name):
    """Generator: yield a loading notice, then a usage cheat-sheet for `name`.

    Yielding twice lets the Gradio Markdown output stream progress while the
    model downloads/loads.
    """
    yield f"Loading `{name}`..."
    try:
        load_model(name)
    except Exception as e:
        yield f"❌ Error: {e}"
        return
    cheat_sheet = (
        f"### βœ… Model `{name}` loaded\n\n"
        "**Local install:**\n"
        "```bash\npip install faster-whisper\n```\n\n"
        "**Python usage:**\n"
        "```python\nfrom faster_whisper import WhisperModel\n\n"
        f"model = WhisperModel('{name}', device='cpu', compute_type='int8')\n"
        "segments, info = model.transcribe('audio.mp3')\n"
        "for seg in segments:\n"
        "    print(seg.start, seg.end, seg.text)\n```"
    )
    yield cheat_sheet


# ──────────────────────────────────────────────────────────────────────────────
#  Gradio UI
# ──────────────────────────────────────────────────────────────────────────────

# Declarative UI: four tabs (STT, TTS, model table, guide) inside one Blocks
# app. `demo` is launched from the __main__ guard at the bottom of the file.
with gr.Blocks(title="Whisper STT + TTS Suite") as demo:

    # App header shown above the tab bar.
    gr.Markdown(
        "# πŸŽ™οΈ Whisper STT + TTS Suite\n"
        "**Speech β†’ Text** via [faster-whisper](https://github.com/SYSTRAN/faster-whisper)  Β·  "
        "**Text β†’ Speech** via gTTS & eSpeak  Β·  "
        "Runs on πŸ€— HF Spaces **free CPU tier** (Python 3.13 βœ…)"
    )

    # ── Tab 1: Speech β†’ Text ─────────────────────────────────────────────────
    with gr.Tab("🎀 Speech β†’ Text"):
        with gr.Row():
            with gr.Column(scale=3):
                # type="filepath" hands transcribe_audio a path on disk.
                audio_in = gr.Audio(
                    label="Audio Input",
                    sources=["upload", "microphone"],
                    type="filepath",
                )
            with gr.Column(scale=2):
                # Choices mirror the WHISPER_MODELS / LANGUAGES constants.
                model_sel = gr.Dropdown(WHISPER_MODELS, value="base",       label="Model")
                task_sel  = gr.Radio(["transcribe","translate"], value="transcribe", label="Task")
                lang_sel  = gr.Dropdown(list(LANGUAGES.keys()), value="Auto Detect", label="Language")
                fmt_sel   = gr.Radio(
                    ["Plain Text","SRT Subtitles","VTT Subtitles","JSON"],
                    value="Plain Text", label="Output Format",
                )

        # Decoding knobs forwarded to model.transcribe() by transcribe_audio.
        with gr.Accordion("Advanced Options", open=False):
            with gr.Row():
                beam_sl = gr.Slider(1, 10, value=5, step=1,    label="Beam Size")
                temp_sl = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Temperature")
                word_ts = gr.Checkbox(value=False,              label="Word-level Timestamps")

        stt_btn    = gr.Button("β–Ά  Transcribe", variant="primary", size="lg")
        stt_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        stt_out    = gr.Textbox(label="Result", lines=12)
        stt_dl     = gr.File(label="⬇ Download")

        # `inputs` order must match transcribe_audio's positional parameters.
        stt_btn.click(
            fn=transcribe_audio,
            inputs=[audio_in, model_sel, task_sel, lang_sel,
                    fmt_sel, beam_sl, temp_sl, word_ts],
            outputs=[stt_out, stt_dl, stt_status],
        )

    # ── Tab 2: Text β†’ Speech ─────────────────────────────────────────────────
    with gr.Tab("πŸ”Š Text β†’ Speech"):
        tts_text = gr.Textbox(label="Text", placeholder="Type text here…", lines=5)
        # synthesize_speech dispatches on the substring "gTTS" in this label.
        tts_eng  = gr.Radio(
            ["gTTS (Google) β€” online, natural", "eSpeak β€” offline, instant"],
            value="gTTS (Google) β€” online, natural", label="Engine",
        )
        # Both engines' settings stay visible; only the selected engine's
        # values are read at synthesis time.
        with gr.Row():
            with gr.Column():
                gr.Markdown("**gTTS settings**")
                gtts_lang = gr.Dropdown(list(GTTS_LANGS.keys()), value="English (US)", label="Language")
                gtts_slow = gr.Checkbox(value=False, label="Slow mode")
            with gr.Column():
                gr.Markdown("**eSpeak settings**")
                esp_voice = gr.Dropdown(ESPEAK_VOICES, value="en", label="Voice")
                esp_speed = gr.Slider(50, 400, value=150, step=10, label="Speed (wpm)")
                esp_pitch = gr.Slider(0, 99, value=50,  step=1,  label="Pitch")

        tts_btn    = gr.Button("πŸ”Š  Synthesize", variant="primary", size="lg")
        tts_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        tts_out    = gr.Audio(label="Output", type="filepath")

        tts_btn.click(
            fn=synthesize_speech,
            inputs=[tts_text, tts_eng, gtts_lang, gtts_slow, esp_voice, esp_speed, esp_pitch],
            outputs=[tts_out, tts_status],
        )

    # ── Tab 3: Models ─────────────────────────────────────────────────────────
    with gr.Tab("πŸ“¦ Models"):
        gr.Markdown("""
| Model  | Size   | RAM  | CPU Speed  | Best for        |
|--------|--------|------|------------|-----------------|
| tiny   | ~39 MB | ~1GB | ~32Γ— RT    | Quick tests     |
| base   | ~74 MB | ~1GB | ~16Γ— RT    | General use βœ…  |
| small  | ~244MB | ~2GB | ~6Γ— RT     | Better accuracy |
| medium | ~769MB | ~5GB | ~2Γ— RT     | High accuracy   |

All four fit on the free-tier 16 GB RAM. `int8` quantisation is used automatically on CPU.
        """)
        dl_sel = gr.Dropdown(WHISPER_MODELS, value="base", label="Model")
        dl_btn = gr.Button("Load & show info", variant="secondary")
        dl_out = gr.Markdown()
        # show_model_info is a generator, so the Markdown streams each yield.
        dl_btn.click(fn=show_model_info, inputs=[dl_sel], outputs=[dl_out])

    # ── Tab 4: Guide ──────────────────────────────────────────────────────────
    with gr.Tab("πŸ“– Guide"):
        gr.Markdown("""
## Local Install

```bash
pip install faster-whisper gTTS soundfile
# Linux: sudo apt install espeak ffmpeg
```

## Python Usage

```python
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")

# Transcribe
segments, info = model.transcribe("audio.mp3")
for seg in segments:
    print(f"[{seg.start:.1f}s β†’ {seg.end:.1f}s] {seg.text}")

# Translate to English
segments, info = model.transcribe("audio.mp3", task="translate")

# Force language
segments, info = model.transcribe("audio.mp3", language="fr")

# Word-level timestamps
segments, info = model.transcribe("audio.mp3", word_timestamps=True)
for seg in segments:
    for w in seg.words:
        print(w.word, w.start, w.end)
```

## Why faster-whisper?
- Pre-built wheel β†’ no `pkg_resources` / setuptools issues on Python 3.13
- Uses CTranslate2 for **4Γ— faster** CPU inference vs original Whisper
- Same accuracy (same OpenAI model weights)
- `int8` quantisation halves RAM on CPU with no accuracy loss

## Deploy to HF Spaces
Push `app.py`, `requirements.txt`, `packages.txt`, `README.md` to a **Gradio** Space.  
First cold start: ~2–3 min (pip install). Model downloads on first transcription request.
        """)

# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # BUG FIX: `theme=` and `css=` are gr.Blocks() constructor arguments, not
    # Blocks.launch() arguments — passing them to launch() raises TypeError on
    # current Gradio releases. Styling belongs where the Blocks is created.
    demo.launch(
        server_name="0.0.0.0",   # bind all interfaces (required on HF Spaces)
        server_port=7860,        # the port HF Spaces expects
        share=False,
    )