"""Whisper STT + TTS Suite.

Speech-to-text via faster-whisper, text-to-speech via gTTS (online) and
eSpeak (offline), wrapped in a Gradio Blocks UI.  Designed for the
Hugging Face Spaces free CPU tier (Python 3.13 compatible).
"""

import gradio as gr
import json
import os
import subprocess
import tempfile
import time
from pathlib import Path

# ── faster-whisper (Python 3.13 compatible, pre-built wheel, no build step) ───
from faster_whisper import WhisperModel

# ── TTS: gTTS ─────────────────────────────────────────────────────────────────
try:
    from gtts import gTTS
    GTTS_AVAILABLE = True
except ImportError:
    GTTS_AVAILABLE = False

# ── Device — HF free tier is CPU only ─────────────────────────────────────────
DEVICE = "cpu"
COMPUTE = "int8"  # int8 quantisation: half the RAM, same accuracy, faster on CPU
print(f"[INFO] device={DEVICE} compute={COMPUTE}")

# ── Model cache ───────────────────────────────────────────────────────────────
_model_cache: dict = {}


def load_model(name: str) -> WhisperModel:
    """Return a cached WhisperModel for *name*, loading it on first request."""
    if name not in _model_cache:
        print(f"[INFO] Loading faster-whisper '{name}'...")
        _model_cache[name] = WhisperModel(name, device=DEVICE, compute_type=COMPUTE)
    return _model_cache[name]


# ── Constants ─────────────────────────────────────────────────────────────────
WHISPER_MODELS = ["tiny", "base", "small", "medium"]

LANGUAGES = {
    "Auto Detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Japanese": "ja",
    "Chinese": "zh",
    "Arabic": "ar",
    "Hindi": "hi",
    "Korean": "ko",
    "Dutch": "nl",
    "Polish": "pl",
    "Turkish": "tr",
    "Swedish": "sv",
    "Danish": "da",
    "Finnish": "fi",
}

GTTS_LANGS = {
    "English (US)": "en",
    "English (UK)": "en-gb",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Japanese": "ja",
    "Chinese": "zh-CN",
    "Arabic": "ar",
    "Hindi": "hi",
    "Korean": "ko",
}

ESPEAK_VOICES = ["en", "en-us", "en-gb", "es", "fr", "de", "it", "pt", "ru", "zh", "ar", "hi"]


# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────
def fmt_ts(s: float, sep: str = ".") -> str:
    """Format *s* seconds as ``HH:MM:SS<sep>mmm``.

    ``sep`` defaults to ``"."`` (WebVTT style); pass ``","`` for SRT, whose
    spec mandates a comma before the milliseconds.
    """
    h = int(s // 3600)
    m = int((s % 3600) // 60)
    sc = s % 60
    return f"{h:02d}:{m:02d}:{sc:06.3f}".replace(".", sep)


def build_srt(segments) -> str:
    """Render transcription segments as an SRT subtitle document."""
    lines = []
    for i, seg in enumerate(segments, 1):
        # BUGFIX: SRT requires comma-separated milliseconds (HH:MM:SS,mmm).
        lines += [
            str(i),
            f"{fmt_ts(seg.start, sep=',')} --> {fmt_ts(seg.end, sep=',')}",
            seg.text.strip(),
            "",
        ]
    return "\n".join(lines)


def build_vtt(segments) -> str:
    """Render transcription segments as a WebVTT subtitle document."""
    lines = ["WEBVTT", ""]
    for seg in segments:
        lines += [f"{fmt_ts(seg.start)} --> {fmt_ts(seg.end)}", seg.text.strip(), ""]
    return "\n".join(lines)


def segs_to_list(segments):
    """Materialise the generator so we can iterate multiple times."""
    return list(segments)


# ──────────────────────────────────────────────────────────────────────────────
# Speech → Text
# ──────────────────────────────────────────────────────────────────────────────
def transcribe_audio(
    audio_input,
    model_name,
    task,
    language,
    output_format,
    beam_size,
    temperature,
    word_timestamps,
):
    """Transcribe (or translate) an audio file with faster-whisper.

    Returns ``(body, file_path, status)`` where *body* is the formatted
    result text, *file_path* a downloadable temp file, and *status* a
    one-line summary.  On missing input returns a prompt message and Nones.
    """
    if audio_input is None:
        return "Please upload or record audio first.", None, None

    t0 = time.time()
    model = load_model(model_name)
    lang = LANGUAGES.get(language)  # None → auto-detect

    # faster-whisper API
    segments_gen, info = model.transcribe(
        audio_input,
        task=task,
        language=lang,
        beam_size=int(beam_size),
        temperature=float(temperature),
        word_timestamps=bool(word_timestamps),
        vad_filter=True,  # skip silence automatically
    )
    segments = segs_to_list(segments_gen)
    elapsed = time.time() - t0

    full_text = " ".join(s.text.strip() for s in segments)
    detected = info.language if not lang else language

    # Format output
    if output_format == "Plain Text":
        body, ext = full_text, "txt"
    elif output_format == "SRT Subtitles":
        body, ext = build_srt(segments), "srt"
    elif output_format == "VTT Subtitles":
        body, ext = build_vtt(segments), "vtt"
    else:  # JSON
        body = json.dumps(
            {
                "text": full_text,
                "language": detected,
                "segments": [
                    {
                        "id": i,
                        "start": round(s.start, 3),
                        "end": round(s.end, 3),
                        "text": s.text.strip(),
                        **(
                            {
                                "words": [
                                    {
                                        "word": w.word,
                                        "start": round(w.start, 3),
                                        "end": round(w.end, 3),
                                    }
                                    for w in s.words
                                ]
                            }
                            if word_timestamps and s.words
                            else {}
                        ),
                    }
                    for i, s in enumerate(segments)
                ],
            },
            indent=2,
            ensure_ascii=False,
        )
        ext = "json"

    tmp = tempfile.NamedTemporaryFile(
        suffix=f".{ext}", delete=False, mode="w", encoding="utf-8"
    )
    tmp.write(body)
    tmp.close()

    status = (
        f"Done in {elapsed:.1f}s | model={model_name} | task={task} "
        f"| detected={detected} | device={DEVICE.upper()}"
    )
    return body, tmp.name, status


# ──────────────────────────────────────────────────────────────────────────────
# Text → Speech
# ──────────────────────────────────────────────────────────────────────────────
def tts_gtts(text, lang_name, slow):
    """Synthesize *text* with Google TTS; returns (mp3_path, status)."""
    if not GTTS_AVAILABLE:
        return None, "gTTS not available"
    lang = GTTS_LANGS.get(lang_name, "en")
    try:
        tts = gTTS(text=text, lang=lang, slow=slow)
        f = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        f.close()  # close our handle so gTTS can (re)open the path on any OS
        tts.save(f.name)
        return f.name, f"gTTS OK lang={lang}"
    except Exception as e:
        return None, f"gTTS error: {e}"


def tts_espeak(text, voice, speed, pitch):
    """Synthesize *text* with the local eSpeak binary; returns (wav_path, status)."""
    f = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    f.close()  # espeak writes to the path; keep no competing open handle
    # SECURITY FIX: argv list with shell=False — user text can no longer
    # inject shell metacharacters (backticks, $(), quotes) into a command line.
    cmd = [
        "espeak",
        "-v", str(voice),
        "-s", str(speed),
        "-p", str(pitch),
        "-w", f.name,
        text,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
    except FileNotFoundError:
        return None, "eSpeak failed — ensure packages.txt contains 'espeak'"
    if result.returncode != 0:
        return None, "eSpeak failed — ensure packages.txt contains 'espeak'"
    return f.name, f"eSpeak OK voice={voice}"


def synthesize_speech(text, engine, gtts_lang, gtts_slow, ev, es, ep):
    """Dispatch to the selected TTS engine; returns (audio_path, status)."""
    if not text.strip():
        return None, "Please enter some text."
    if "gTTS" in engine:
        return tts_gtts(text, gtts_lang, gtts_slow)
    return tts_espeak(text, ev, int(es), int(ep))


# ──────────────────────────────────────────────────────────────────────────────
# Model info
# ──────────────────────────────────────────────────────────────────────────────
def show_model_info(name):
    """Generator: stream a loading notice, then usage info for model *name*."""
    yield f"Loading `{name}`..."
    try:
        load_model(name)
        yield (
            f"### ✅ Model `{name}` loaded\n\n"
            f"**Local install:**\n"
            f"```bash\npip install faster-whisper\n```\n\n"
            f"**Python usage:**\n"
            f"```python\nfrom faster_whisper import WhisperModel\n\n"
            f"model = WhisperModel('{name}', device='cpu', compute_type='int8')\n"
            f"segments, info = model.transcribe('audio.mp3')\n"
            f"for seg in segments:\n"
            f"    print(seg.start, seg.end, seg.text)\n```"
        )
    except Exception as e:
        yield f"❌ Error: {e}"


# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────
# BUGFIX: `theme` and `css` are gr.Blocks() constructor arguments, not
# demo.launch() arguments — passing them to launch() raises TypeError.
with gr.Blocks(
    title="Whisper STT + TTS Suite",
    theme=gr.themes.Soft(primary_hue="violet", secondary_hue="indigo"),
    css="footer{display:none!important}",
) as demo:
    gr.Markdown(
        "# 🎙️ Whisper STT + TTS Suite\n"
        "**Speech → Text** via [faster-whisper](https://github.com/SYSTRAN/faster-whisper) · "
        "**Text → Speech** via gTTS & eSpeak · "
        "Runs on 🤗 HF Spaces **free CPU tier** (Python 3.13 ✅)"
    )

    # ── Tab 1: Speech → Text ─────────────────────────────────────────────────
    with gr.Tab("🎤 Speech → Text"):
        with gr.Row():
            with gr.Column(scale=3):
                audio_in = gr.Audio(
                    label="Audio Input",
                    sources=["upload", "microphone"],
                    type="filepath",
                )
            with gr.Column(scale=2):
                model_sel = gr.Dropdown(WHISPER_MODELS, value="base", label="Model")
                task_sel = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
                lang_sel = gr.Dropdown(list(LANGUAGES.keys()), value="Auto Detect", label="Language")
                fmt_sel = gr.Radio(
                    ["Plain Text", "SRT Subtitles", "VTT Subtitles", "JSON"],
                    value="Plain Text",
                    label="Output Format",
                )
        with gr.Accordion("Advanced Options", open=False):
            with gr.Row():
                beam_sl = gr.Slider(1, 10, value=5, step=1, label="Beam Size")
                temp_sl = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Temperature")
            word_ts = gr.Checkbox(value=False, label="Word-level Timestamps")
        stt_btn = gr.Button("▶ Transcribe", variant="primary", size="lg")
        stt_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        stt_out = gr.Textbox(label="Result", lines=12)
        stt_dl = gr.File(label="⬇ Download")

        stt_btn.click(
            fn=transcribe_audio,
            inputs=[audio_in, model_sel, task_sel, lang_sel, fmt_sel, beam_sl, temp_sl, word_ts],
            outputs=[stt_out, stt_dl, stt_status],
        )

    # ── Tab 2: Text → Speech ─────────────────────────────────────────────────
    with gr.Tab("🔊 Text → Speech"):
        tts_text = gr.Textbox(label="Text", placeholder="Type text here…", lines=5)
        tts_eng = gr.Radio(
            ["gTTS (Google) — online, natural", "eSpeak — offline, instant"],
            value="gTTS (Google) — online, natural",
            label="Engine",
        )
        with gr.Row():
            with gr.Column():
                gr.Markdown("**gTTS settings**")
                gtts_lang = gr.Dropdown(list(GTTS_LANGS.keys()), value="English (US)", label="Language")
                gtts_slow = gr.Checkbox(value=False, label="Slow mode")
            with gr.Column():
                gr.Markdown("**eSpeak settings**")
                esp_voice = gr.Dropdown(ESPEAK_VOICES, value="en", label="Voice")
                esp_speed = gr.Slider(50, 400, value=150, step=10, label="Speed (wpm)")
                esp_pitch = gr.Slider(0, 99, value=50, step=1, label="Pitch")
        tts_btn = gr.Button("🔊 Synthesize", variant="primary", size="lg")
        tts_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        tts_out = gr.Audio(label="Output", type="filepath")

        tts_btn.click(
            fn=synthesize_speech,
            inputs=[tts_text, tts_eng, gtts_lang, gtts_slow, esp_voice, esp_speed, esp_pitch],
            outputs=[tts_out, tts_status],
        )

    # ── Tab 3: Models ─────────────────────────────────────────────────────────
    with gr.Tab("📦 Models"):
        gr.Markdown("""
| Model  | Size   | RAM  | CPU Speed  | Best for        |
|--------|--------|------|------------|-----------------|
| tiny   | ~39 MB | ~1GB | ~32× RT    | Quick tests     |
| base   | ~74 MB | ~1GB | ~16× RT    | General use ✅  |
| small  | ~244MB | ~2GB | ~6× RT     | Better accuracy |
| medium | ~769MB | ~5GB | ~2× RT     | High accuracy   |

All four fit on the free-tier 16 GB RAM. `int8` quantisation is used automatically on CPU.
""")
        dl_sel = gr.Dropdown(WHISPER_MODELS, value="base", label="Model")
        dl_btn = gr.Button("Load & show info", variant="secondary")
        dl_out = gr.Markdown()
        dl_btn.click(fn=show_model_info, inputs=[dl_sel], outputs=[dl_out])

    # ── Tab 4: Guide ──────────────────────────────────────────────────────────
    with gr.Tab("📖 Guide"):
        gr.Markdown("""
## Local Install
```bash
pip install faster-whisper gTTS soundfile
# Linux: sudo apt install espeak ffmpeg
```

## Python Usage
```python
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")

# Transcribe
segments, info = model.transcribe("audio.mp3")
for seg in segments:
    print(f"[{seg.start:.1f}s → {seg.end:.1f}s] {seg.text}")

# Translate to English
segments, info = model.transcribe("audio.mp3", task="translate")

# Force language
segments, info = model.transcribe("audio.mp3", language="fr")

# Word-level timestamps
segments, info = model.transcribe("audio.mp3", word_timestamps=True)
for seg in segments:
    for w in seg.words:
        print(w.word, w.start, w.end)
```

## Why faster-whisper?
- Pre-built wheel → no `pkg_resources` / setuptools issues on Python 3.13
- Uses CTranslate2 for **4× faster** CPU inference vs original Whisper
- Same accuracy (same OpenAI model weights)
- `int8` quantisation halves RAM on CPU with no accuracy loss

## Deploy to HF Spaces
Push `app.py`, `requirements.txt`, `packages.txt`, `README.md` to a **Gradio** Space.
First cold start: ~2–3 min (pip install). Model downloads on first transcription request.
""")

# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )