# Hugging Face Space: Whisper STT + TTS Suite (page-scrape header: "Spaces: Sleeping")
import json
import os
import subprocess
import tempfile
import time
from pathlib import Path

import gradio as gr

# ββ faster-whisper (Python 3.13 compatible, pre-built wheel, no build step) βββ
from faster_whisper import WhisperModel
# ββ TTS: gTTS βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Optional dependency: the app still runs without gTTS, the online TTS engine
# is simply reported as unavailable by tts_gtts().
try:
    from gtts import gTTS
    GTTS_AVAILABLE = True   # gTTS importable; "gTTS (Google)" engine usable
except ImportError:
    GTTS_AVAILABLE = False  # tts_gtts() will return "gTTS not available"
# ββ Device β HF free tier is CPU only βββββββββββββββββββββββββββββββββββββββββ
DEVICE = "cpu"
COMPUTE = "int8" # int8 quantisation: half the RAM, same accuracy, faster on CPU
print(f"[INFO] device={DEVICE} compute={COMPUTE}")
# ββ Model cache βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Maps model name -> loaded WhisperModel so repeat requests reuse the weights
# instead of re-downloading / re-initialising them.
_model_cache: dict = {}
def load_model(name: str) -> WhisperModel:
    """Return a cached WhisperModel for *name*, loading it on first use."""
    try:
        return _model_cache[name]
    except KeyError:
        print(f"[INFO] Loading faster-whisper '{name}'...")
        model = WhisperModel(name, device=DEVICE, compute_type=COMPUTE)
        _model_cache[name] = model
        return model
# ββ Constants βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Whisper checkpoints offered in the UI; all fit in free-tier CPU RAM.
WHISPER_MODELS = ["tiny", "base", "small", "medium"]
# Display name -> ISO 639-1 code passed to faster-whisper; None = auto-detect.
LANGUAGES = {
    "Auto Detect": None,
    "English": "en", "Spanish": "es", "French": "fr",
    "German": "de", "Italian": "it", "Portuguese": "pt",
    "Russian": "ru", "Japanese": "ja", "Chinese": "zh",
    "Arabic": "ar", "Hindi": "hi", "Korean": "ko",
    "Dutch": "nl", "Polish": "pl", "Turkish": "tr",
    "Swedish": "sv", "Danish": "da", "Finnish": "fi",
}
# Display name -> gTTS language/locale code.
GTTS_LANGS = {
    "English (US)": "en", "English (UK)": "en-gb",
    "Spanish": "es", "French": "fr",
    "German": "de", "Italian": "it",
    "Portuguese": "pt", "Russian": "ru",
    "Japanese": "ja", "Chinese": "zh-CN",
    "Arabic": "ar", "Hindi": "hi",
    "Korean": "ko",
}
# Voice identifiers handed to the espeak CLI via its -v flag.
ESPEAK_VOICES = ["en", "en-us", "en-gb", "es", "fr", "de", "it", "pt", "ru", "zh", "ar", "hi"]
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Helpers
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def fmt_ts(s: float) -> str:
    """Render *s* seconds as an HH:MM:SS.mmm timestamp (e.g. 01:02:03.450)."""
    mins, secs = divmod(s, 60)
    hrs, mins = divmod(int(mins), 60)
    return f"{hrs:02d}:{mins:02d}:{secs:06.3f}"
def build_srt(segments) -> str:
    """Serialise transcription segments into SubRip (.srt) subtitle text."""
    chunks = []
    for idx, seg in enumerate(segments, 1):
        chunks.append(str(idx))
        chunks.append(f"{fmt_ts(seg.start)} --> {fmt_ts(seg.end)}")
        chunks.append(seg.text.strip())
        chunks.append("")
    return "\n".join(chunks)
def build_vtt(segments) -> str:
    """Serialise transcription segments into WebVTT (.vtt) subtitle text."""
    parts = ["WEBVTT", ""]
    for seg in segments:
        parts.extend((f"{fmt_ts(seg.start)} --> {fmt_ts(seg.end)}",
                      seg.text.strip(),
                      ""))
    return "\n".join(parts)
def segs_to_list(segments):
    """Drain the segment generator into a list so it can be iterated twice."""
    return [*segments]
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Speech β Text
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def transcribe_audio(
    audio_input, model_name, task, language,
    output_format, beam_size, temperature, word_timestamps,
):
    """Transcribe/translate *audio_input* with faster-whisper.

    Parameters mirror the Gradio controls: audio file path, model name,
    task ("transcribe"/"translate"), language display name, output format
    label, beam size, temperature and the word-timestamps flag.

    Returns a 3-tuple (body_text, download_file_path_or_None, status_line)
    matching the [stt_out, stt_dl, stt_status] outputs.
    """
    if audio_input is None:
        # Fix: previously the status slot was None, leaving the Status
        # textbox blank on the early-return path.
        return "Please upload or record audio first.", None, "No audio provided."
    t0 = time.time()
    model = load_model(model_name)
    lang = LANGUAGES.get(language)  # None -> auto-detect
    # faster-whisper API: returns a lazy generator of segments plus info.
    segments_gen, info = model.transcribe(
        audio_input,
        task=task,
        language=lang,
        beam_size=int(beam_size),
        temperature=float(temperature),
        word_timestamps=bool(word_timestamps),
        vad_filter=True,  # skip silence automatically
    )
    segments = segs_to_list(segments_gen)
    elapsed = time.time() - t0
    full_text = " ".join(s.text.strip() for s in segments)
    # Report the auto-detected language only when none was forced.
    detected = info.language if not lang else language
    body, ext = _format_transcript(output_format, full_text, segments,
                                   detected, word_timestamps)
    # Context manager guarantees the handle is closed before the path is
    # handed to Gradio (delete=False: Gradio serves the file afterwards).
    with tempfile.NamedTemporaryFile(
        suffix=f".{ext}", delete=False, mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write(body)
        out_path = tmp.name
    status = (f"Done in {elapsed:.1f}s | model={model_name} | task={task} "
              f"| detected={detected} | device={DEVICE.upper()}")
    return body, out_path, status


def _format_transcript(output_format, full_text, segments, detected, word_timestamps):
    """Render the transcription in the requested format; returns (body, ext)."""
    if output_format == "Plain Text":
        return full_text, "txt"
    if output_format == "SRT Subtitles":
        return build_srt(segments), "srt"
    if output_format == "VTT Subtitles":
        return build_vtt(segments), "vtt"
    # JSON: full text + per-segment (and optionally per-word) timings.
    payload = {
        "text": full_text,
        "language": detected,
        "segments": [
            {
                "id": i,
                "start": round(s.start, 3),
                "end": round(s.end, 3),
                "text": s.text.strip(),
                **({"words": [{"word": w.word, "start": round(w.start, 3), "end": round(w.end, 3)}
                              for w in s.words]} if word_timestamps and s.words else {}),
            }
            for i, s in enumerate(segments)
        ],
    }
    return json.dumps(payload, indent=2, ensure_ascii=False), "json"
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Text β Speech
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def tts_gtts(text, lang_name, slow):
    """Synthesize *text* with Google TTS; returns (mp3_path_or_None, status)."""
    if not GTTS_AVAILABLE:
        return None, "gTTS not available"
    lang = GTTS_LANGS.get(lang_name, "en")
    try:
        speech = gTTS(text=text, lang=lang, slow=slow)
        out = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        speech.save(out.name)
    except Exception as exc:
        # gTTS is an online service: network / unsupported-language errors
        # are reported in the status string rather than raised.
        return None, f"gTTS error: {exc}"
    return out.name, f"gTTS OK lang={lang}"
def tts_espeak(text, voice, speed, pitch):
    """Synthesize *text* with the espeak CLI into a temp WAV file.

    Returns (wav_path_or_None, status_message).

    Security fix: the original built a shell command with os.system() and an
    f-string; escaping only double quotes still allowed injection via
    $(...), backticks, semicolons, etc. in user-supplied *text*. An argument
    list with shell=False passes the text verbatim to espeak.
    """
    out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    out.close()  # espeak writes the file itself; keep only the path
    cmd = ["espeak", "-v", str(voice), "-s", str(speed), "-p", str(pitch),
           "-w", out.name, text]
    try:
        ret = subprocess.run(cmd, shell=False, check=False).returncode
    except OSError:
        # espeak binary not installed / not on PATH
        ret = -1
    if ret != 0:
        return None, "eSpeak failed β ensure packages.txt contains 'espeak'"
    return out.name, f"eSpeak OK voice={voice}"
def synthesize_speech(text, engine, gtts_lang, gtts_slow, ev, es, ep):
    """Dispatch to the engine chosen in the UI; returns (audio_path, status)."""
    # Guard: nothing to synthesize.
    stripped = text.strip()
    if not stripped:
        return None, "Please enter some text."
    use_gtts = "gTTS" in engine
    if use_gtts:
        return tts_gtts(text, gtts_lang, gtts_slow)
    return tts_espeak(text, ev, int(es), int(ep))
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Model info
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def show_model_info(name):
    """Generator for the Models tab: yields a loading notice, then either
    install/usage markdown for *name* or an error message."""
    yield f"Loading `{name}`..."
    try:
        load_model(name)
    except Exception as exc:
        yield f"β Error: {exc}"
        return
    usage = (
        f"```python\nfrom faster_whisper import WhisperModel\n\n"
        f"model = WhisperModel('{name}', device='cpu', compute_type='int8')\n"
        f"segments, info = model.transcribe('audio.mp3')\n"
        f"for seg in segments:\n"
        f" print(seg.start, seg.end, seg.text)\n```"
    )
    yield (
        f"### β Model `{name}` loaded\n\n"
        f"**Local install:**\n"
        f"```bash\npip install faster-whisper\n```\n\n"
        f"**Python usage:**\n" + usage
    )
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Gradio UI
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# NOTE(review): the original file's indentation was destroyed by a table-
# formatted scrape; the widget nesting below is a reconstruction from the
# control flow and should be confirmed against the deployed Space.
with gr.Blocks(title="Whisper STT + TTS Suite") as demo:
    # App header shown above the tabs.
    gr.Markdown(
        "# ποΈ Whisper STT + TTS Suite\n"
        "**Speech β Text** via [faster-whisper](https://github.com/SYSTRAN/faster-whisper) Β· "
        "**Text β Speech** via gTTS & eSpeak Β· "
        "Runs on π€ HF Spaces **free CPU tier** (Python 3.13 β )"
    )
    # ββ Tab 1: Speech β Text βββββββββββββββββββββββββββββββββββββββββββββββββ
    with gr.Tab("π€ Speech β Text"):
        with gr.Row():
            with gr.Column(scale=3):
                # Accepts either an uploaded file or a microphone recording;
                # type="filepath" hands a path string to transcribe_audio.
                audio_in = gr.Audio(
                    label="Audio Input",
                    sources=["upload", "microphone"],
                    type="filepath",
                )
            with gr.Column(scale=2):
                model_sel = gr.Dropdown(WHISPER_MODELS, value="base", label="Model")
                task_sel = gr.Radio(["transcribe","translate"], value="transcribe", label="Task")
                lang_sel = gr.Dropdown(list(LANGUAGES.keys()), value="Auto Detect", label="Language")
                fmt_sel = gr.Radio(
                    ["Plain Text","SRT Subtitles","VTT Subtitles","JSON"],
                    value="Plain Text", label="Output Format",
                )
        # Decoding knobs, collapsed by default.
        with gr.Accordion("Advanced Options", open=False):
            with gr.Row():
                beam_sl = gr.Slider(1, 10, value=5, step=1, label="Beam Size")
                temp_sl = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Temperature")
            word_ts = gr.Checkbox(value=False, label="Word-level Timestamps")
        stt_btn = gr.Button("βΆ Transcribe", variant="primary", size="lg")
        stt_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        stt_out = gr.Textbox(label="Result", lines=12)
        stt_dl = gr.File(label="β¬ Download")
        # Input order must match transcribe_audio's signature; outputs map to
        # its (body, file_path, status) return tuple.
        stt_btn.click(
            fn=transcribe_audio,
            inputs=[audio_in, model_sel, task_sel, lang_sel,
                    fmt_sel, beam_sl, temp_sl, word_ts],
            outputs=[stt_out, stt_dl, stt_status],
        )
    # ββ Tab 2: Text β Speech βββββββββββββββββββββββββββββββββββββββββββββββββ
    with gr.Tab("π Text β Speech"):
        tts_text = gr.Textbox(label="Text", placeholder="Type text hereβ¦", lines=5)
        # synthesize_speech() picks the engine by substring-matching "gTTS"
        # in this radio's value.
        tts_eng = gr.Radio(
            ["gTTS (Google) β online, natural", "eSpeak β offline, instant"],
            value="gTTS (Google) β online, natural", label="Engine",
        )
        # Both engines' settings are always visible; only the selected
        # engine's values are used at synthesis time.
        with gr.Row():
            with gr.Column():
                gr.Markdown("**gTTS settings**")
                gtts_lang = gr.Dropdown(list(GTTS_LANGS.keys()), value="English (US)", label="Language")
                gtts_slow = gr.Checkbox(value=False, label="Slow mode")
            with gr.Column():
                gr.Markdown("**eSpeak settings**")
                esp_voice = gr.Dropdown(ESPEAK_VOICES, value="en", label="Voice")
                esp_speed = gr.Slider(50, 400, value=150, step=10, label="Speed (wpm)")
                esp_pitch = gr.Slider(0, 99, value=50, step=1, label="Pitch")
        tts_btn = gr.Button("π Synthesize", variant="primary", size="lg")
        tts_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        tts_out = gr.Audio(label="Output", type="filepath")
        tts_btn.click(
            fn=synthesize_speech,
            inputs=[tts_text, tts_eng, gtts_lang, gtts_slow, esp_voice, esp_speed, esp_pitch],
            outputs=[tts_out, tts_status],
        )
    # ββ Tab 3: Models βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
    with gr.Tab("π¦ Models"):
        gr.Markdown("""
| Model | Size | RAM | CPU Speed | Best for |
|--------|--------|------|------------|-----------------|
| tiny | ~39 MB | ~1GB | ~32Γ RT | Quick tests |
| base | ~74 MB | ~1GB | ~16Γ RT | General use β |
| small | ~244MB | ~2GB | ~6Γ RT | Better accuracy |
| medium | ~769MB | ~5GB | ~2Γ RT | High accuracy |
All four fit on the free-tier 16 GB RAM. `int8` quantisation is used automatically on CPU.
""")
        dl_sel = gr.Dropdown(WHISPER_MODELS, value="base", label="Model")
        dl_btn = gr.Button("Load & show info", variant="secondary")
        dl_out = gr.Markdown()
        # show_model_info is a generator, so the markdown streams updates.
        dl_btn.click(fn=show_model_info, inputs=[dl_sel], outputs=[dl_out])
    # ββ Tab 4: Guide ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
    with gr.Tab("π Guide"):
        gr.Markdown("""
## Local Install
```bash
pip install faster-whisper gTTS soundfile
# Linux: sudo apt install espeak ffmpeg
```
## Python Usage
```python
from faster_whisper import WhisperModel
model = WhisperModel("base", device="cpu", compute_type="int8")
# Transcribe
segments, info = model.transcribe("audio.mp3")
for seg in segments:
print(f"[{seg.start:.1f}s β {seg.end:.1f}s] {seg.text}")
# Translate to English
segments, info = model.transcribe("audio.mp3", task="translate")
# Force language
segments, info = model.transcribe("audio.mp3", language="fr")
# Word-level timestamps
segments, info = model.transcribe("audio.mp3", word_timestamps=True)
for seg in segments:
for w in seg.words:
print(w.word, w.start, w.end)
```
## Why faster-whisper?
- Pre-built wheel β no `pkg_resources` / setuptools issues on Python 3.13
- Uses CTranslate2 for **4Γ faster** CPU inference vs original Whisper
- Same accuracy (same OpenAI model weights)
- `int8` quantisation halves RAM on CPU with no accuracy loss
## Deploy to HF Spaces
Push `app.py`, `requirements.txt`, `packages.txt`, `README.md` to a **Gradio** Space.
First cold start: ~2β3 min (pip install). Model downloads on first transcription request.
""")
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
if __name__ == "__main__":
    # Fix: `theme=` and `css=` are gr.Blocks(...) constructor arguments, not
    # launch() parameters β passing them to launch() raises TypeError at
    # startup. They are dropped here; to restore the styling, pass
    # theme=gr.themes.Soft(primary_hue="violet", secondary_hue="indigo") and
    # css="footer{display:none!important}" to the gr.Blocks(...) call instead.
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces (required on HF Spaces)
        server_port=7860,       # port HF Spaces expects
        share=False,
    )