# whisperapi / app.py
# Source: Hugging Face Space "whisperapi" by sonuprasad23
# (commit 4a04d86, "Update app.py", verified)
import json
import os
import subprocess
import tempfile
import time
from pathlib import Path

import gradio as gr
# ── faster-whisper (Python 3.13 compatible, pre-built wheel, no build step) ───
from faster_whisper import WhisperModel
# ── TTS: gTTS ─────────────────────────────────────────────────────────────────
try:
from gtts import gTTS
GTTS_AVAILABLE = True
except ImportError:
GTTS_AVAILABLE = False
# ── Device β€” HF free tier is CPU only ─────────────────────────────────────────
DEVICE = "cpu"
COMPUTE = "int8" # int8 quantisation: half the RAM, same accuracy, faster on CPU
print(f"[INFO] device={DEVICE} compute={COMPUTE}")
# ── Model cache ───────────────────────────────────────────────────────────────
# Loaded WhisperModel instances keyed by checkpoint name, so each model is
# instantiated at most once per process (populated lazily by load_model).
_model_cache: dict = {}
def load_model(name: str) -> WhisperModel:
    """Return a cached WhisperModel for *name*, instantiating it on first use."""
    try:
        return _model_cache[name]
    except KeyError:
        print(f"[INFO] Loading faster-whisper '{name}'...")
        model = WhisperModel(name, device=DEVICE, compute_type=COMPUTE)
        _model_cache[name] = model
        return model
# ── Constants ─────────────────────────────────────────────────────────────────
# Whisper checkpoints offered in the UI, smallest to largest.
WHISPER_MODELS = ["tiny", "base", "small", "medium"]
# UI label -> Whisper ISO-639-1 language code; None means auto-detect.
LANGUAGES = {
    "Auto Detect": None,
    "English": "en", "Spanish": "es", "French": "fr",
    "German": "de", "Italian": "it", "Portuguese": "pt",
    "Russian": "ru", "Japanese": "ja", "Chinese": "zh",
    "Arabic": "ar", "Hindi": "hi", "Korean": "ko",
    "Dutch": "nl", "Polish": "pl", "Turkish": "tr",
    "Swedish": "sv", "Danish": "da", "Finnish": "fi",
}
# UI label -> gTTS language/locale code (gTTS uses its own code set).
GTTS_LANGS = {
    "English (US)": "en", "English (UK)": "en-gb",
    "Spanish": "es", "French": "fr",
    "German": "de", "Italian": "it",
    "Portuguese": "pt", "Russian": "ru",
    "Japanese": "ja", "Chinese": "zh-CN",
    "Arabic": "ar", "Hindi": "hi",
    "Korean": "ko",
}
# Voice identifiers passed verbatim to the eSpeak CLI's -v flag.
ESPEAK_VOICES = ["en", "en-us", "en-gb", "es", "fr", "de", "it", "pt", "ru", "zh", "ar", "hi"]
# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────
def fmt_ts(s: float) -> str:
    """Render a non-negative offset in seconds as zero-padded HH:MM:SS.mmm."""
    hours = int(s // 3600)
    minutes = int(s % 3600 // 60)
    secs = s % 60
    return ":".join([f"{hours:02d}", f"{minutes:02d}", f"{secs:06.3f}"])
def build_srt(segments) -> str:
    """Build a SubRip (SRT) document from whisper segments.

    Each cue is: index line, "start --> end" timing line, text, blank line.
    Fix: the SRT format requires a COMMA as the millisecond separator
    (HH:MM:SS,mmm); the period form emitted by fmt_ts is WebVTT syntax and
    is rejected by strict SRT parsers/players.
    """
    lines = []
    for idx, seg in enumerate(segments, 1):
        start = fmt_ts(seg.start).replace(".", ",")
        end = fmt_ts(seg.end).replace(".", ",")
        lines += [str(idx), f"{start} --> {end}", seg.text.strip(), ""]
    return "\n".join(lines)
def build_vtt(segments) -> str:
    """Build a WebVTT document (header plus one cue per segment)."""
    cues = ["WEBVTT", ""]
    for seg in segments:
        cues.append(f"{fmt_ts(seg.start)} --> {fmt_ts(seg.end)}")
        cues.append(seg.text.strip())
        cues.append("")
    return "\n".join(cues)
def segs_to_list(segments):
    """Materialise the generator so we can iterate multiple times."""
    return [*segments]
# ──────────────────────────────────────────────────────────────────────────────
# Speech β†’ Text
# ──────────────────────────────────────────────────────────────────────────────
def transcribe_audio(
    audio_input, model_name, task, language,
    output_format, beam_size, temperature, word_timestamps,
):
    """Transcribe (or translate) an audio file with faster-whisper.

    Returns a 3-tuple of (result text, download file path, status line).
    When no audio is supplied, the first slot carries an error message and
    the other two are None.
    """
    if audio_input is None:
        return "Please upload or record audio first.", None, None

    started = time.time()
    whisper = load_model(model_name)
    lang_code = LANGUAGES.get(language)  # None -> auto-detect

    # faster-whisper API: returns a lazy segment generator plus metadata.
    seg_iter, info = whisper.transcribe(
        audio_input,
        task=task,
        language=lang_code,
        beam_size=int(beam_size),
        temperature=float(temperature),
        word_timestamps=bool(word_timestamps),
        vad_filter=True,  # skip silence automatically
    )
    segments = segs_to_list(seg_iter)
    elapsed = time.time() - started

    full_text = " ".join(s.text.strip() for s in segments)
    detected = language if lang_code else info.language

    # Render the requested output format.
    if output_format == "Plain Text":
        ext, body = "txt", full_text
    elif output_format == "SRT Subtitles":
        ext, body = "srt", build_srt(segments)
    elif output_format == "VTT Subtitles":
        ext, body = "vtt", build_vtt(segments)
    else:  # JSON
        ext = "json"
        seg_dicts = []
        for i, s in enumerate(segments):
            entry = {
                "id": i,
                "start": round(s.start, 3),
                "end": round(s.end, 3),
                "text": s.text.strip(),
            }
            # Word-level detail only when requested and actually present.
            if word_timestamps and s.words:
                entry["words"] = [
                    {"word": w.word, "start": round(w.start, 3), "end": round(w.end, 3)}
                    for w in s.words
                ]
            seg_dicts.append(entry)
        body = json.dumps(
            {"text": full_text, "language": detected, "segments": seg_dicts},
            indent=2, ensure_ascii=False,
        )

    # delete=False: Gradio serves the file after this function returns.
    with tempfile.NamedTemporaryFile(
        suffix=f".{ext}", delete=False, mode="w", encoding="utf-8"
    ) as out:
        out.write(body)

    status = (f"Done in {elapsed:.1f}s | model={model_name} | task={task} "
              f"| detected={detected} | device={DEVICE.upper()}")
    return body, out.name, status
# ──────────────────────────────────────────────────────────────────────────────
# Text β†’ Speech
# ──────────────────────────────────────────────────────────────────────────────
def tts_gtts(text, lang_name, slow):
    """Synthesize *text* via Google TTS; returns (mp3 path | None, status)."""
    if not GTTS_AVAILABLE:
        return None, "gTTS not available"
    code = GTTS_LANGS.get(lang_name, "en")
    try:
        speech = gTTS(text=text, lang=code, slow=slow)
        out = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        speech.save(out.name)
    except Exception as e:
        return None, f"gTTS error: {e}"
    return out.name, f"gTTS OK lang={code}"
def tts_espeak(text, voice, speed, pitch):
    """Synthesize *text* to a WAV file with the eSpeak CLI.

    Returns (wav path | None, status message).

    Fix: the original built a shell string via os.system and only escaped
    double quotes, leaving user text open to shell injection (backticks,
    $(), etc.). An argument list with shell=False passes the text verbatim
    and cannot be interpreted by a shell.
    """
    out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    out.close()  # espeak writes the file; we only need its path
    cmd = [
        "espeak", "-v", str(voice), "-s", str(speed), "-p", str(pitch),
        "-w", out.name, text,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a missing espeak binary; CalledProcessError a
        # non-zero exit — both map to the original's failure message.
        return None, "eSpeak failed β€” ensure packages.txt contains 'espeak'"
    return out.name, f"eSpeak OK voice={voice}"
def synthesize_speech(text, engine, gtts_lang, gtts_slow, ev, es, ep):
    """Route a TTS request to the engine selected in the UI."""
    if not text.strip():
        return None, "Please enter some text."
    wants_gtts = "gTTS" in engine
    if wants_gtts:
        return tts_gtts(text, gtts_lang, gtts_slow)
    return tts_espeak(text, ev, int(es), int(ep))
# ──────────────────────────────────────────────────────────────────────────────
# Model info
# ──────────────────────────────────────────────────────────────────────────────
def show_model_info(name):
    """Generator: stream a loading notice, then a usage snippet or an error."""
    yield f"Loading `{name}`..."
    try:
        load_model(name)
    except Exception as e:
        yield f"❌ Error: {e}"
        return
    snippet = (
        f"### βœ… Model `{name}` loaded\n\n"
        f"**Local install:**\n"
        f"```bash\npip install faster-whisper\n```\n\n"
        f"**Python usage:**\n"
        f"```python\nfrom faster_whisper import WhisperModel\n\n"
        f"model = WhisperModel('{name}', device='cpu', compute_type='int8')\n"
        f"segments, info = model.transcribe('audio.mp3')\n"
        f"for seg in segments:\n"
        f" print(seg.start, seg.end, seg.text)\n```"
    )
    yield snippet
# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────
# Top-level UI declaration; `demo` is launched from the __main__ guard below.
# NOTE(review): original indentation was lost in transit — the nesting below
# is reconstructed to the most natural Gradio layout; confirm against the
# deployed Space.
with gr.Blocks(title="Whisper STT + TTS Suite") as demo:
    gr.Markdown(
        "# πŸŽ™οΈ Whisper STT + TTS Suite\n"
        "**Speech β†’ Text** via [faster-whisper](https://github.com/SYSTRAN/faster-whisper) Β· "
        "**Text β†’ Speech** via gTTS & eSpeak Β· "
        "Runs on πŸ€— HF Spaces **free CPU tier** (Python 3.13 βœ…)"
    )
    # ── Tab 1: Speech β†’ Text ─────────────────────────────────────────────────
    with gr.Tab("🎀 Speech β†’ Text"):
        with gr.Row():
            with gr.Column(scale=3):
                # type="filepath" hands transcribe_audio a path string.
                audio_in = gr.Audio(
                    label="Audio Input",
                    sources=["upload", "microphone"],
                    type="filepath",
                )
            with gr.Column(scale=2):
                model_sel = gr.Dropdown(WHISPER_MODELS, value="base", label="Model")
                task_sel = gr.Radio(["transcribe","translate"], value="transcribe", label="Task")
                lang_sel = gr.Dropdown(list(LANGUAGES.keys()), value="Auto Detect", label="Language")
                fmt_sel = gr.Radio(
                    ["Plain Text","SRT Subtitles","VTT Subtitles","JSON"],
                    value="Plain Text", label="Output Format",
                )
        with gr.Accordion("Advanced Options", open=False):
            with gr.Row():
                beam_sl = gr.Slider(1, 10, value=5, step=1, label="Beam Size")
                temp_sl = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Temperature")
                word_ts = gr.Checkbox(value=False, label="Word-level Timestamps")
        stt_btn = gr.Button("β–Ά Transcribe", variant="primary", size="lg")
        stt_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        stt_out = gr.Textbox(label="Result", lines=12)
        stt_dl = gr.File(label="⬇ Download")
        # Button outputs map to (result text, download file, status line).
        stt_btn.click(
            fn=transcribe_audio,
            inputs=[audio_in, model_sel, task_sel, lang_sel,
                    fmt_sel, beam_sl, temp_sl, word_ts],
            outputs=[stt_out, stt_dl, stt_status],
        )
    # ── Tab 2: Text β†’ Speech ─────────────────────────────────────────────────
    with gr.Tab("πŸ”Š Text β†’ Speech"):
        tts_text = gr.Textbox(label="Text", placeholder="Type text here…", lines=5)
        tts_eng = gr.Radio(
            ["gTTS (Google) β€” online, natural", "eSpeak β€” offline, instant"],
            value="gTTS (Google) β€” online, natural", label="Engine",
        )
        # Both engines' settings are always visible; synthesize_speech
        # picks which set to use from the Engine radio value.
        with gr.Row():
            with gr.Column():
                gr.Markdown("**gTTS settings**")
                gtts_lang = gr.Dropdown(list(GTTS_LANGS.keys()), value="English (US)", label="Language")
                gtts_slow = gr.Checkbox(value=False, label="Slow mode")
            with gr.Column():
                gr.Markdown("**eSpeak settings**")
                esp_voice = gr.Dropdown(ESPEAK_VOICES, value="en", label="Voice")
                esp_speed = gr.Slider(50, 400, value=150, step=10, label="Speed (wpm)")
                esp_pitch = gr.Slider(0, 99, value=50, step=1, label="Pitch")
        tts_btn = gr.Button("πŸ”Š Synthesize", variant="primary", size="lg")
        tts_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        tts_out = gr.Audio(label="Output", type="filepath")
        tts_btn.click(
            fn=synthesize_speech,
            inputs=[tts_text, tts_eng, gtts_lang, gtts_slow, esp_voice, esp_speed, esp_pitch],
            outputs=[tts_out, tts_status],
        )
    # ── Tab 3: Models ─────────────────────────────────────────────────────────
    with gr.Tab("πŸ“¦ Models"):
        gr.Markdown("""
| Model | Size | RAM | CPU Speed | Best for |
|--------|--------|------|------------|-----------------|
| tiny | ~39 MB | ~1GB | ~32Γ— RT | Quick tests |
| base | ~74 MB | ~1GB | ~16Γ— RT | General use βœ… |
| small | ~244MB | ~2GB | ~6Γ— RT | Better accuracy |
| medium | ~769MB | ~5GB | ~2Γ— RT | High accuracy |
All four fit on the free-tier 16 GB RAM. `int8` quantisation is used automatically on CPU.
""")
        dl_sel = gr.Dropdown(WHISPER_MODELS, value="base", label="Model")
        dl_btn = gr.Button("Load & show info", variant="secondary")
        dl_out = gr.Markdown()
        # show_model_info is a generator, so Gradio streams its yields.
        dl_btn.click(fn=show_model_info, inputs=[dl_sel], outputs=[dl_out])
    # ── Tab 4: Guide ──────────────────────────────────────────────────────────
    with gr.Tab("πŸ“– Guide"):
        gr.Markdown("""
## Local Install
```bash
pip install faster-whisper gTTS soundfile
# Linux: sudo apt install espeak ffmpeg
```
## Python Usage
```python
from faster_whisper import WhisperModel
model = WhisperModel("base", device="cpu", compute_type="int8")
# Transcribe
segments, info = model.transcribe("audio.mp3")
for seg in segments:
    print(f"[{seg.start:.1f}s β†’ {seg.end:.1f}s] {seg.text}")
# Translate to English
segments, info = model.transcribe("audio.mp3", task="translate")
# Force language
segments, info = model.transcribe("audio.mp3", language="fr")
# Word-level timestamps
segments, info = model.transcribe("audio.mp3", word_timestamps=True)
for seg in segments:
    for w in seg.words:
        print(w.word, w.start, w.end)
```
## Why faster-whisper?
- Pre-built wheel β†’ no `pkg_resources` / setuptools issues on Python 3.13
- Uses CTranslate2 for **4Γ— faster** CPU inference vs original Whisper
- Same accuracy (same OpenAI model weights)
- `int8` quantisation halves RAM on CPU with no accuracy loss
## Deploy to HF Spaces
Push `app.py`, `requirements.txt`, `packages.txt`, `README.md` to a **Gradio** Space.
First cold start: ~2–3 min (pip install). Model downloads on first transcription request.
""")
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Fix: `theme` and `css` are gr.Blocks() constructor arguments, not
    # Blocks.launch() arguments — passing them to launch() raises
    # TypeError (unexpected keyword argument) and the app never starts.
    # To style the app, move them to the gr.Blocks(...) call above.
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces (required on HF Spaces)
        server_port=7860,       # HF Spaces' expected port
        share=False,
    )