Spaces:

sonuprasad23
/

whisperapi

Sleeping

File size: 16,249 Bytes

773be53
 
 
36c1271
773be53
a7fcab4
773be53
a7fcab4
 
 
 
773be53
 
 
 
 
 
bc14a9e
 
 
a7fcab4
 
 
 
773be53
a7fcab4
 
 
 
 
773be53
a7fcab4
36c1271
a7fcab4
773be53
 
36c1271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773be53
 
36c1271
 
 
a7fcab4
 
 
36c1271
a7fcab4
 
 
 
 
36c1271
 
a7fcab4
 
 
 
 
 
 
36c1271
 
a7fcab4
 
 
 
 
 
36c1271
 
a7fcab4
 
 
36c1271
 
a7fcab4
 
 
36c1271
a7fcab4
 
 
 
773be53
36c1271
773be53
a7fcab4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773be53
a7fcab4
773be53
a7fcab4
773be53
a7fcab4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36c1271
773be53
a7fcab4
 
 
36c1271
 
773be53
a7fcab4
 
36c1271
773be53
 
a7fcab4
 
 
773be53
36c1271
773be53
a7fcab4
36c1271
 
 
a7fcab4
36c1271
a7fcab4
36c1271
 
773be53
 
36c1271
a7fcab4
 
 
 
773be53
a7fcab4
 
773be53
36c1271
a7fcab4
36c1271
 
a7fcab4
36c1271
a7fcab4
773be53
 
a7fcab4
 
 
773be53
a7fcab4
 
773be53
a7fcab4
773be53
a7fcab4
 
 
 
 
 
 
 
 
773be53
 
a7fcab4
773be53
 
a7fcab4
 
 
773be53
4a04d86
773be53
a7fcab4
 
 
 
 
 
773be53
a7fcab4
773be53
 
 
a7fcab4
 
 
 
 
773be53
a7fcab4
 
36c1271
a7fcab4
 
 
 
773be53
36c1271
 
a7fcab4
36c1271
a7fcab4
773be53
a7fcab4
36c1271
4a04d86
a7fcab4
773be53
a7fcab4
 
 
 
 
 
773be53
a7fcab4
773be53
a7fcab4
 
 
 
773be53
36c1271
 
a7fcab4
36c1271
a7fcab4
36c1271
a7fcab4
 
 
 
773be53
a7fcab4
773be53
36c1271
 
a7fcab4
 
 
 
 
36c1271
a7fcab4
36c1271
 
a7fcab4
 
 
 
 
 
 
 
36c1271
 
a7fcab4
36c1271
 
 
a7fcab4
36c1271
 
a7fcab4
36c1271
 
a7fcab4
 
36c1271
 
a7fcab4
 
36c1271
a7fcab4
 
 
773be53
36c1271
a7fcab4
 
 
36c1271
 
a7fcab4
36c1271
 
a7fcab4
36c1271
 
a7fcab4
 
 
 
36c1271
 
a7fcab4
 
 
 
 
36c1271
a7fcab4
 
 
36c1271
773be53
a7fcab4
773be53
4a04d86

import gradio as gr
import tempfile
import os
import json
import time
from pathlib import Path

# ── faster-whisper (Python 3.13 compatible, pre-built wheel, no build step) ───
from faster_whisper import WhisperModel

# ── TTS: gTTS ─────────────────────────────────────────────────────────────────
try:
    from gtts import gTTS
    GTTS_AVAILABLE = True
except ImportError:
    GTTS_AVAILABLE = False

# ── Device — HF free tier is CPU only ─────────────────────────────────────────
DEVICE  = "cpu"
COMPUTE = "int8"   # int8 quantisation: half the RAM, same accuracy, faster on CPU
print(f"[INFO] device={DEVICE}  compute={COMPUTE}")

# ── Model cache ───────────────────────────────────────────────────────────────
_model_cache: dict = {}

def load_model(name: str) -> WhisperModel:
    if name not in _model_cache:
        print(f"[INFO] Loading faster-whisper '{name}'...")
        _model_cache[name] = WhisperModel(name, device=DEVICE, compute_type=COMPUTE)
    return _model_cache[name]

# ── Constants ─────────────────────────────────────────────────────────────────
WHISPER_MODELS = ["tiny", "base", "small", "medium"]

LANGUAGES = {
    "Auto Detect": None,
    "English": "en",    "Spanish": "es",  "French": "fr",
    "German": "de",     "Italian": "it",  "Portuguese": "pt",
    "Russian": "ru",    "Japanese": "ja", "Chinese": "zh",
    "Arabic": "ar",     "Hindi": "hi",    "Korean": "ko",
    "Dutch": "nl",      "Polish": "pl",   "Turkish": "tr",
    "Swedish": "sv",    "Danish": "da",   "Finnish": "fi",
}

GTTS_LANGS = {
    "English (US)": "en",   "English (UK)": "en-gb",
    "Spanish": "es",        "French": "fr",
    "German": "de",         "Italian": "it",
    "Portuguese": "pt",     "Russian": "ru",
    "Japanese": "ja",       "Chinese": "zh-CN",
    "Arabic": "ar",         "Hindi": "hi",
    "Korean": "ko",
}

ESPEAK_VOICES = ["en", "en-us", "en-gb", "es", "fr", "de", "it", "pt", "ru", "zh", "ar", "hi"]


# ──────────────────────────────────────────────────────────────────────────────
#  Helpers
# ──────────────────────────────────────────────────────────────────────────────

def fmt_ts(s: float) -> str:
    h  = int(s // 3600)
    m  = int((s % 3600) // 60)
    sc = s % 60
    return f"{h:02d}:{m:02d}:{sc:06.3f}"


def build_srt(segments) -> str:
    lines = []
    for i, seg in enumerate(segments, 1):
        lines += [str(i),
                  f"{fmt_ts(seg.start)} --> {fmt_ts(seg.end)}",
                  seg.text.strip(), ""]
    return "\n".join(lines)


def build_vtt(segments) -> str:
    lines = ["WEBVTT", ""]
    for seg in segments:
        lines += [f"{fmt_ts(seg.start)} --> {fmt_ts(seg.end)}",
                  seg.text.strip(), ""]
    return "\n".join(lines)


def segs_to_list(segments):
    """Materialise the generator so we can iterate multiple times."""
    return list(segments)


# ──────────────────────────────────────────────────────────────────────────────
#  Speech → Text
# ──────────────────────────────────────────────────────────────────────────────

def transcribe_audio(
    audio_input, model_name, task, language,
    output_format, beam_size, temperature, word_timestamps,
):
    if audio_input is None:
        return "Please upload or record audio first.", None, None

    t0    = time.time()
    model = load_model(model_name)
    lang  = LANGUAGES.get(language)          # None → auto-detect

    # faster-whisper API
    segments_gen, info = model.transcribe(
        audio_input,
        task           = task,
        language       = lang,
        beam_size      = int(beam_size),
        temperature    = float(temperature),
        word_timestamps= bool(word_timestamps),
        vad_filter     = True,               # skip silence automatically
    )
    segments = segs_to_list(segments_gen)
    elapsed  = time.time() - t0

    full_text = " ".join(s.text.strip() for s in segments)
    detected  = info.language if not lang else language

    # Format output
    if output_format == "Plain Text":
        body, ext = full_text, "txt"
    elif output_format == "SRT Subtitles":
        body, ext = build_srt(segments), "srt"
    elif output_format == "VTT Subtitles":
        body, ext = build_vtt(segments), "vtt"
    else:  # JSON
        body = json.dumps({
            "text": full_text,
            "language": detected,
            "segments": [
                {
                    "id": i,
                    "start": round(s.start, 3),
                    "end": round(s.end, 3),
                    "text": s.text.strip(),
                    **({"words": [{"word": w.word, "start": round(w.start,3), "end": round(w.end,3)}
                                  for w in s.words]} if word_timestamps and s.words else {}),
                }
                for i, s in enumerate(segments)
            ],
        }, indent=2, ensure_ascii=False)
        ext = "json"

    tmp = tempfile.NamedTemporaryFile(
        suffix=f".{ext}", delete=False, mode="w", encoding="utf-8"
    )
    tmp.write(body)
    tmp.close()

    status = (f"Done in {elapsed:.1f}s | model={model_name} | task={task} "
              f"| detected={detected} | device={DEVICE.upper()}")
    return body, tmp.name, status


# ──────────────────────────────────────────────────────────────────────────────
#  Text → Speech
# ──────────────────────────────────────────────────────────────────────────────

def tts_gtts(text, lang_name, slow):
    if not GTTS_AVAILABLE:
        return None, "gTTS not available"
    lang = GTTS_LANGS.get(lang_name, "en")
    try:
        tts = gTTS(text=text, lang=lang, slow=slow)
        f   = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        tts.save(f.name)
        return f.name, f"gTTS OK  lang={lang}"
    except Exception as e:
        return None, f"gTTS error: {e}"


def tts_espeak(text, voice, speed, pitch):
    f    = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    safe = text.replace('"', '\\"')
    cmd  = f'espeak -v {voice} -s {speed} -p {pitch} -w {f.name} "{safe}"'
    ret  = os.system(cmd)
    if ret != 0:
        return None, "eSpeak failed — ensure packages.txt contains 'espeak'"
    return f.name, f"eSpeak OK  voice={voice}"


def synthesize_speech(text, engine, gtts_lang, gtts_slow, ev, es, ep):
    if not text.strip():
        return None, "Please enter some text."
    if "gTTS" in engine:
        return tts_gtts(text, gtts_lang, gtts_slow)
    return tts_espeak(text, ev, int(es), int(ep))


# ──────────────────────────────────────────────────────────────────────────────
#  Model info
# ──────────────────────────────────────────────────────────────────────────────

def show_model_info(name):
    yield f"Loading `{name}`..."
    try:
        load_model(name)
        yield (
            f"### ✅ Model `{name}` loaded\n\n"
            f"**Local install:**\n"
            f"```bash\npip install faster-whisper\n```\n\n"
            f"**Python usage:**\n"
            f"```python\nfrom faster_whisper import WhisperModel\n\n"
            f"model = WhisperModel('{name}', device='cpu', compute_type='int8')\n"
            f"segments, info = model.transcribe('audio.mp3')\n"
            f"for seg in segments:\n"
            f"    print(seg.start, seg.end, seg.text)\n```"
        )
    except Exception as e:
        yield f"❌ Error: {e}"


# ──────────────────────────────────────────────────────────────────────────────
#  Gradio UI
# ──────────────────────────────────────────────────────────────────────────────

with gr.Blocks(title="Whisper STT + TTS Suite") as demo:

    gr.Markdown(
        "# 🎙️ Whisper STT + TTS Suite\n"
        "**Speech → Text** via [faster-whisper](https://github.com/SYSTRAN/faster-whisper)  ·  "
        "**Text → Speech** via gTTS & eSpeak  ·  "
        "Runs on 🤗 HF Spaces **free CPU tier** (Python 3.13 ✅)"
    )

    # ── Tab 1: Speech → Text ─────────────────────────────────────────────────
    with gr.Tab("🎤 Speech → Text"):
        with gr.Row():
            with gr.Column(scale=3):
                audio_in = gr.Audio(
                    label="Audio Input",
                    sources=["upload", "microphone"],
                    type="filepath",
                )
            with gr.Column(scale=2):
                model_sel = gr.Dropdown(WHISPER_MODELS, value="base",       label="Model")
                task_sel  = gr.Radio(["transcribe","translate"], value="transcribe", label="Task")
                lang_sel  = gr.Dropdown(list(LANGUAGES.keys()), value="Auto Detect", label="Language")
                fmt_sel   = gr.Radio(
                    ["Plain Text","SRT Subtitles","VTT Subtitles","JSON"],
                    value="Plain Text", label="Output Format",
                )

        with gr.Accordion("Advanced Options", open=False):
            with gr.Row():
                beam_sl = gr.Slider(1, 10, value=5, step=1,    label="Beam Size")
                temp_sl = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Temperature")
                word_ts = gr.Checkbox(value=False,              label="Word-level Timestamps")

        stt_btn    = gr.Button("▶  Transcribe", variant="primary", size="lg")
        stt_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        stt_out    = gr.Textbox(label="Result", lines=12)
        stt_dl     = gr.File(label="⬇ Download")

        stt_btn.click(
            fn=transcribe_audio,
            inputs=[audio_in, model_sel, task_sel, lang_sel,
                    fmt_sel, beam_sl, temp_sl, word_ts],
            outputs=[stt_out, stt_dl, stt_status],
        )

    # ── Tab 2: Text → Speech ─────────────────────────────────────────────────
    with gr.Tab("🔊 Text → Speech"):
        tts_text = gr.Textbox(label="Text", placeholder="Type text here…", lines=5)
        tts_eng  = gr.Radio(
            ["gTTS (Google) — online, natural", "eSpeak — offline, instant"],
            value="gTTS (Google) — online, natural", label="Engine",
        )
        with gr.Row():
            with gr.Column():
                gr.Markdown("**gTTS settings**")
                gtts_lang = gr.Dropdown(list(GTTS_LANGS.keys()), value="English (US)", label="Language")
                gtts_slow = gr.Checkbox(value=False, label="Slow mode")
            with gr.Column():
                gr.Markdown("**eSpeak settings**")
                esp_voice = gr.Dropdown(ESPEAK_VOICES, value="en", label="Voice")
                esp_speed = gr.Slider(50, 400, value=150, step=10, label="Speed (wpm)")
                esp_pitch = gr.Slider(0, 99, value=50,  step=1,  label="Pitch")

        tts_btn    = gr.Button("🔊  Synthesize", variant="primary", size="lg")
        tts_status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        tts_out    = gr.Audio(label="Output", type="filepath")

        tts_btn.click(
            fn=synthesize_speech,
            inputs=[tts_text, tts_eng, gtts_lang, gtts_slow, esp_voice, esp_speed, esp_pitch],
            outputs=[tts_out, tts_status],
        )

    # ── Tab 3: Models ─────────────────────────────────────────────────────────
    with gr.Tab("📦 Models"):
        gr.Markdown("""
| Model  | Size   | RAM  | CPU Speed  | Best for        |
|--------|--------|------|------------|-----------------|
| tiny   | ~39 MB | ~1GB | ~32× RT    | Quick tests     |
| base   | ~74 MB | ~1GB | ~16× RT    | General use ✅  |
| small  | ~244MB | ~2GB | ~6× RT     | Better accuracy |
| medium | ~769MB | ~5GB | ~2× RT     | High accuracy   |

All four fit on the free-tier 16 GB RAM. `int8` quantisation is used automatically on CPU.
        """)
        dl_sel = gr.Dropdown(WHISPER_MODELS, value="base", label="Model")
        dl_btn = gr.Button("Load & show info", variant="secondary")
        dl_out = gr.Markdown()
        dl_btn.click(fn=show_model_info, inputs=[dl_sel], outputs=[dl_out])

    # ── Tab 4: Guide ──────────────────────────────────────────────────────────
    with gr.Tab("📖 Guide"):
        gr.Markdown("""
## Local Install

```bash
pip install faster-whisper gTTS soundfile
# Linux: sudo apt install espeak ffmpeg
```

## Python Usage

```python
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")

# Transcribe
segments, info = model.transcribe("audio.mp3")
for seg in segments:
    print(f"[{seg.start:.1f}s → {seg.end:.1f}s] {seg.text}")

# Translate to English
segments, info = model.transcribe("audio.mp3", task="translate")

# Force language
segments, info = model.transcribe("audio.mp3", language="fr")

# Word-level timestamps
segments, info = model.transcribe("audio.mp3", word_timestamps=True)
for seg in segments:
    for w in seg.words:
        print(w.word, w.start, w.end)
```

## Why faster-whisper?
- Pre-built wheel → no `pkg_resources` / setuptools issues on Python 3.13
- Uses CTranslate2 for **4× faster** CPU inference vs original Whisper
- Same accuracy (same OpenAI model weights)
- `int8` quantisation halves RAM on CPU with no accuracy loss

## Deploy to HF Spaces
Push `app.py`, `requirements.txt`, `packages.txt`, `README.md` to a **Gradio** Space.  
First cold start: ~2–3 min (pip install). Model downloads on first transcription request.
        """)

# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        theme=gr.themes.Soft(primary_hue="violet", secondary_hue="indigo"),
        css="footer{display:none!important}",
    )