Za6na
/

sorani2

Safetensors

whisper

Model card Files Files and versions

xet

Community

Za6na committed on Feb 7

Commit

27b824a

verified ·

1 Parent(s): cbc21a0

Upload demo.py with huggingface_hub

Browse files

Files changed (1) hide show

demo.py +289 -0

demo.py ADDED Viewed

	@@ -0,0 +1,289 @@

+"""
+Gradio demo for fine-tuned Whisper models — Kurdish Sorani & Persian transcription.
+"""
+import gc
+import time
+from pathlib import Path
+import gradio as gr
+import numpy as np
+import torch
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
+# ---------------------------------------------------------------------------
+# Model registry
+# ---------------------------------------------------------------------------
# Every checkpoint lives in a "models" directory next to this script.
_MODELS_DIR = Path(__file__).parent / "models"

# Display name (shown in the UI dropdown) -> checkpoint description.
#   "path": local directory handed to ``from_pretrained``
#   "base": upstream checkpoint id (not read by the code in this file)
MODELS = {
    "Small (whisper-small, PEFT-merged)": dict(
        path=_MODELS_DIR / "whisper-small-peft-kurdish-on-persian-converted",
        base="openai/whisper-small",
    ),
    "Large-v3 (full fine-tune)": dict(
        path=_MODELS_DIR / "whisper-largev3-on-persian-centralkurdish-full",
        base="openai/whisper-large-v3",
    ),
}

# Display name -> Whisper language code passed to ``generate``.
# Both entries map to "fa": there is no native <|ku|> token and the models
# were trained with <|fa|>.
LANGUAGES = {
    "Kurdish Sorani (کوردی سۆرانی)": "fa",
    "Persian (فارسی)": "fa",
}

# Audio is resampled to this rate and split into windows of this length.
SAMPLE_RATE = 16_000
CHUNK_SECONDS = 30
CHUNK_SAMPLES = SAMPLE_RATE * CHUNK_SECONDS
+# ---------------------------------------------------------------------------
+# ModelManager — lazy loading, one model in memory at a time
+# ---------------------------------------------------------------------------
class ModelManager:
    """Lazily load Whisper checkpoints, keeping at most one in memory.

    Attributes:
        processor: Feature extractor / tokenizer of the loaded checkpoint,
            or ``None`` when nothing is loaded.
        model: The loaded Whisper model, or ``None``.
        current_name: ``MODELS`` key of the loaded checkpoint, or ``None``.
        device: ``cuda`` when available at construction time, else ``cpu``.
    """

    def __init__(self):
        self.processor: WhisperProcessor | None = None
        self.model: WhisperForConditionalGeneration | None = None
        self.current_name: str | None = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Parameter dtype of the loaded model; input features are cast to it.
        self._dtype: torch.dtype | None = None

    # --- public -----------------------------------------------------------
    def load(self, name: str) -> str:
        """Load *name*, unloading any previously loaded model first.

        Args:
            name: A key of ``MODELS``.

        Returns:
            A human-readable status line (model name, device, GPU memory).

        Raises:
            KeyError: If *name* is not a key of ``MODELS``. The currently
                loaded model, if any, is left untouched in that case.
            Exception: Whatever ``from_pretrained`` raises; the manager is
                reset to the "nothing loaded" state before re-raising.
        """
        if self.model is not None and name == self.current_name:
            return self._status()

        # Resolve the registry entry *before* unloading so that an unknown
        # name does not evict a perfectly good model.
        cfg = MODELS[name]
        model_path = str(cfg["path"])

        self._unload()
        try:
            self.processor = WhisperProcessor.from_pretrained(model_path)
            model = self._load_model(model_path)

            # Ensure generate uses KV-cache regardless of saved config.
            model.config.use_cache = True
            # Clear stale forced_decoder_ids so they don't conflict
            # with the language/task kwargs we pass to generate().
            model.generation_config.forced_decoder_ids = None

            # On a CPU-only host make sure no weights ended up elsewhere.
            if self.device.type != "cuda" and next(model.parameters()).device.type != "cpu":
                model.to(self.device)
            model.eval()

            self.model = model
            self._dtype = next(model.parameters()).dtype
        except Exception:
            # Never leave a half-initialised manager behind (e.g. a processor
            # without a model); release whatever was allocated and re-raise.
            self._unload()
            raise

        self.current_name = name
        return self._status()

    def generate(self, audio: np.ndarray, language_code: str) -> str:
        """Run inference on a float32 mono 16 kHz numpy array.

        The audio is split into ``CHUNK_SAMPLES``-long windows that are
        transcribed independently and joined with single spaces.

        Args:
            audio: 1-D array of samples.
            language_code: Whisper language code forwarded to ``generate``.

        Returns:
            The transcription; an empty string for zero-length audio.

        Raises:
            RuntimeError: If no model has been loaded.
        """
        if self.model is None or self.processor is None:
            raise RuntimeError("No model loaded.")

        audio = np.asarray(audio)
        if audio.size == 0:
            # Nothing to transcribe; skip the feature extractor entirely.
            return ""

        parts: list[str] = []
        for chunk in self._chunk(audio):
            inputs = self.processor(
                chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt",
            )
            input_features = inputs.input_features.to(self.device, dtype=self._dtype)
            with torch.no_grad():
                predicted_ids = self.model.generate(
                    input_features,
                    language=language_code,
                    task="transcribe",
                    # NOTE(review): presumably chosen to stay under Whisper's
                    # decoder length limit once prompt tokens are added — confirm.
                    max_new_tokens=440,
                )
            text = self.processor.batch_decode(
                predicted_ids, skip_special_tokens=True,
            )[0].strip()
            if text:
                parts.append(text)
        return " ".join(parts)

    # --- private ----------------------------------------------------------
    def _load_model(self, model_path: str):
        """Instantiate the model at *model_path*, with a non-quantised fallback."""
        on_cuda = self.device.type == "cuda"
        # The small PEFT model was saved with load_in_8bit in its config.
        # bitsandbytes doesn't work on Windows / CPU, so we catch and
        # fall back to float16 (or float32 on CPU).
        try:
            return WhisperForConditionalGeneration.from_pretrained(
                model_path,
                device_map="auto" if on_cuda else None,
            )
        except (ImportError, ValueError, RuntimeError):
            # Quantisation failed — reload without it.
            # NOTE(review): confirm that quantization_config=None really
            # overrides a quantization config stored in the checkpoint.
            model = WhisperForConditionalGeneration.from_pretrained(
                model_path,
                quantization_config=None,
                torch_dtype=torch.float16 if on_cuda else torch.float32,
                low_cpu_mem_usage=True,
            )
            model.to(self.device)
            return model

    def _unload(self):
        """Drop the model and processor and return freed memory to the system."""
        self.model = None
        self.processor = None
        self.current_name = None
        self._dtype = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _status(self) -> str:
        """Return ``"<name>  •  <device>"`` plus allocated GPU memory if CUDA exists."""
        mem = ""
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1024**3
            mem = f" | GPU memory: {allocated:.1f} GB"
        return f"{self.current_name}  •  {self.device}{mem}"

    @staticmethod
    def _chunk(audio: np.ndarray, chunk_samples: "int | None" = None) -> list[np.ndarray]:
        """Split *audio* into consecutive windows of at most *chunk_samples*.

        Args:
            audio: 1-D array of samples.
            chunk_samples: Window length in samples; defaults to the
                module-level ``CHUNK_SAMPLES``.

        Returns:
            ``[audio]`` when it already fits in one window, otherwise the
            list of non-overlapping slices (the last one may be shorter).

        Raises:
            ValueError: If *chunk_samples* is not positive.
        """
        size = CHUNK_SAMPLES if chunk_samples is None else chunk_samples
        if size <= 0:
            raise ValueError("chunk_samples must be positive")
        if len(audio) <= size:
            return [audio]
        return [audio[i : i + size] for i in range(0, len(audio), size)]
+# ---------------------------------------------------------------------------
+# Audio normalisation helper
+# ---------------------------------------------------------------------------
def prepare_audio(audio) -> np.ndarray:
    """Accept a filepath from Gradio and return float32 mono 16 kHz numpy array.

    The file is decoded by ``ffmpeg``, which resamples to ``SAMPLE_RATE``,
    downmixes to one channel and streams raw signed 16-bit little-endian PCM
    on stdout. This handles ogg, webm, mp3, flac, m4a, opus — anything
    ffmpeg supports — without a temporary file.

    Args:
        audio: Path of the recorded / uploaded file (falsy when nothing
            was provided).

    Returns:
        1-D float32 array of samples scaled to ``[-1.0, 1.0)``.

    Raises:
        gr.Error: If no audio was provided, the file does not exist,
            ``ffmpeg`` is not installed, or ``ffmpeg`` cannot decode the file.
    """
    import subprocess

    if not audio:
        raise gr.Error("No audio provided — please record or upload a file first.")
    audio_path = Path(audio)
    if not audio_path.exists():
        raise gr.Error(f"Audio file not found: {audio}")

    command = [
        "ffmpeg",
        "-nostdin",            # never let ffmpeg consume the server's stdin
        "-v", "error",         # keep stderr limited to real problems
        "-i", str(audio_path),
        "-ar", str(SAMPLE_RATE),
        "-ac", "1",
        "-c:a", "pcm_s16le",
        "-f", "s16le",         # headerless PCM, so stdout is pure sample data
        "pipe:1",
    ]
    try:
        result = subprocess.run(command, capture_output=True, check=True)
    except FileNotFoundError as err:
        # The executable itself is missing (the input path was checked above).
        raise gr.Error(
            "ffmpeg was not found — install it and make sure it is on PATH."
        ) from err
    except subprocess.CalledProcessError as err:
        detail = (err.stderr or b"").decode("utf-8", errors="replace").strip()
        raise gr.Error(f"Could not decode audio file: {detail or err}") from err

    # int16 -> float32 in [-1, 1): same scaling a WAV reader applies.
    pcm = np.frombuffer(result.stdout, dtype="<i2")
    return pcm.astype(np.float32) / 32768.0
+# ---------------------------------------------------------------------------
+# Gradio callback
+# ---------------------------------------------------------------------------
# Single shared manager: at most one model is resident for the whole app.
manager = ModelManager()


def transcribe(audio_mic, audio_file, model_name: str, language: str):
    """Gradio callback: transcribe the provided audio with the chosen model.

    Args:
        audio_mic: Path of the microphone recording, or ``None``.
        audio_file: Path of the uploaded file, or ``None``; takes
            precedence over *audio_mic* when both are given.
        model_name: A key of ``MODELS``.
        language: A key of ``LANGUAGES``.

    Returns:
        ``(text, status)`` — the transcription and a status line made of the
        model status plus the time spent decoding and transcribing.

    Raises:
        gr.Error: If the model or language selection is invalid, or the
            audio is missing / cannot be decoded.
    """
    # Prefer uploaded file; fall back to microphone recording.
    audio = audio_file if audio_file is not None else audio_mic

    if model_name not in MODELS:
        raise gr.Error("Please select a model.")
    if language not in LANGUAGES:
        raise gr.Error("Please select a language.")
    lang_code = LANGUAGES[language]

    # Decode the audio before touching the model: a missing or broken file
    # then fails fast instead of first paying for a model (re)load.
    decode_start = time.perf_counter()
    samples = prepare_audio(audio)
    decode_time = time.perf_counter() - decode_start

    status = manager.load(model_name)

    infer_start = time.perf_counter()
    text = manager.generate(samples, lang_code)
    # Reported time covers decoding + inference, not model loading.
    elapsed = decode_time + (time.perf_counter() - infer_start)

    status += f"  |  {elapsed:.1f}s"
    return text, status
+# ---------------------------------------------------------------------------
+# UI
+# ---------------------------------------------------------------------------
# Stylesheet for the transcription box (element id "output-box"):
# right-to-left text with fonts suited to Arabic-script output.
RTL_CSS = """
#output-box textarea {
    direction: rtl;
    text-align: right;
    font-family: 'Vazirmatn', 'Noto Sans Arabic', Tahoma, sans-serif;
    font-size: 1.15rem;
    line-height: 1.9;
}
"""


def build_ui() -> gr.Blocks:
    """Assemble the Gradio Blocks app and wire the Transcribe button.

    Layout, top to bottom: heading, a row with the model and language
    dropdowns, a row with the microphone recorder and the file uploader,
    the Transcribe button, the transcription box and a read-only status box.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` application.
    """
    model_names = list(MODELS)
    language_names = list(LANGUAGES)
    accepted_suffixes = [
        ".wav", ".ogg", ".oga", ".mp3", ".flac", ".m4a",
        ".opus", ".webm", ".wma", ".aac", ".amr",
    ]

    with gr.Blocks(title="Whisper Kurdish & Persian") as demo:
        gr.Markdown("## Whisper — Kurdish Sorani & Persian Transcription")

        # Row 1: what to run and in which language (first entry preselected).
        with gr.Row():
            model_choice = gr.Dropdown(
                choices=model_names,
                value=model_names[0],
                label="Model",
            )
            language_choice = gr.Dropdown(
                choices=language_names,
                value=language_names[0],
                label="Language",
            )

        # Row 2: the two alternative audio sources.
        with gr.Row():
            mic_input = gr.Audio(
                label="Record from microphone",
                sources=["microphone"],
                type="filepath",
            )
            file_input = gr.File(
                label="Or upload audio file (wav, ogg, mp3, flac, m4a, opus …)",
                file_types=accepted_suffixes,
            )

        run_button = gr.Button("Transcribe", variant="primary")

        transcript_box = gr.Textbox(
            label="Transcription",
            lines=6,
            buttons=["copy"],
            elem_id="output-box",
            rtl=True,
        )
        status_box = gr.Textbox(label="Status", interactive=False, lines=1)

        # Input order must match the parameter order of ``transcribe``.
        run_button.click(
            fn=transcribe,
            inputs=[mic_input, file_input, model_choice, language_choice],
            outputs=[transcript_box, status_box],
        )

    return demo
+# ---------------------------------------------------------------------------
+# Entry
+# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Build first, then launch with explicit options.
    demo_app = build_ui()
    launch_options = dict(
        server_name="0.0.0.0",   # listen on all interfaces, not just localhost
        server_port=7865,
        show_error=True,         # surface callback errors in the browser
        theme=gr.themes.Soft(),
        css=RTL_CSS,
    )
    demo_app.launch(**launch_options)