Spaces: Running on Zero
| # app.py — Whisper-small + WhisperX Diarization + Timestamps | |
| # Public, no login, contact email | |
| import os | |
| os.environ["OMP_NUM_THREADS"] = "1" | |
| import gradio as gr | |
| import spaces | |
| import whisperx | |
| from transformers import pipeline | |
| import torch | |
| # Keep Space awake | |
| import threading, time, requests | |
def keep_awake():
    """Ping this Space's own URL every 45 minutes so HF doesn't put it to sleep.

    Runs forever in a daemon thread; each ping is best-effort and failures
    are ignored so a transient network error never kills the keep-alive loop.
    """
    while True:
        time.sleep(45 * 60)
        host = os.getenv("SPACE_HOST")
        if not host:
            # Not running on Spaces (or env not set) — nothing to ping.
            continue
        try:
            # Bounded timeout so a hung connection can't stall the loop forever.
            requests.get(f"https://{host}", timeout=30)
        except requests.RequestException:
            # Best-effort keep-alive: network errors are expected and ignored.
            # (Narrow catch — a bare `except:` would also swallow SystemExit.)
            pass

threading.Thread(target=keep_awake, daemon=True).start()
# --- Model setup ------------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 16
# float16 is GPU-only in CTranslate2; fall back to int8 on CPU so
# whisperx.load_model does not abort when no GPU is available.
compute_type = "float16" if device == "cuda" else "int8"

# Fine-tuned Icelandic Whisper-small via the transformers pipeline.
# NOTE(review): `asr` is loaded but never called below (transcription uses
# the WhisperX model instead) — confirm it is still needed before removing.
asr = pipeline(
    "automatic-speech-recognition",
    model="palli23/whisper-small-sam_spjall",
    # fp16 + CUDA device only when a GPU exists; plain fp32 on CPU.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device=0 if device == "cuda" else -1,
    chunk_length_s=30,
    batch_size=8,
)

# WhisperX model: transcription + word-level timestamp alignment.
model = whisperx.load_model("base", device, compute_type=compute_type)

# Speaker-diarization pipeline (pyannote under the hood; needs an HF token
# for the gated model). min_speakers/max_speakers are NOT constructor
# arguments — they are passed at call time: diarize_model(audio, min_speakers=...).
diarize_model = whisperx.DiarizationPipeline(
    use_auth_token=os.getenv("HF_TOKEN") or True,
    device=device,
)
@spaces.GPU  # required on ZeroGPU Spaces so this function gets a GPU slice
def transcribe_with_whisperx(audio_path, use_diarization=False):
    """Transcribe an uploaded audio file with WhisperX.

    Args:
        audio_path: Path to the uploaded audio file (Gradio filepath), or
            None/"" when nothing was uploaded.
        use_diarization: When True, also run speaker diarization and prefix
            each line with the speaker label.

    Returns:
        One line per segment, formatted "START s – END s: TEXT", optionally
        prefixed with "[SPEAKER] ". An Icelandic prompt when no file is given.
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá"

    # Decode audio and transcribe with the WhisperX model.
    audio = whisperx.load_audio(audio_path)
    result = model.transcribe(audio, batch_size=batch_size)

    # Forced alignment for accurate word/segment timestamps.
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(
        result["segments"], model_a, metadata, audio, device,
        return_char_alignments=False,
    )

    if use_diarization:
        # Speaker-count bounds are call-time arguments of the pipeline.
        diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=5)
        result = whisperx.assign_word_speakers(diarize_segments, result)

    # Single formatting pass for both modes — the timestamp/text layout is
    # identical; diarization only adds a "[SPEAKER] " prefix.
    lines = []
    for segment in result["segments"]:
        line = f"{segment['start']:.1f}s – {segment['end']:.1f}s: {segment['text']}"
        if use_diarization:
            line = f"[{segment.get('speaker', 'Unknown')}] " + line
        lines.append(line)
    return "\n".join(lines)
| # UI | |
| with gr.Blocks(title="Íslensk talgreining + WhisperX") as demo: | |
| gr.Markdown("# Íslensk talgreining + WhisperX") | |
| gr.Markdown("**Whisper-small + diarization + timestamps • pallinr1@protonmail.com**") | |
| audio = gr.Audio(type="filepath", label="Hladdu upp hljóð (max 15 mín)") | |
| diarize = gr.Checkbox(label="Virkja diarization (speakers + timestamps)", value=True) | |
| btn = gr.Button("Transcribe", variant="primary") | |
| out = gr.Textbox(lines=25, label="Útskrift") | |
| btn.click(transcribe_with_whisperx, inputs=[audio, diarize], outputs=out) | |
| demo.launch(auth=None, share=True) |