# (Removed Hugging Face file-viewer page residue that was pasted into the
#  source: "palli23's picture / Update app.py / e4aa950 verified / raw /
#  history blame / 3.09 kB" — not part of the program.)
# app.py — Whisper-small + WhisperX Diarization + Timestamps
# Public, no login, contact email
import os
os.environ["OMP_NUM_THREADS"] = "1"
import gradio as gr
import spaces
import whisperx
from transformers import pipeline
import torch
# Keep Space awake
import threading, time, requests
def keep_awake():
    """Ping the Space's own public URL every 45 minutes so HF keeps it warm.

    Runs forever in a daemon thread; failures are expected (network blips,
    missing SPACE_HOST) and deliberately ignored — this is best-effort only.
    """
    while True:
        time.sleep(45 * 60)
        try:
            # timeout: a requests.get without one can block this thread forever.
            requests.get(f"https://{os.getenv('SPACE_HOST')}", timeout=30)
        except requests.RequestException:
            # Best-effort keep-alive: swallow only network-level errors,
            # not everything (the original bare `except:` hid real bugs).
            pass

threading.Thread(target=keep_awake, daemon=True).start()
# Load your Whisper-small
# Load the fine-tuned Icelandic Whisper-small ASR pipeline.
# NOTE(review): `asr` is never referenced below — transcription goes through
# the WhisperX `model` instead. Kept loaded for compatibility; confirm intent.
_HAS_CUDA = torch.cuda.is_available()
asr = pipeline(
    "automatic-speech-recognition",
    model="palli23/whisper-small-sam_spjall",
    # float16 requires a GPU; hard-coding it crashed on CPU-only hosts.
    torch_dtype=torch.float16 if _HAS_CUDA else torch.float32,
    device=0 if _HAS_CUDA else -1,  # -1 = CPU for transformers pipelines
    chunk_length_s=30,
    batch_size=8,
)
# WhisperX setup (diarization + timestamps)
# WhisperX setup (word-level timestamps + speaker diarization)
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 16
# ctranslate2 rejects float16 on CPU — fall back to int8 there.
compute_type = "float16" if device == "cuda" else "int8"

# Load the WhisperX transcription model.
model = whisperx.load_model("base", device, compute_type=compute_type)

# Load the diarization pipeline.
# NOTE: DiarizationPipeline's constructor takes only model_name /
# use_auth_token / device; `min_speakers` / `max_speakers` are per-call
# arguments to diarize_model(audio, ...) — passing them here raised TypeError.
diarize_model = whisperx.DiarizationPipeline(
    use_auth_token=True,
    device=device,
)
def transcribe_with_whisperx(audio_path, use_diarization=False):
    """Transcribe an audio file with WhisperX, optionally with diarization.

    Args:
        audio_path: Filesystem path to the uploaded audio (falsy when the
            user submitted nothing).
        use_diarization: When True, run speaker diarization and prefix each
            segment with its speaker label.

    Returns:
        One line per segment, "start – end: text", optionally prefixed with
        "[SPEAKER]"; or an Icelandic upload prompt when no file was given.
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá"

    audio = whisperx.load_audio(audio_path)

    # Coarse transcription with the WhisperX model.
    result = model.transcribe(audio, batch_size=batch_size)

    # Re-align segments for word-level timestamps in the detected language.
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(
        result["segments"], model_a, metadata, audio, device,
        return_char_alignments=False,
    )

    if not use_diarization:
        return "\n".join(
            f"{seg['start']:.1f}s – {seg['end']:.1f}s: {seg['text']}"
            for seg in result["segments"]
        )

    # Speaker bounds belong here, as call-time arguments — they are not
    # DiarizationPipeline constructor parameters.
    diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=5)
    result = whisperx.assign_word_speakers(diarize_segments, result)

    return "\n".join(
        f"[{seg.get('speaker', 'Unknown')}] "
        f"{seg['start']:.1f}s – {seg['end']:.1f}s: {seg['text']}"
        for seg in result["segments"]
    )
# UI
# Gradio UI: upload audio, optionally enable diarization, show the transcript.
with gr.Blocks(title="Íslensk talgreining + WhisperX") as demo:
    gr.Markdown("# Íslensk talgreining + WhisperX")
    gr.Markdown("**Whisper-small + diarization + timestamps • pallinr1@protonmail.com**")

    audio_input = gr.Audio(type="filepath", label="Hladdu upp hljóð (max 15 mín)")
    diarize_toggle = gr.Checkbox(
        label="Virkja diarization (speakers + timestamps)", value=True
    )
    transcribe_btn = gr.Button("Transcribe", variant="primary")
    transcript_box = gr.Textbox(lines=25, label="Útskrift")

    transcribe_btn.click(
        transcribe_with_whisperx,
        inputs=[audio_input, diarize_toggle],
        outputs=transcript_box,
    )

demo.launch(auth=None, share=True)