# app.py for HF Spaces (ZeroGPU-safe pyannote)
import os
import gradio as gr
import spaces
import tempfile
import torch
from torch.serialization import safe_globals
from pyannote.audio.core.model import Model
from pyannote.audio.core.task import Task, Specifications
from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
from collections import OrderedDict  # the class torch actually unpickles (typing.OrderedDict is only an alias)
from transformers import pipeline
from pyannote.audio import Audio, Pipeline
import torchaudio
# Required patches for ZeroGPU
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
# torch >= 2.6 defaults torch.load to weights_only=True, so classes found in
# the checkpoints have to be allow-listed before loading.
torch.serialization.add_safe_globals([OrderedDict])
MODEL_NAME = "palli23/whisper-small-sam_spjall"
@spaces.GPU(duration=120)
def transcribe_with_diarization(audio_path):
    if not audio_path:
        return "Hladdu upp hljóðskrá"  # "Upload an audio file"

    # Fix strict unpickling in torch 2.6 (ZeroGPU): allow-list the classes
    # stored in the pyannote checkpoints while the pipeline is loaded.
    with safe_globals([
        torch.torch_version.TorchVersion,
        Model,
        Task,
        Specifications,
        SpeakerDiarization,
        OrderedDict,
    ]):
        diarization = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=os.getenv("HF_TOKEN"),
        ).to(torch.device("cuda"))  # Pipeline.to() expects a torch.device, not a string
    # Run diarization on the full recording
    dia = diarization(audio_path)

    # Whisper ASR model (token= replaces the deprecated use_auth_token)
    asr = pipeline(
        "automatic-speech-recognition",
        model=MODEL_NAME,
        device=0,
        token=os.getenv("HF_TOKEN"),
    )
    # Segment-by-segment ASR: crop each speaker turn out of the original
    # audio with pyannote's Audio helper and transcribe it separately.
    audio_io = Audio(sample_rate=16000, mono="downmix")
    result = []
    for turn, _, speaker in dia.itertracks(yield_label=True):
        waveform, sample_rate = audio_io.crop(audio_path, turn)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            torchaudio.save(f.name, waveform, sample_rate)
            chunk = f.name
        text = asr(chunk)["text"].strip()
        os.unlink(chunk)
        result.append(f"[MÆLENDI {speaker}] {text}")  # MÆLENDI = "speaker"
    return "\n".join(result) or "Enginn texti heyrðist."  # "No speech was heard."
with gr.Blocks() as demo:
gr.Markdown("# Íslenskt ASR + Mælendagreining")
gr.Markdown("Whisper-small + pyannote 3.1 (ZeroGPU örugg útgáfa)")
audio = gr.Audio(type="filepath", label="Hljóðskrá")
btn = gr.Button("Transcribe með mælendum")
out = gr.Textbox(lines=35, label="Úttak")
btn.click(transcribe_with_diarization, inputs=audio, outputs=out)
demo.launch(auth=("beta", "beta2025"))
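
# A plausible requirements.txt for this Space (an assumption, versions not
# pinned or tested) would list the libraries imported above:
#   gradio
#   spaces
#   torch
#   torchaudio
#   transformers
#   pyannote.audio>=3.1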