File size: 2,316 Bytes
3407dd3
2bc8f15
f0e9bad
2179696
67163c1
3407dd3
 
90a78b4
3407dd3
 
 
 
 
 
 
 
 
 
 
 
 
f0e9bad
c95f5de
2767a40
04cee61
d89e139
2179696
ef69ec6
c95f5de
3407dd3
67163c1
 
3407dd3
 
 
 
 
67163c1
5be18fb
 
3407dd3
5be18fb
168cab1
3407dd3
5be18fb
3407dd3
 
d89e139
c95f5de
 
2179696
3407dd3
c95f5de
3407dd3
 
5be18fb
 
 
3407dd3
 
 
 
 
5be18fb
f0e9bad
3407dd3
 
04cee61
d89e139
3407dd3
860aaf0
90a78b4
3407dd3
 
 
90a78b4
 
3407dd3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# app.py for HF Spaces (ZeroGPU safe pyannote)
import os
import tempfile
from collections import OrderedDict  # the class pickle checkpoints actually reference

import gradio as gr
import spaces
import torch
from torch.serialization import safe_globals
from transformers import pipeline

from pyannote.audio import Audio, Pipeline
from pyannote.audio.core.model import Model
from pyannote.audio.core.task import Specifications, Task
from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization

# Required patches for ZeroGPU
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
torch.serialization.add_safe_globals({
    "OrderedDict": OrderedDict,
})

MODEL_NAME = "palli23/whisper-small-sam_spjall"

@spaces.GPU(duration=120)
def transcribe_with_diarization(audio_path):
    """Diarize *audio_path* with pyannote 3.1, then transcribe each speaker
    turn with the Icelandic Whisper model.

    Args:
        audio_path: Filesystem path to the uploaded audio file (from
            ``gr.Audio(type="filepath")``), or ``None``/empty when nothing
            was uploaded.

    Returns:
        One line per speaker turn, ``[MÆLENDI <speaker>] <text>``, joined
        with newlines — or an Icelandic notice for empty input/output.
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá"

    # Fix strict unpickling in torch 2.6 (ZeroGPU): whitelist the classes
    # contained in the pyannote checkpoint for weights_only loading.
    with safe_globals([
        torch.torch_version.TorchVersion,
        Model,
        Task,
        Specifications,
        SpeakerDiarization,
        OrderedDict,
    ]):
        # BUG FIX: pyannote 3.x's Pipeline.to() requires a torch.device
        # instance, not the string "cuda".
        diarization = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=os.getenv("HF_TOKEN"),
        ).to(torch.device("cuda"))

    # Run diarization over the whole file.
    dia = diarization(audio_path)

    # Whisper ASR pipeline (built once, reused for every segment).
    # NOTE: `use_auth_token` is deprecated in transformers — use `token`.
    asr = pipeline(
        "automatic-speech-recognition",
        model=MODEL_NAME,
        device=0,
        token=os.getenv("HF_TOKEN"),
    )

    # BUG FIX: a pyannote Pipeline has no .crop() method; excerpting a turn
    # is done by pyannote.audio.Audio, which returns (waveform, sample_rate)
    # — there is nothing with an .export() to write a temp wav, and none is
    # needed: the ASR pipeline accepts raw arrays directly.
    loader = Audio(sample_rate=16000, mono="downmix")

    # Segment-by-segment ASR.
    result = []
    for turn, _, speaker in dia.itertracks(yield_label=True):
        waveform, sample_rate = loader.crop(audio_path, turn)
        text = asr({
            "raw": waveform.squeeze(0).numpy(),
            "sampling_rate": sample_rate,
        })["text"].strip()
        result.append(f"[MÆLENDI {speaker}] {text}")

    return "\n".join(result) or "Enginn texti heyrðist."

# Gradio UI: single audio upload -> diarized transcript textbox.
with gr.Blocks() as demo:
    gr.Markdown("# Íslenskt ASR + Mælendagreining")
    gr.Markdown("Whisper-small + pyannote 3.1 (ZeroGPU örugg útgáfa)")

    audio_input = gr.Audio(type="filepath", label="Hljóðskrá")
    transcribe_button = gr.Button("Transcribe með mælendum")
    transcript_output = gr.Textbox(lines=35, label="Úttak")

    transcribe_button.click(
        transcribe_with_diarization,
        inputs=audio_input,
        outputs=transcript_output,
    )

# NOTE(review): credentials are hard-coded in source; consider reading them
# from environment variables / Space secrets instead.
demo.launch(auth=("beta", "beta2025"))