File size: 1,646 Bytes
04cee61
2bc8f15
f0e9bad
2179696
ddeefba
d89e139
 
f0e9bad
c95f5de
2767a40
04cee61
d89e139
2179696
ef69ec6
c95f5de
04cee61
d89e139
 
04cee61
d89e139
168cab1
d89e139
 
04cee61
d89e139
c95f5de
 
2179696
c95f5de
 
2179696
168cab1
d89e139
 
 
 
 
 
 
04cee61
2179696
d89e139
f0e9bad
04cee61
d89e139
04cee61
2179696
04cee61
 
 
365da29
04cee61
365da29
ef69ec6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# app.py – Speaker diarization WORKING on ZeroGPU (2025 fix)
import os
import gradio as gr
import spaces
from transformers import pipeline
from pyannote.audio import Pipeline
import tempfile

MODEL_NAME = "palli23/whisper-small-sam_spjall"

@spaces.GPU(duration=120)
def transcribe_with_diarization(audio_path):
    """Transcribe an audio file and prefix every passage with its speaker label.

    The file is first split into speaker turns by the pyannote diarization
    pipeline; each turn is then cut out of the audio in memory and passed to
    the Whisper ASR pipeline identified by ``MODEL_NAME``.

    Args:
        audio_path: Path to the uploaded audio file. A falsy value (nothing
            uploaded) short-circuits with a prompt to upload a file.

    Returns:
        One line per speaker turn in the form ``[MÆLENDI <speaker>] <text>``,
        or a fallback message when no turn was produced.
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá"

    # Imported here so the block is self-contained; both packages are already
    # dependencies of the pipelines used below.
    import torch
    from pyannote.audio import Audio

    hf_token = os.getenv("HF_TOKEN")

    # Speaker diarization.
    diarization = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        token=hf_token,
    )
    # Pipeline.to() expects a torch.device instance, not a device string.
    diarization.to(torch.device("cuda"))

    dia_result = diarization(audio_path)
    # NOTE(review): some pyannote releases appear to wrap the annotation in an
    # output object exposing `.speaker_diarization`; fall back to the result
    # itself when that attribute is absent. Confirm against the pinned version.
    annotation = getattr(dia_result, "speaker_diarization", dia_result)

    # Whisper-small speech recognition.
    asr = pipeline(
        "automatic-speech-recognition",
        model=MODEL_NAME,
        device=0,
        token=hf_token,
    )

    # Annotation.crop() only restricts the annotation to a time range; it does
    # not cut audio. Use pyannote's Audio reader to extract each turn as a mono
    # waveform at the sampling rate the ASR feature extractor expects, which
    # also removes the need for temporary files.
    reader = Audio(sample_rate=asr.feature_extractor.sampling_rate, mono="downmix")

    transcript_lines = []
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        # mode="pad" keeps a turn that slightly overruns the end of the file
        # from raising; the overrun is filled with silence.
        waveform, sample_rate = reader.crop(audio_path, turn, mode="pad")
        samples = waveform.squeeze(0).numpy()  # (1, time) -> (time,)
        text = asr({"raw": samples, "sampling_rate": sample_rate})["text"].strip()
        transcript_lines.append(f"[MÆLENDI {speaker}] {text}\n")

    return "".join(transcript_lines) or "Ekkert heyrt"

# Single-page UI: an audio upload, a trigger button and a text box that shows
# the speaker-labelled transcript.
with gr.Blocks() as demo:
    gr.Markdown("# Íslenskt ASR + Mælendagreining")
    gr.Markdown("**Whisper-small + pyannote 3.1 · 2025 fix**")

    audio = gr.Audio(type="filepath")
    btn = gr.Button("Transcribe með mælendum", variant="primary")
    out = gr.Textbox(lines=35)

    btn.click(transcribe_with_diarization, audio, out)

# Login credentials come from the environment so they can be rotated without a
# code change. The literals are only the previous defaults, kept so existing
# deployments behave the same; override them via APP_AUTH_USER and
# APP_AUTH_PASSWORD.
demo.launch(
    auth=(
        os.getenv("APP_AUTH_USER", "beta"),
        os.getenv("APP_AUTH_PASSWORD", "beta2025"),
    )
)