|
|
|
|
|
import os
import tempfile

import gradio as gr
import spaces
from pyannote.audio import Audio, Pipeline
from transformers import pipeline
|
|
|
|
|
# Hugging Face Hub ID of the fine-tuned Icelandic Whisper-small checkpoint.
MODEL_NAME = "palli23/whisper-small-sam_spjall"
|
|
|
|
|
@spaces.GPU(duration=120)
def transcribe_with_diarization(audio_path):
    """Transcribe an audio file and label each speaker turn.

    Runs pyannote speaker diarization over the whole file, then transcribes
    each diarized turn with the Icelandic Whisper model and prefixes it with
    its speaker label.

    Args:
        audio_path: Path to the uploaded audio file (Gradio ``filepath``),
            or a falsy value when nothing was uploaded.

    Returns:
        One line per speaker turn, formatted ``[MÆLENDI <label>] <text>``,
        or an Icelandic message when no file / no speech was found.
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá"

    # Both models are loaded inside the GPU-decorated function because
    # ZeroGPU only provides CUDA for the duration of this call.
    diarization = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        token=os.getenv("HF_TOKEN"),
    ).to("cuda")
    dia_result = diarization(audio_path)

    asr = pipeline(
        "automatic-speech-recognition",
        model=MODEL_NAME,
        device=0,
        token=os.getenv("HF_TOKEN"),
    )

    # Audio loader used to crop each speaker turn out of the original file.
    # NOTE: the previous code called ``dia_result.crop(audio_path, turn)``,
    # but Annotation.crop() returns a cropped *Annotation* (no ``.export``),
    # so every turn crashed at runtime. Audio.crop() yields the waveform.
    # 16 kHz mono matches Whisper's expected input.
    audio_loader = Audio(sample_rate=16000, mono="downmix")

    lines = []
    for turn, _, speaker in dia_result.itertracks(yield_label=True):
        # waveform shape is (channel=1, samples); the ASR pipeline accepts
        # a raw numpy array plus its sampling rate directly — no temp files.
        waveform, sample_rate = audio_loader.crop(audio_path, turn)
        text = asr(
            {"raw": waveform.squeeze(0).numpy(), "sampling_rate": sample_rate}
        )["text"].strip()
        lines.append(f"[MÆLENDI {speaker}] {text}")

    full_text = "\n".join(lines)
    return full_text or "Ekkert heyrt"
|
|
|
|
|
# --- Gradio UI --------------------------------------------------------------
# `demo` is kept at module level so Hugging Face Spaces can discover it.
with gr.Blocks() as demo:
    gr.Markdown("# Íslenskt ASR + Mælendagreining")
    gr.Markdown("**Whisper-small + pyannote 3.1 · 2025 fix**")

    # Input file, trigger button, and transcript output.
    audio_input = gr.Audio(type="filepath")
    transcribe_btn = gr.Button("Transcribe með mælendum", variant="primary")
    transcript_box = gr.Textbox(lines=35)

    transcribe_btn.click(transcribe_with_diarization, audio_input, transcript_box)

# Basic auth gate for the beta deployment.
demo.launch(auth=("beta", "beta2025"))