# Source file: ASR_API2 / app.py (Hugging Face Space, uploader: palli23)
# Commit: d817784 "diarization1Mæló" — 3.36 kB
# (The lines above were web-viewer chrome captured by copy/paste; kept here
#  as a comment so the file remains valid Python.)
# ============================================================
# app.py – Whisper-small + Pyannote 3.1 (ZeroGPU örugg útgáfa)
# ============================================================
import os
import gradio as gr
import spaces
import tempfile
import torch
from transformers import pipeline
from pyannote.audio import Pipeline
from torch.serialization import safe_globals
# ------------------------------------------------------------
# STILLT MODELNÖFN
# ------------------------------------------------------------
ASR_MODEL = "palli23/whisper-small-sam_spjall"
DIAR_MODEL = "pyannote/speaker-diarization-3.1"
# ------------------------------------------------------------
# Aðalfallið – keyrir á ZeroGPU (120s GPU max)
# ------------------------------------------------------------
@spaces.GPU(duration=120)
def transcribe_with_diarization(audio_path):
    """Transcribe an audio file with per-speaker labels.

    Runs pyannote 3.1 speaker diarization, then Whisper ASR on each
    diarized segment, and returns one "[MÆLENDI <speaker>] <text>" line
    per segment. Executed on ZeroGPU with a 120 s GPU budget.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the uploaded audio (Gradio ``type="filepath"``).

    Returns
    -------
    str
        Newline-joined labelled transcript, or an Icelandic status message
        when no file / no speech was provided.
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá."

    # ----------------------------
    # 1. PYTORCH SAFE GLOBALS FIX
    # ----------------------------
    # PyTorch 2.6+ defaults torch.load to weights_only=True; allow-list the
    # classes pyannote checkpoints unpickle.
    # NOTE(review): torch.serialization.safe_globals documents callables or
    # (callable, str) tuples — the bare strings below may be rejected on
    # some torch versions; confirm against the deployed torch release.
    with safe_globals([
        torch.torch_version.TorchVersion,
        "pyannote.audio.core.task.Specifications",
        "pyannote.audio.core.model.Model",
        "pyannote.audio.pipelines.speaker_diarization.SpeakerDiarization",
    ]):
        # ----------------------------
        # 2. Load diarization pipeline
        # ----------------------------
        diarization = Pipeline.from_pretrained(
            DIAR_MODEL,
            token=os.getenv("HF_TOKEN"),  # pyannote 3.1 uses `token`, not `use_auth_token`
        ).to("cuda")
        # Run diarization on the whole file.
        diar = diarization(audio_path)

    # ----------------------------
    # 3. Whisper ASR
    # ----------------------------
    asr = pipeline(
        task="automatic-speech-recognition",
        model=ASR_MODEL,
        device=0,  # GPU 0 inside the ZeroGPU context
        token=os.getenv("HF_TOKEN"),
    )

    # ----------------------------
    # 4. Cut out each segment and transcribe it
    # ----------------------------
    final_output = []
    for turn, _, speaker in diar.itertracks(yield_label=True):
        # Write a temporary WAV for this speaker turn.
        # NOTE(review): `diar` is a pyannote Annotation; Annotation.crop()
        # returns another Annotation and has no `.export` — this looks like
        # pydub.AudioSegment (or pyannote.audio.Audio().crop) semantics.
        # Verify this line actually produces segment audio at runtime.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            diar.crop(audio_path, turn).export(tmp.name, format="wav")
            seg_path = tmp.name
        try:
            # ASR text for this segment.
            text = asr(seg_path)["text"].strip()
            final_output.append(f"[MÆLENDI {speaker}] {text}")
        finally:
            # Always remove the temp WAV, even if ASR raises —
            # the original only unlinked on the success path.
            os.unlink(seg_path)

    return "\n".join(final_output) if final_output else "Ekkert heyrt í hljóðinu."
# ------------------------------------------------------------
# GRADIO UI
# ------------------------------------------------------------
# Declarative Blocks layout: one audio upload, one output textbox, one
# button wired to transcribe_with_diarization.
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Íslenskt tal → texti + mælendagreining")
    gr.Markdown("Whisper-small + pyannote 3.1 • Virkar á ZeroGPU • 5 mín hljóð max")
    # type="filepath" hands the handler a path string, matching the
    # audio_path parameter of transcribe_with_diarization.
    audio_input = gr.Audio(type="filepath", label="Hladdu upp hljóðskrá (.wav / .mp3)")
    out_box = gr.Textbox(lines=30, label="Útskrift + mælendur")
    run_button = gr.Button("Transcribe með mælendum", variant="primary")
    run_button.click(transcribe_with_diarization, inputs=audio_input, outputs=out_box)

# Spaces auth
# NOTE(review): credentials are hardcoded in source — move them to Space
# secrets (e.g. os.getenv) before sharing this repo publicly.
demo.launch(auth=("beta", "beta2025"))