# Source: Hugging Face Space by palli23
# Commit 1155b96 (verified): "Update app.py"
import os
import gradio as gr
import whisperx
# Hugging Face access token, read from the environment (None when unset).
# MUST be set in HF Spaces secrets.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Identifiers of the three models loaded by load_models().
ASR_MODEL = "palli23/whisper-small-sam_spjall-ct2"      # speech-to-text model
DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"  # speaker diarization
ALIGN_MODEL = "WAV2VEC2_ASR_LARGE_LV60K_960H"           # alignment model
def load_models():
    """Load the ASR, alignment and diarization models used by the app.

    All three models are placed on the same device: ``"cuda"`` when PyTorch
    reports a usable GPU, otherwise ``"cpu"``.

    Returns:
        A 4-tuple ``(asr, align_model, metadata, diar)``: the WhisperX ASR
        pipeline, the alignment model with its metadata, and the diarization
        pipeline.
    """
    # Function-scope import: torch is only needed here, for device detection
    # (whisperx itself exposes no CUDA-availability helper).
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"

    if not HF_TOKEN:
        # Do not abort here; let the model loaders report the actual failure.
        print("Warning: HF_TOKEN is not set; private/gated models may fail to load.")

    print("Loading WhisperX ASR...")
    # load_model takes the model id and device positionally and has no
    # `hf_token` argument. NOTE(review): the Hugging Face hub client is
    # expected to pick up the HF_TOKEN environment variable for the private
    # repository on its own -- confirm on the deployed Space.
    asr = whisperx.load_model(
        ASR_MODEL,
        device,
        compute_type="int8",  # Safe for Spaces
    )

    print("Loading alignment model...")
    # `device` is a required argument of load_align_model.
    # NOTE(review): ALIGN_MODEL looks like an English wav2vec2 bundle while
    # language_code is "is" -- confirm this pairing is intended.
    align_model, metadata = whisperx.load_align_model(
        language_code="is",
        device=device,
        model_name=ALIGN_MODEL,
    )

    print("Loading diarization model...")
    # The token itself (not `True`) goes into `use_auth_token`.
    diar = whisperx.DiarizationPipeline(
        model_name=DIARIZATION_MODEL,
        use_auth_token=HF_TOKEN,
        device=device,
    )

    return asr, align_model, metadata, diar
# Load every model once at import time so that each call to transcribe()
# reuses the same instances.
(
    asr_model,
    align_model,
    align_metadata,
    diar_pipeline,
) = load_models()
def transcribe(audio):
    """Transcribe an audio file and label each segment with its speaker.

    Runs ASR, alignment, diarization and speaker assignment in that order.

    Args:
        audio: Path to the audio file (the UI passes a file path), or
            ``None`` when nothing was provided.

    Returns:
        One line per segment in the form ``"[<speaker>] <text>\\n"``; segments
        without a speaker label use ``"Unknown"``. Returns
        ``"No audio provided."`` when ``audio`` is ``None``.
    """
    if audio is None:
        return "No audio provided."

    # Decode the file once and share the waveform between all stages instead
    # of letting each stage decode the same file again.
    waveform = whisperx.load_audio(audio)

    print("Running ASR...")
    result = asr_model.transcribe(waveform)

    print("Running alignment...")
    # The fifth positional argument of whisperx.align is the torch device,
    # not the language code. Use the device the alignment model lives on so
    # inputs and weights always agree.
    device = next(align_model.parameters()).device
    aligned = whisperx.align(
        result["segments"],
        align_model,
        align_metadata,
        waveform,
        device,
    )

    print("Running diarization...")
    diarization = diar_pipeline(waveform)

    print("Assigning speaker labels...")
    final_result = whisperx.assign_word_speakers(diarization, aligned)

    # Same output as the original += loop, built in a single pass.
    return "".join(
        f"[{seg.get('speaker', 'Unknown')}] {seg['text']}\n"
        for seg in final_result["segments"]
    )
# Gradio front end: a single audio input, handed to transcribe() as a file
# path, and a multi-line text box for the speaker-labelled transcript.
ui = gr.Interface(
    title="WhisperX Icelandic CT2 + Diarization",
    description="Uses your private CT2 Whisper Small model + alignment + pyannote diarization.",
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(lines=20, label="Transcription + Speakers"),
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    ui.launch()