import os

import gradio as gr
import torch
import whisperx

HF_TOKEN = os.getenv("HF_TOKEN")  # MUST be set in HF Spaces secrets
ASR_MODEL = "palli23/whisper-small-sam_spjall-ct2"
DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"
ALIGN_MODEL = "WAV2VEC2_ASR_LARGE_LV60K_960H"

# BUG FIX: whisperx has no `is_cuda_available()` helper — device detection
# comes from torch. Computed once; every model loads onto the same device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def load_models():
    """Load the ASR, alignment and diarization models once at startup.

    Returns:
        Tuple of (asr_pipeline, align_model, align_metadata, diar_pipeline).
    """
    print("Loading WhisperX ASR...")
    # BUG FIX: whisperx.load_model takes the model id and device positionally;
    # it has no `model_name=` or `hf_token=` keyword arguments (private-model
    # auth is handled by the HF_TOKEN env var / huggingface_hub login).
    asr = whisperx.load_model(
        ASR_MODEL,
        DEVICE,
        compute_type="int8",  # Safe for Spaces
    )

    print("Loading alignment model...")
    # BUG FIX: load_align_model requires a `device` argument and accepts no
    # `hf_token` kwarg.
    align_model, metadata = whisperx.load_align_model(
        language_code="is",
        device=DEVICE,
        model_name=ALIGN_MODEL,
    )

    print("Loading diarization model...")
    # BUG FIX: the pyannote auth kwarg is `use_auth_token` and it expects the
    # token string itself; `hf_token=` is not accepted and `use_auth_token=True`
    # passes no credential.
    diar = whisperx.DiarizationPipeline(
        model_name=DIARIZATION_MODEL,
        use_auth_token=HF_TOKEN,
        device=DEVICE,
    )

    return asr, align_model, metadata, diar


# Loaded once at import time so every Gradio request reuses the same models.
asr_model, align_model, align_metadata, diar_pipeline = load_models()


def transcribe(audio):
    """Transcribe an audio file and prefix each segment with its speaker label.

    Args:
        audio: Filesystem path from gr.Audio(type="filepath"), or None.

    Returns:
        One line per segment, formatted "[SPEAKER] text", or a notice when no
        audio was provided.
    """
    if audio is None:
        return "No audio provided."

    # BUG FIX: WhisperX pipelines operate on a 16 kHz waveform array, not a
    # file path — decode once and reuse for ASR, alignment and diarization.
    waveform = whisperx.load_audio(audio)

    print("Running ASR...")
    result = asr_model.transcribe(waveform)

    print("Running alignment...")
    # BUG FIX: the fifth positional parameter of whisperx.align is `device`,
    # not a language code ("is" was being passed where a device belongs; the
    # language is already baked into the alignment model metadata).
    aligned = whisperx.align(
        result["segments"],
        align_model,
        align_metadata,
        waveform,
        DEVICE,
    )

    print("Running diarization...")
    diarization = diar_pipeline(waveform)

    print("Assigning speaker labels...")
    final_result = whisperx.assign_word_speakers(diarization, aligned)

    # Build the output with join rather than quadratic string concatenation.
    lines = []
    for seg in final_result["segments"]:
        speaker = seg.get("speaker", "Unknown")
        lines.append(f"[{speaker}] {seg['text']}\n")
    return "".join(lines)


ui = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(label="Transcription + Speakers", lines=20),
    title="WhisperX Icelandic CT2 + Diarization",
    description="Uses your private CT2 Whisper Small model + alignment + pyannote diarization.",
)

if __name__ == "__main__":
    ui.launch()