"""Gradio demo that compares several ASR models on the same audio clip.

The user records or uploads audio, picks one or more preloaded models and
optionally supplies a reference transcript; each model's transcription is
returned, together with its word error rate (WER) when a reference is given.
"""

import os

# NOTE(review): presumably meant to keep transformers from pulling in
# TensorFlow; it is set before the transformers import so the variable is
# already in the environment when that package initialises -- confirm the
# variable name against the installed transformers version.
os.environ["TRANSFORMERS_NO_TF"] = "1"

from typing import Any, Dict, List, Optional

from transformers import pipeline
import gradio as gr
from evaluate import load

# Load WER metric
wer_metric = load("wer")

# Preload multiple ASR models for comparison (done once at import time so
# requests do not pay the model-loading cost).
models = {
    "Wav2Vec2": pipeline(
        task="automatic-speech-recognition",
        model="Devion333/wav2vec2-xls-r-300m-dv",
    ),
    "Whisper small": pipeline(
        task="automatic-speech-recognition",
        model="Devion333/whisper-small-dv-syn",
    ),
}


def transcribe(
    audio: Optional[str],
    chosen_models: Optional[List[str]],
    reference: Optional[str],
) -> Dict[str, Dict[str, Any]]:
    """Transcribe *audio* with every selected model and optionally score it.

    Args:
        audio: Path to the audio file handed over by the ``gr.Audio``
            component (``type="filepath"``); ``None`` when nothing was
            recorded or uploaded.
        chosen_models: Keys of ``models`` to run. ``None`` or an empty list
            yields an empty result.
        reference: Optional ground-truth transcript. When it contains
            non-whitespace text, a WER is computed for each model.

    Returns:
        A mapping ``model name -> {"prediction": str}``, with an additional
        ``"WER"`` entry (rounded to 3 decimals) when a reference was given.

    Raises:
        gr.Error: If no audio was provided.
        KeyError: If a name in *chosen_models* is not a key of ``models``.
    """
    if audio is None:
        # Without this guard the pipeline is called with None and fails with
        # an opaque error; gr.Error surfaces a readable message in the UI.
        raise gr.Error("Please upload or record an audio clip first.")

    # Normalise optional inputs so a missing value cannot raise.
    reference = reference or ""
    has_reference = bool(reference.strip())
    # Lower-case once instead of once per model.
    reference_lower = reference.lower() if has_reference else ""

    results: Dict[str, Dict[str, Any]] = {}
    for model_name in chosen_models or []:
        asr_pipe = models[model_name]
        prediction = asr_pipe(audio)["text"]

        entry: Dict[str, Any] = {"prediction": prediction}
        if has_reference:
            # Compare case-insensitively so casing differences do not count
            # as word errors.
            wer = wer_metric.compute(
                predictions=[prediction.lower()],
                references=[reference_lower],
            )
            entry["WER"] = round(wer, 3)
        results[model_name] = entry

    return results


demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Upload or Record Speech",
        ),
        gr.CheckboxGroup(
            choices=list(models.keys()),
            value=["Wav2Vec2"],
            label="Choose Models to Compare",
        ),
        gr.Textbox(label="Reference Transcript (optional)"),
    ],
    outputs=gr.JSON(label="Transcriptions & Statistics"),
    title="ASR Model Comparison",
    description=(
        "Upload or record audio, select ASR models, and compare their "
        "transcriptions. Optionally, provide a reference transcript to "
        "calculate WER."
    ),
)

if __name__ == "__main__":
    demo.launch()