"""Gradio app: strict pronunciation assessment backed by a wav2vec2 CTC model."""
import gradio as gr
import librosa
import torch
import torch.nn.functional as F
from transformers import pipeline

# Model checkpoint as a named constant instead of an inline literal.
MODEL_NAME = "facebook/wav2vec2-base-960h"

# Loaded once at import time so every request reuses the same weights.
print("Loading Strict Pronunciation Engine...")
asr_pipe = pipeline("automatic-speech-recognition", model=MODEL_NAME)
|
def _score_pronunciation(said, target, confidence, duration):
    """Turn raw model outputs into 0-100 scores. Pure function, unit-testable.

    Args:
        said: Lower-cased transcription produced by the model.
        target: The sentence the speaker was asked to read.
        confidence: Mean per-frame max softmax probability in [0, 1].
        duration: Recording length in seconds.

    Returns:
        (accuracy, fluency, completeness) as ints in [0, 100].
    """
    import difflib  # stdlib; local import keeps this block self-contained

    target = target.lower().strip()

    # Bug fix: the original ignored target_text entirely, so "accuracy" was
    # just squared model confidence. Blend confidence with string similarity
    # to the expected sentence instead.
    similarity = difflib.SequenceMatcher(None, said, target).ratio()
    accuracy = round(similarity * confidence * 100)

    # Characters per second, scaled; max() guards division for very short
    # clips, min() caps runaway values at 100.
    fluency = min(100, round((len(said) / max(duration, 1)) * 10))

    # Fraction of the expected word count actually produced (was a constant
    # 100/80 split that also never consulted the target).
    expected_words = len(target.split()) or 1
    completeness = min(100, round(100 * len(said.split()) / expected_words))
    return accuracy, fluency, completeness


def assess_pronunciation(audio_filepath, target_text):
    """Score a spoken recording against *target_text*.

    Args:
        audio_filepath: Path to an audio file readable by librosa.
        target_text: The sentence the speaker was asked to read.

    Returns:
        dict with "accuracy_score", "fluency_score", "completeness_score"
        and "transcription" keys, or {"error": ...} on bad input / failure.
    """
    if not audio_filepath or not target_text:
        return {"error": "Missing input"}

    try:
        # Resample to the 16 kHz rate wav2vec2-base-960h expects.
        audio, sr = librosa.load(audio_filepath, sr=16000)

        # Single forward pass through the underlying model. The original ran
        # inference twice (raw model call, then the pipeline again) and fed
        # the raw waveform without the feature extractor's normalization.
        inputs = asr_pipe.feature_extractor(
            audio, sampling_rate=sr, return_tensors="pt"
        )
        with torch.no_grad():
            logits = asr_pipe.model(inputs.input_values).logits

        probs = F.softmax(logits, dim=-1)
        # Mean of per-frame max probabilities: a rough confidence estimate.
        confidence = float(torch.mean(torch.max(probs, dim=-1).values))

        # Greedy CTC decode of the same logits for the transcription.
        pred_ids = torch.argmax(logits, dim=-1)
        said = asr_pipe.tokenizer.batch_decode(pred_ids)[0].lower().strip()

        accuracy, fluency, completeness = _score_pronunciation(
            said, target_text, confidence, len(audio) / sr
        )
        return {
            "accuracy_score": accuracy,
            "fluency_score": fluency,
            "completeness_score": completeness,
            "transcription": said,
        }
    except Exception as e:  # surface failures to the UI instead of crashing
        return {"error": str(e)}
| |
|
| | |
# Wire the scorer into a simple upload-and-score web UI.
# NOTE(review): gr.Audio(source=...) is Gradio 3.x API; 4.x renamed the
# kwarg to sources=["upload"] — confirm the pinned gradio version.
_audio_input = gr.Audio(source="upload", type="filepath")
_text_input = gr.Textbox(label="Target Text")

interface = gr.Interface(
    fn=assess_pronunciation,
    inputs=[_audio_input, _text_input],
    outputs=gr.JSON(),
)

if __name__ == "__main__":
    interface.launch()