"""Strict pronunciation assessment demo.

Loads a Wav2Vec2 CTC model once at import time and exposes a Gradio app
that scores an uploaded recording for accuracy (model confidence),
fluency (characters per second) and completeness (target-word coverage).
"""

import gradio as gr
import librosa
import torch
import torch.nn.functional as F
from transformers import pipeline

# Load the engine once at module import so every request reuses the model.
print("Loading Strict Pronunciation Engine...")
asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")


def assess_pronunciation(audio_filepath, target_text):
    """Score a recording of *target_text* being read aloud.

    Args:
        audio_filepath: Path to the uploaded audio file (any format librosa reads).
        target_text: The sentence the speaker was asked to say.

    Returns:
        dict with ``accuracy_score``, ``fluency_score``, ``completeness_score``
        (all 0-100 ints) and ``transcription``, or ``{"error": ...}`` on
        missing input / failure.
    """
    if not audio_filepath or not target_text:
        return {"error": "Missing input"}

    try:
        # 1. Load audio resampled to the 16 kHz rate the model was trained on.
        audio, sr = librosa.load(audio_filepath, sr=16000)

        # 2. Strict scoring (confidence analysis).
        # BUG FIX: the raw waveform must go through the pipeline's feature
        # extractor first — wav2vec2-base-960h expects zero-mean/unit-variance
        # normalized input. Feeding the raw tensor directly (as before)
        # mis-scales the logits and distorts the confidence score.
        inputs = asr_pipe.feature_extractor(
            audio, sampling_rate=sr, return_tensors="pt"
        )
        with torch.no_grad():
            logits = asr_pipe.model(inputs.input_values).logits
        probs = F.softmax(logits, dim=-1)
        # Average, over all frames, of the probability of the best token:
        # a proxy for how cleanly each sound was articulated.
        confidence = float(torch.mean(torch.max(probs, dim=-1).values))

        # 3. Transcription for user-facing feedback.
        transcription_result = asr_pipe(audio_filepath)
        said = transcription_result["text"].lower()

        # Strict logic: squaring the 0-1 confidence gives a difficulty curve
        # that penalizes thick accents or mumbling before scaling to 0-100.
        accuracy = round((confidence ** 2) * 100)

        # Fluency: characters per second, capped at 100.
        # max(duration, 1) guards against division blow-up on tiny clips.
        duration = len(audio) / sr
        fluency = min(100, round((len(said) / max(duration, 1)) * 10))

        # BUG FIX: completeness previously ignored target_text entirely and
        # was derived from accuracy. Now it measures what fraction of the
        # target's words actually appear in the transcription.
        target_words = target_text.lower().split()
        if target_words:
            said_words = set(said.split())
            hit = sum(1 for w in target_words if w in said_words)
            completeness = round(100 * hit / len(target_words))
        else:
            completeness = 0

        return {
            "accuracy_score": accuracy,
            "fluency_score": fluency,
            "completeness_score": completeness,
            "transcription": said,
        }
    except Exception as e:
        # Surface the failure to the UI rather than crashing the worker.
        return {"error": str(e)}


# Gradio 3 interface (``source=`` kwarg is the Gradio 3 API; Gradio 4
# renamed it to ``sources=[...]`` — update if the dependency is bumped).
interface = gr.Interface(
    fn=assess_pronunciation,
    inputs=[gr.Audio(source="upload", type="filepath"), gr.Textbox(label="Target Text")],
    outputs=gr.JSON(),
)

if __name__ == "__main__":
    interface.launch()