File size: 2,008 Bytes
8c5429a
fbe7334
c57c8d4
 
89b7da4
8c5429a
89b7da4
 
fbe7334
8c5429a
c57c8d4
fbe7334
 
 
89b7da4
8c5429a
89b7da4
051e28c
c57c8d4
89b7da4
 
051e28c
 
89b7da4
051e28c
89b7da4
 
 
 
 
 
 
 
 
 
 
 
 
 
c57c8d4
 
89b7da4
 
 
 
c57c8d4
8c5429a
c57c8d4
89b7da4
354dd60
8c5429a
c57c8d4
 
 
8c5429a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import gradio as gr
from transformers import pipeline
import torch
import librosa
import torch.nn.functional as F

# Load the engine
# Module-level load: the wav2vec2 ASR pipeline is created once at import
# time (downloads the checkpoint on first use) and is shared by every
# subsequent call to assess_pronunciation.
print("Loading Strict Pronunciation Engine...")
asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

def assess_pronunciation(audio_filepath, target_text):
    """Score a recording's pronunciation against a target sentence.

    Args:
        audio_filepath: Path to the recorded audio file.
        target_text: The sentence the speaker was asked to read.

    Returns:
        dict with ``accuracy_score``, ``fluency_score``,
        ``completeness_score`` (all 0-100 ints) and ``transcription``
        on success, or ``{"error": ...}`` on any failure.
    """
    if not audio_filepath or not target_text:
        return {"error": "Missing input"}

    try:
        # 1. Process Audio (resampled to the 16 kHz the checkpoint expects)
        audio, sr = librosa.load(audio_filepath, sr=16000)

        # 2. Strict Scoring (Confidence Analysis)
        # Run the waveform through the pipeline's own feature extractor so
        # it is padded/normalized exactly as the model was trained, instead
        # of wrapping the raw array in a tensor ourselves.
        inputs = asr_pipe.feature_extractor(
            audio, sampling_rate=sr, return_tensors="pt"
        )
        with torch.no_grad():
            logits = asr_pipe.model(inputs.input_values).logits

        probs = F.softmax(logits, dim=-1)
        # Mean of the per-frame max probability: how sure the model is
        # about each sound, averaged over the whole clip.
        confidence = float(torch.mean(torch.max(probs, dim=-1).values))

        # 3. Transcription for feedback
        transcription_result = asr_pipe(audio_filepath)
        said = transcription_result["text"].lower()

        # Strict Logic: squaring the 0-1 confidence gives a difficulty
        # curve that penalizes mumbling / thick accents harder.
        accuracy = round((confidence ** 2) * 100)

        # Fluency (characters per second, capped at 100). The 1-second
        # floor avoids blow-ups on very short clips.
        duration = len(audio) / sr
        fluency = min(100, round((len(said) / max(duration, 1)) * 10))

        # Completeness: fraction of target words actually heard in the
        # transcription. (Fix: target_text was previously validated but
        # never used — completeness was a hard-coded 100/80.)
        target_words = target_text.lower().split()
        said_words = set(said.split())
        if target_words:
            hits = sum(1 for word in target_words if word in said_words)
            completeness = round((hits / len(target_words)) * 100)
        else:
            completeness = 0

        return {
            "accuracy_score": accuracy,
            "fluency_score": fluency,
            "completeness_score": completeness,
            "transcription": said
        }
    except Exception as e:
        # Surface the failure to the UI as JSON instead of crashing the app.
        return {"error": str(e)}

# Gradio 3 Interface
# NOTE(review): gr.Audio(source="upload") is Gradio 3.x syntax; Gradio 4+
# renamed the kwarg to sources=["upload"] — confirm the pinned gradio version.
interface = gr.Interface(
    fn=assess_pronunciation,
    inputs=[gr.Audio(source="upload", type="filepath"), gr.Textbox(label="Target Text")],
    outputs=gr.JSON(),
)

# Launch the web UI only when executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()