# stt-uzbek / app.py
# (Hugging Face Space metadata — author: UpCoder; commit: "Update app.py", 89b7da4 verified)
import gradio as gr
from transformers import pipeline
import torch
import librosa
import torch.nn.functional as F
# Load the engine: a wav2vec2 CTC model wrapped in the HF ASR pipeline.
# NOTE(review): facebook/wav2vec2-base-960h is an ENGLISH acoustic model,
# yet the Space is named "stt-uzbek" — confirm this is intentional.
print("Loading Strict Pronunciation Engine...")
# Loaded once at module import so every request reuses the same weights.
asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
def assess_pronunciation(audio_filepath, target_text):
    """Score a recorded utterance against the sentence the user was asked to say.

    Parameters
    ----------
    audio_filepath : str
        Path to the recorded audio clip (any format librosa can decode).
    target_text : str
        The sentence the speaker was supposed to pronounce.

    Returns
    -------
    dict
        On success: ``{"accuracy_score", "fluency_score",
        "completeness_score", "transcription"}`` (scores are ints 0-100).
        On failure: ``{"error": <message>}``.
    """
    # Guard clause: both inputs are required.
    if not audio_filepath or not target_text:
        return {"error": "Missing input"}
    try:
        # 1. Load the audio resampled to the 16 kHz rate wav2vec2 expects.
        audio, sr = librosa.load(audio_filepath, sr=16000)

        # 2. Strict scoring (confidence analysis): how confident is the
        # model about each audio frame?  Mean of the per-frame max
        # probability serves as an overall pronunciation-confidence proxy.
        with torch.no_grad():
            logits = asr_pipe.model(torch.tensor(audio).unsqueeze(0)).logits
            probs = F.softmax(logits, dim=-1)
            confidence = float(torch.mean(torch.max(probs, dim=-1).values))

        # 3. Transcription for user-facing feedback.
        transcription_result = asr_pipe(audio_filepath)
        said = transcription_result["text"].lower()

        # Difficulty curve: squaring the 0-1 confidence before scaling to
        # 0-100 penalizes mid-range confidence (mumbling / heavy accent)
        # harder than a linear mapping would.
        accuracy = round((confidence ** 2) * 100)

        # Fluency: characters per second, scaled and capped at 100.
        # max(duration, 1) avoids division blow-up on very short clips.
        duration = len(audio) / sr
        fluency = min(100, round((len(said) / max(duration, 1)) * 10))

        # BUG FIX: the original validated target_text as required but never
        # used it — "completeness" was a placeholder derived from accuracy.
        # Measure it properly: the fraction of target words that actually
        # appear in the transcription.
        target_words = target_text.lower().split()
        said_words = set(said.split())
        if target_words:
            hits = sum(1 for word in target_words if word in said_words)
            completeness = round((hits / len(target_words)) * 100)
        else:
            completeness = 0

        return {
            "accuracy_score": accuracy,
            "fluency_score": fluency,
            "completeness_score": completeness,
            "transcription": said
        }
    except Exception as e:
        # Surface the failure to the UI as JSON instead of crashing the Space.
        return {"error": str(e)}
# Gradio 3 Interface
# NOTE(review): gr.Audio(source="upload") is the Gradio 3 signature; in
# Gradio 4+ the parameter was renamed to sources=[...] — pin gradio<4 or
# update this call if the Space's SDK version changes.
interface = gr.Interface(
fn=assess_pronunciation,
inputs=[gr.Audio(source="upload", type="filepath"), gr.Textbox(label="Target Text")],
outputs=gr.JSON(),
)
# Launch the web UI only when run as a script (the Space's entry point);
# importing this module elsewhere only defines the interface.
if __name__ == "__main__":
    interface.launch()