# stt-uzbek / app.py
# (Hugging Face Space metadata — author: UpCoder; commit: "Update app.py", 89b7da4 verified)
import gradio as gr
from transformers import pipeline
import torch
import librosa
import torch.nn.functional as F
# Load the engine: a wav2vec2 CTC model wrapped in the HF ASR pipeline.
# NOTE(review): facebook/wav2vec2-base-960h is an ENGLISH acoustic model,
# yet the Space is named "stt-uzbek" — confirm this is intentional.
print("Loading Strict Pronunciation Engine...")
# Loaded once at module import so every request reuses the same weights.
asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
def assess_pronunciation(audio_filepath, target_text):
    """Score a recorded utterance against the sentence the user was asked to say.

    Parameters
    ----------
    audio_filepath : str
        Path to the recorded audio clip (any format librosa can decode).
    target_text : str
        The sentence the speaker was supposed to pronounce.

    Returns
    -------
    dict
        On success: ``{"accuracy_score", "fluency_score",
        "completeness_score", "transcription"}`` (scores are ints 0-100).
        On failure: ``{"error": <message>}``.
    """
    # Guard clause: both inputs are required.
    if not audio_filepath or not target_text:
        return {"error": "Missing input"}
    try:
        # 1. Load the audio resampled to the 16 kHz rate wav2vec2 expects.
        audio, sr = librosa.load(audio_filepath, sr=16000)

        # 2. Strict scoring (confidence analysis): how confident is the
        # model about each audio frame?  Mean of the per-frame max
        # probability serves as an overall pronunciation-confidence proxy.
        with torch.no_grad():
            logits = asr_pipe.model(torch.tensor(audio).unsqueeze(0)).logits
            probs = F.softmax(logits, dim=-1)
            confidence = float(torch.mean(torch.max(probs, dim=-1).values))

        # 3. Transcription for user-facing feedback.
        transcription_result = asr_pipe(audio_filepath)
        said = transcription_result["text"].lower()

        # Difficulty curve: squaring the 0-1 confidence before scaling to
        # 0-100 penalizes mid-range confidence (mumbling / heavy accent)
        # harder than a linear mapping would.
        accuracy = round((confidence ** 2) * 100)

        # Fluency: characters per second, scaled and capped at 100.
        # max(duration, 1) avoids division blow-up on very short clips.
        duration = len(audio) / sr
        fluency = min(100, round((len(said) / max(duration, 1)) * 10))

        # BUG FIX: the original validated target_text as required but never
        # used it — "completeness" was a placeholder derived from accuracy.
        # Measure it properly: the fraction of target words that actually
        # appear in the transcription.
        target_words = target_text.lower().split()
        said_words = set(said.split())
        if target_words:
            hits = sum(1 for word in target_words if word in said_words)
            completeness = round((hits / len(target_words)) * 100)
        else:
            completeness = 0

        return {
            "accuracy_score": accuracy,
            "fluency_score": fluency,
            "completeness_score": completeness,
            "transcription": said
        }
    except Exception as e:
        # Surface the failure to the UI as JSON instead of crashing the Space.
        return {"error": str(e)}
# Gradio 3 Interface
# NOTE(review): gr.Audio(source="upload") is the Gradio 3 signature; in
# Gradio 4+ the parameter was renamed to sources=[...] — pin gradio<4 or
# update this call if the Space's SDK version changes.
interface = gr.Interface(
fn=assess_pronunciation,
inputs=[gr.Audio(source="upload", type="filepath"), gr.Textbox(label="Target Text")],
outputs=gr.JSON(),
)
# Launch the web UI only when run as a script (the Space's entry point);
# importing this module elsewhere only defines the interface.
if __name__ == "__main__":
    interface.launch()