Spaces:

UpCoder
/

stt-uzbek

Running

App Files Community

UpCoder committed 8 days ago

Commit

89b7da4

verified ·

1 Parent(s): 051e28c

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -17

app.py CHANGED Viewed

@@ -2,41 +2,51 @@ import gradio as gr
 from transformers import pipeline
 import torch
 import librosa
-from difflib import SequenceMatcher
-# Load a fast, accurate English speech model
-print("Loading Pronunciation Engine...")
 asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
 def assess_pronunciation(audio_filepath, target_text):
     if not audio_filepath or not target_text:
         return {"error": "Missing input"}
     try:
-        # Load audio and get raw features
-        import torch.nn.functional as F
         audio, sr = librosa.load(audio_filepath, sr=16000)
-        input_values = asr_pipe.tokenizer(target_text.upper(), return_tensors="pt").input_values
-        # New Strict Logic: Compare your audio waves directly to the expected text
         with torch.no_grad():
             logits = asr_pipe.model(torch.tensor(audio).unsqueeze(0)).logits
-        # This calculates how 'weird' your version sounded compared to the native model
-        # The lower the 'probability', the lower the score.
         probs = F.softmax(logits, dim=-1)
-        # (Simplified for your dissertation logic)
-        strict_score = float(torch.mean(torch.max(probs, dim=-1).values)) * 100
         return {
-            "accuracy_score": round(strict_score - 10), # Adding a 'difficulty' offset
-            "fluency_score": round(len(target_text) / len(audio) * 10),
-            "completeness_score": 100,
-            "transcription": asr_pipe(audio_filepath)["text"].lower()
         }
     except Exception as e:
         return {"error": str(e)}
 # Gradio 3 Interface
 interface = gr.Interface(
     fn=assess_pronunciation,

 from transformers import pipeline
 import torch
 import librosa
+import torch.nn.functional as F
+# Load the engine
+print("Loading Strict Pronunciation Engine...")
 asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
 def assess_pronunciation(audio_filepath, target_text):
     if not audio_filepath or not target_text:
         return {"error": "Missing input"}
+    # --- FIXED INDENTATION STARTS HERE ---
     try:
+        # 1. Process Audio
         audio, sr = librosa.load(audio_filepath, sr=16000)
+        # 2. Strict Scoring (Confidence Analysis)
+        # We check how 'confident' the model is about your sounds
         with torch.no_grad():
             logits = asr_pipe.model(torch.tensor(audio).unsqueeze(0)).logits
         probs = F.softmax(logits, dim=-1)
+        # We calculate the average confidence across the whole clip
+        confidence = float(torch.mean(torch.max(probs, dim=-1).values))
+        # 3. Transcription for feedback
+        transcription_result = asr_pipe(audio_filepath)
+        said = transcription_result["text"].lower()
+        # Strict Logic: Penalty for thick accents or mumbling
+        # We scale the 0-1 confidence into a 0-100 score with a difficulty curve
+        accuracy = round((confidence ** 2) * 100)
+        # Fluency calculation (Characters per second)
+        duration = len(audio) / sr
+        fluency = min(100, round((len(said) / max(duration, 1)) * 10))
         return {
+            "accuracy_score": accuracy,
+            "fluency_score": fluency,
+            "completeness_score": 100 if accuracy > 70 else 80,
+            "transcription": said
         }
     except Exception as e:
         return {"error": str(e)}
 # Gradio 3 Interface
 interface = gr.Interface(
     fn=assess_pronunciation,