Spaces:

UpCoder
/

stt-uzbek

Runtime error

App Files Community

UpCoder committed about 1 month ago

Commit

fbe7334

verified ·

1 Parent(s): 62ede0e

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -20

app.py CHANGED Viewed

@@ -1,46 +1,44 @@
 import gradio as gr
 import torch
 import librosa
-import numpy as np
-from transformers import pipeline
-# Load a lightweight pronunciation assessment model (based on Wav2Vec2/GOPT)
-# This model is designed for CPU speed and phoneme-level accuracy
 print("Loading Pronunciation Engine...")
-evaluator = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
 def assess_pronunciation(audio_filepath, target_text):
-    if audio_filepath is None or not target_text:
-        return {"error": "Missing audio or target text"}
     try:
-        # 1. Transcribe the student's speech
-        result = evaluator(audio_filepath)
-        student_said = result["text"].lower()
         target_clean = target_text.lower().strip()
-        # 2. Basic Scoring Logic (Goodness of Pronunciation)
-        # In a production GOPT model, this compares acoustic features.
-        # Here we use a high-accuracy string similarity for immediate results.
-        from difflib import SequenceMatcher
         accuracy = SequenceMatcher(None, target_clean, student_said).ratio() * 100
-        # Fluency is estimated based on the length/pace of the audio
         audio, sr = librosa.load(audio_filepath)
         duration = librosa.get_duration(y=audio, sr=sr)
         words_count = len(student_said.split())
-        fluency = min(100, (words_count / duration) * 20) # Simple WPM heuristic
         return {
             "accuracy_score": round(accuracy),
             "fluency_score": round(fluency),
-            "completeness_score": 100 if accuracy > 80 else round(accuracy + 5),
-            "student_said": student_said
         }
     except Exception as e:
         return {"error": str(e)}
-# Gradio 3 API Interface
 interface = gr.Interface(
     fn=assess_pronunciation,
     inputs=[gr.Audio(source="upload", type="filepath"), gr.Textbox(label="Target Text")],

 import gradio as gr
+from transformers import pipeline
 import torch
 import librosa
+from difflib import SequenceMatcher
+# Load a fast, accurate English speech model
 print("Loading Pronunciation Engine...")
+asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
 def assess_pronunciation(audio_filepath, target_text):
+    if not audio_filepath or not target_text:
+        return {"error": "Missing input"}
     try:
+        # 1. Transcribe the audio
+        result = asr_pipe(audio_filepath)
+        student_said = result["text"].lower().strip()
         target_clean = target_text.lower().strip()
+        # 2. Calculate Accuracy (String similarity)
+        # This acts as a 'Goodness of Pronunciation' proxy
         accuracy = SequenceMatcher(None, target_clean, student_said).ratio() * 100
+        # 3. Calculate Fluency (Words per second)
         audio, sr = librosa.load(audio_filepath)
         duration = librosa.get_duration(y=audio, sr=sr)
         words_count = len(student_said.split())
+        # Heuristic: 120 WPM is native fluency
+        fluency = min(100, (words_count / max(duration, 1)) * 40)
         return {
             "accuracy_score": round(accuracy),
             "fluency_score": round(fluency),
+            "completeness_score": 100 if accuracy > 75 else round(accuracy + 10),
+            "transcription": student_said
         }
     except Exception as e:
         return {"error": str(e)}
+# Interface set to Gradio 3 standards for simple API calls
 interface = gr.Interface(
     fn=assess_pronunciation,
     inputs=[gr.Audio(source="upload", type="filepath"), gr.Textbox(label="Target Text")],