Spaces:

UpCoder
/

stt-uzbek

Runtime error

App Files Files Community

UpCoder committed on Mar 9

Commit

c57c8d4

verified ·

1 Parent(s): 69e5a85

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -26

app.py CHANGED Viewed

@@ -1,38 +1,51 @@
 import gradio as gr
 from transformers import pipeline
-print("Loading the Islomov STT model onto Hugging Face servers...")
-# Using OpenAI Whisper Base for much faster CPU processing
-stt_pipeline = pipeline(
-    "automatic-speech-recognition",
-    model="openai/whisper-base"
-)
-def transcribe_for_api(audio_filepath):
-    if audio_filepath is None:
-        return "Error: No audio file received."
     try:
-        # We FORCE the model to use Uzbek ('uz') and the 'transcribe' task.
-        # This prevents the Arabic/Persian script hallucinations.
-        result = stt_pipeline(
-            audio_filepath,
-            generate_kwargs={"language": "uz", "task": "transcribe"}
-        )
-        return result["text"].strip()
     except Exception as e:
-        return f"Error: {str(e)}"
-# Build the Gradio interface
-# We set type="filepath" so Gradio automatically saves the incoming API audio to a temporary file
 interface = gr.Interface(
-    fn=transcribe_for_api,
-    inputs=gr.Audio(type="filepath", label="Input Audio"),
-    outputs=gr.Textbox(label="Uzbek Transcription"),
-    title="b-til.uz STT API Engine",
-    description="This Space processes audio for the b-til.uz language platform."
 )
-# Launch the server and enable the API
 if __name__ == "__main__":
     interface.launch()

 import gradio as gr
+import torch
+import librosa
+import numpy as np
 from transformers import pipeline
+# Speech-recognition backend used by the scoring function below: a Wav2Vec2
+# checkpoint served through the transformers ASR pipeline. It only produces a
+# transcript; no phoneme-level or GOPT-style acoustic scoring happens here.
+# NOTE(review): this checkpoint appears to be English-only -- confirm that it
+# suits the language the Space is meant to assess.
+print("Loading Pronunciation Engine...")
+evaluator = pipeline(
+    task="automatic-speech-recognition",
+    model="facebook/wav2vec2-base-960h",
+)
+def _normalize_text(text):
+    """Lowercase *text*, drop punctuation, and collapse runs of whitespace.
+
+    Apostrophes are kept so contractions such as "don't" survive intact.
+    """
+    kept = "".join(
+        ch for ch in text.lower() if ch.isalnum() or ch.isspace() or ch == "'"
+    )
+    return " ".join(kept.split())
+
+
+def assess_pronunciation(audio_filepath, target_text):
+    """Score a recording against the text the speaker was asked to read.
+
+    The recording is transcribed with the module-level ``evaluator`` pipeline
+    and the transcript is compared with ``target_text`` as plain strings. No
+    acoustic or phoneme-level comparison is performed.
+
+    Args:
+        audio_filepath: Path to the recorded audio file, or ``None``.
+        target_text: The reference sentence the speaker should have said.
+
+    Returns:
+        On success, a dict with integer ``accuracy_score``, ``fluency_score``
+        and ``completeness_score`` (each 0-100) plus ``student_said`` (the
+        lowercased transcript). On any failure, a dict with a single
+        ``error`` key holding a message; exceptions are never propagated so
+        the API caller always receives JSON.
+    """
+    if audio_filepath is None or not target_text:
+        return {"error": "Missing audio or target text"}
     try:
+        from difflib import SequenceMatcher
+
+        # A target made only of whitespace/punctuation has nothing to score;
+        # left unchecked it would compare "" with "" and report 100% accuracy
+        # for a silent recording.
+        target_clean = _normalize_text(target_text)
+        if not target_clean:
+            return {"error": "Missing audio or target text"}
+        # 1. Transcribe the student's speech
+        result = evaluator(audio_filepath)
+        student_said = result["text"].lower()
+        # 2. Accuracy: character-level similarity between the normalized
+        # target and transcript. Both sides are normalized the same way so
+        # punctuation or spacing in the target does not cost points.
+        accuracy = SequenceMatcher(
+            None, target_clean, _normalize_text(student_said)
+        ).ratio() * 100
+        # 3. Fluency: speaking rate. sr=None keeps the file's native sample
+        # rate, which avoids a resampling pass that cannot change the duration.
+        audio, sr = librosa.load(audio_filepath, sr=None)
+        duration = librosa.get_duration(y=audio, sr=sr)
+        words_count = len(student_said.split())
+        # Words per second scaled by 20 and capped at 100 (5 words/s
+        # saturates). Zero-length audio scores 0 instead of dividing by zero.
+        fluency = min(100, (words_count / duration) * 20) if duration > 0 else 0
+        return {
+            "accuracy_score": round(accuracy),
+            "fluency_score": round(fluency),
+            "completeness_score": 100 if accuracy > 80 else round(accuracy + 5),
+            "student_said": student_said
+        }
     except Exception as e:
+        return {"error": str(e)}
+# Gradio API interface: an audio file and the target text go in, a JSON dict
+# of scores comes out.
+# `source=` is deliberately not passed to gr.Audio: Gradio 4 replaced it with
+# `sources=[...]` and rejects the old keyword, while "upload" is already the
+# Gradio 3 default, so omitting it works on both major versions.
 interface = gr.Interface(
+    fn=assess_pronunciation,
+    inputs=[
+        gr.Audio(type="filepath"),
+        gr.Textbox(label="Target Text"),
+    ],
+    outputs=gr.JSON(),
 )
 if __name__ == "__main__":
     interface.launch()