Spaces:

Yilin0601
/

SpeechAccuracyClassification

Sleeping

App Files Community

Yilin0601 committed on Mar 21, 2025

Commit

227aa4c

verified ·

1 Parent(s): 20b8be9

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -11

app.py CHANGED Viewed

@@ -5,10 +5,10 @@ import librosa
 from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 # --------------------------------------------------
-# Load Your Fine-Tuned Model
 # --------------------------------------------------
 # This model was fine-tuned with labels remapped from [3..10] to [0..7].
-# Make sure the model repo name below is correct and accessible.
 model = Wav2Vec2ForSequenceClassification.from_pretrained(
     "Yilin0601/wav2vec2-fluency-checkpoints"
 )
@@ -22,11 +22,11 @@ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
 def predict(audio):
     if audio is None:
         return "No audio provided."
-    # Gradio provides audio as (sample_rate, np.array)
     sample_rate, audio_data = audio
-    # Ensure the audio is floating-point (librosa requires float32 or float64)
     if audio_data.dtype not in [np.float32, np.float64]:
         audio_data = audio_data.astype(np.float32)
@@ -34,11 +34,11 @@ def predict(audio):
     if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
         audio_data = np.mean(audio_data, axis=1)
-    # Resample to 16 kHz if needed
     if sample_rate != 16000:
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-    # Extract features
     inputs = feature_extractor(
         audio_data,
         sampling_rate=16000,
@@ -51,11 +51,11 @@ def predict(audio):
     with torch.no_grad():
         logits = model(**inputs).logits
-    # The model output is an 8-class prediction (0..7), corresponding to original labels 3..10
     pred_class = torch.argmax(logits, dim=-1).item()
     predicted_level = pred_class + 3  # Map back to [3..10]
-    return f"Predicted Level: {predicted_level}"
 # --------------------------------------------------
 # Gradio Interface
@@ -66,8 +66,10 @@ iface = gr.Interface(
     outputs="text",
     title="L2 English Fluency Predictor",
     description=(
-        "This demo uses a fine-tuned Wav2Vec2ForSequenceClassification model with labels for accuracy evaluation "
-        "mapped from 0 to 10. Record or upload audio to see the predicted level."
     ),
     allow_flagging="never"
 )

 from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 # --------------------------------------------------
+# Load Your Fine-Tuned Model for Fluency Prediction
 # --------------------------------------------------
 # This model was fine-tuned with labels remapped from [3..10] to [0..7].
+# Ensure that "Yilin0601/wav2vec2-fluency-checkpoints" is your correct repo.
 model = Wav2Vec2ForSequenceClassification.from_pretrained(
     "Yilin0601/wav2vec2-fluency-checkpoints"
 )
 def predict(audio):
     if audio is None:
         return "No audio provided."
+    # Gradio returns audio as (sample_rate, np.array)
     sample_rate, audio_data = audio
+    # Ensure audio is in floating-point (librosa requires float32 or float64)
     if audio_data.dtype not in [np.float32, np.float64]:
         audio_data = audio_data.astype(np.float32)
     if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
         audio_data = np.mean(audio_data, axis=1)
+    # Resample to 16 kHz if necessary
     if sample_rate != 16000:
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
+    # Extract features using the feature extractor
     inputs = feature_extractor(
         audio_data,
         sampling_rate=16000,
     with torch.no_grad():
         logits = model(**inputs).logits
+    # The model outputs an 8-class prediction (0..7), corresponding to original fluency scores [3..10]
     pred_class = torch.argmax(logits, dim=-1).item()
     predicted_level = pred_class + 3  # Map back to [3..10]
+    return f"Predicted Fluency Level: {predicted_level}"
 # --------------------------------------------------
 # Gradio Interface
     outputs="text",
     title="L2 English Fluency Predictor",
     description=(
+        "This demo uses a fine-tuned Wav2Vec2ForSequenceClassification model for fluency prediction. "
+        "The model was fine-tuned with fluency scores remapped from [3..10] to [0..7]. "
+        "Record or upload audio to see the predicted fluency level. "
+        "If the predicted level is always the same (e.g., 8), it might indicate that the model needs further fine-tuning or calibration."
     ),
     allow_flagging="never"
 )