Spaces:

Yilin0601
/

SpeechAccuracyClassification

Sleeping

Yilin0601 committed on Mar 21, 2025

Commit

3e63959

verified ·

1 Parent(s): af74093

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,13 +5,10 @@ import librosa
 from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 # --------------------------------------------------
-# Configuration
 # --------------------------------------------------
-# Your fine-tuned model has 8 classes, corresponding to levels 3..10
-num_labels = 8
-# Load your fine-tuned model from the Hugging Face Hub
-# (Replace "Yilin0601/wav2vec2-accuracy-checkpoints" with your actual repo if different)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(
     "Yilin0601/wav2vec2-accuracy-checkpoints"
 )
@@ -29,11 +26,15 @@ def predict(audio):
     # Gradio provides audio as (sample_rate, np.array)
     sample_rate, audio_data = audio
     # Convert stereo to mono if needed
     if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
         audio_data = np.mean(audio_data, axis=1)
-    # Resample to 16 kHz if not already
     if sample_rate != 16000:
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
@@ -50,11 +51,9 @@ def predict(audio):
     with torch.no_grad():
         logits = model(**inputs).logits
-    # Argmax over logits -> integer class in [0..7]
     pred_class = torch.argmax(logits, dim=-1).item()
-    # Map [0..7] back to levels [3..10] by adding 3
-    predicted_level = pred_class + 3
     return f"Predicted Level: {predicted_level}"

 from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 # --------------------------------------------------
+# Load Your Fine-Tuned Model
 # --------------------------------------------------
+# This model was fine-tuned with labels remapped from [3..10] to [0..7].
+# Make sure the model repo name below is correct and accessible.
 model = Wav2Vec2ForSequenceClassification.from_pretrained(
     "Yilin0601/wav2vec2-accuracy-checkpoints"
 )
     # Gradio provides audio as (sample_rate, np.array)
     sample_rate, audio_data = audio
+    # Ensure the audio is floating-point (librosa requires float32 or float64)
+    if audio_data.dtype not in [np.float32, np.float64]:
+        audio_data = audio_data.astype(np.float32)
     # Convert stereo to mono if needed
     if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
         audio_data = np.mean(audio_data, axis=1)
+    # Resample to 16 kHz if needed
     if sample_rate != 16000:
         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
     with torch.no_grad():
         logits = model(**inputs).logits
+    # The model output is an 8-class prediction (0..7), corresponding to original labels 3..10
     pred_class = torch.argmax(logits, dim=-1).item()
+    predicted_level = pred_class + 3  # Map back to [3..10]
     return f"Predicted Level: {predicted_level}"