Update app.py
app.py CHANGED
@@ -4,61 +4,69 @@ import numpy as np
 from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 import librosa
 
-#
-# Configuration
-#
-#
-
-num_labels = 8
+# --------------------------------------------------
+# Configuration
+# --------------------------------------------------
+# We have 3 classes: 0 = "low", 1 = "medium", 2 = "high"
+num_labels = 3
 
-#
-#
-model = Wav2Vec2ForSequenceClassification.from_pretrained(
+# Load a base Wav2Vec2 model for classification with 3 labels.
+# The classification head will be randomly initialized.
+model = Wav2Vec2ForSequenceClassification.from_pretrained(
+    "facebook/wav2vec2-base-960h",
+    num_labels=num_labels
+)
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
 
-#
+# Map integer predictions to textual labels
+label_map = {0: "low", 1: "medium", 2: "high"}
+
+# --------------------------------------------------
 # Prediction Function
-#
+# --------------------------------------------------
 def predict(audio):
     if audio is None:
         return "No audio provided."
-
-    # Gradio
+
+    # Gradio provides audio as (sample_rate, np.array)
     sample_rate, audio_data = audio
 
-    #
+    # Convert stereo to mono if needed
+    if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
+        audio_data = np.mean(audio_data, axis=1)
+
+    # Resample to 16 kHz if not already
     if sample_rate != 16000:
-        audio_data = librosa.resample(
+        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
 
-    #
+    # Extract features
     inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
 
-    #
+    # Model inference
     model.eval()
     with torch.no_grad():
         logits = model(**inputs).logits
 
-    #
+    # Argmax over logits -> integer class
     pred_class = torch.argmax(logits, dim=-1).item()
 
-    #
-
-    predicted_level = pred_class + 3
+    # Convert integer class to textual label
+    predicted_label = label_map.get(pred_class, "Unknown")
 
-
-    return f"Predicted L2 English Accuracy Level: {predicted_level}"
+    return f"Predicted Level: {predicted_label}"
 
-#
-# Gradio Interface
-#
+# --------------------------------------------------
+# Gradio Interface
+# --------------------------------------------------
 iface = gr.Interface(
     fn=predict,
     inputs=gr.Audio(type="numpy", label="Record or Upload Audio"),
     outputs="text",
-    title="
+    title="3-Class Audio Classification Demo (Random)",
     description=(
-        "This demo uses Wav2Vec2ForSequenceClassification
-        "
+        "This demo uses Wav2Vec2ForSequenceClassification with 3 classes (low, medium, high) "
+        "but has not been fine-tuned, so the classification head is random. The predictions "
+        "are not meaningful, but the pipeline demonstrates how a 3-class audio classifier can be set up."
     ),
     allow_flagging="never"
 )
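
The hunk covers only lines 4-72 of app.py, so the surrounding lines are not part of this change. A minimal sketch of what they presumably look like: the top-level imports are implied by gr.Interface, torch.no_grad, np.mean, and the hunk header's "import numpy as np" context, while the trailing iface.launch() call is an assumption, since a Gradio Space script normally ends by launching the interface.

# Assumed lines 1-3 of app.py (line 3 is confirmed by the hunk header "import numpy as np"):
import gradio as gr
import torch
import numpy as np

# ... changed section shown in the diff above ...

# Assumed final line of app.py: start the Gradio app.
iface.launch()

With those pieces in place, predict can also be exercised without the UI, e.g. predict((16000, np.zeros(16000, dtype=np.float32))) should return one of the three labels, although with the randomly initialized classification head the choice is arbitrary.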