creativepurus committed on
Commit
b804c93
·
1 Parent(s): 7785622

Updated Model Path

Browse files
Files changed (2) hide show
  1. app.py +50 -92
  2. requirements.txt +5 -14
app.py CHANGED
@@ -1,106 +1,64 @@
1
- # ------------------- Type "python app.py" in TERMINAL to Run the App -------------------
2
-
3
- import torch
4
- import torchaudio
5
- import gradio as gr
6
  from transformers import Wav2Vec2Processor, Wav2Vec2Model
7
  from safetensors.torch import load_file
8
- import torch.nn as nn
9
- import torch.nn.functional as F
10
-
11
- # ------------------- Label Mapping -------------------
12
 
13
- id2label = {
14
- 0: "Canadian English",
15
- 1: "England English"
16
- }
17
 
18
- # ------------------- Load Processor -------------------
 
19
 
20
- processor = Wav2Vec2Processor.from_pretrained("creativepurus/accent-wav2vec2")
 
 
 
 
 
 
21
 
22
- # ------------------- Define Model -------------------
 
 
23
 
24
- class Wav2Vec2Classifier(nn.Module):
25
- def __init__(self, num_labels):
26
- super(Wav2Vec2Classifier, self).__init__()
27
- self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
28
- self.dropout = nn.Dropout(0.2)
29
- self.classifier = nn.Linear(self.wav2vec2.config.hidden_size, num_labels)
30
 
31
  def forward(self, input_values):
32
- outputs = self.wav2vec2(input_values)
33
- hidden_states = outputs.last_hidden_state
34
- pooled_output = hidden_states.mean(dim=1)
35
- logits = self.classifier(self.dropout(pooled_output))
36
  return logits
37
 
38
- # ------------------- Load Weights -------------------
39
-
40
- model = Wav2Vec2Classifier(num_labels=2)
41
- state_dict = load_file("model.safetensors", device="cpu") # assuming in root dir
42
- model.load_state_dict(state_dict)
43
  model.eval()
44
 
45
- # ------------------- Prediction Function -------------------
46
-
47
  def predict(audio_path):
48
- # Load & preprocess audio
49
- speech_array, sr = torchaudio.load(audio_path)
50
- if sr != 16000:
51
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
52
- speech_array = resampler(speech_array)
53
-
54
- inputs = processor(
55
- speech_array.squeeze().numpy(),
56
- sampling_rate=16000,
57
- return_tensors="pt",
58
- padding="max_length",
59
- truncation=True,
60
- max_length=16000 * 4
61
- )
62
-
63
- with torch.no_grad():
64
- logits = model(inputs.input_values)
65
- probs = torch.nn.functional.softmax(logits, dim=-1)
66
- pred_id = torch.argmax(probs, dim=-1).item()
67
-
68
- return id2label[pred_id]
69
-
70
- # ------------------- Gradio UI with Dark Theme -------------------
71
-
72
- with gr.Blocks(
73
- theme=gr.themes.Monochrome(primary_hue="blue", secondary_hue="purple", neutral_hue="slate"),
74
- css="""
75
- body { background-color: #1E1E2F !important; color: #E0E0E0 !important; }
76
- .gr-button { background-color: #3B82F6 !important; color: white !important; font-weight: bold; }
77
- .gr-textbox { font-size: 18px; }
78
- .gr-audio label { color: white !important; }
79
- """
80
- ) as demo:
81
- gr.Markdown(
82
- """
83
- <h1 style="text-align: center; color: #00FFFF;">🌍 Accent Classifier using Wav2Vec2</h1>
84
- <p style="text-align: center; font-size: 16px;">Upload or record a 4-second <b>English voice clip</b><br>
85
- This AI model detects whether your accent is <span style='color: #3B82F6; font-weight: bold;'>Canadian</span> or <span style='color: #FF4C4C; font-weight: bold;'>British</span>.</p>
86
- <br>
87
- """
88
- )
89
-
90
- with gr.Row():
91
- with gr.Column(scale=1):
92
- audio_input = gr.Audio(type="filepath", label="🎧 Upload or Record English Voice")
93
- submit_btn = gr.Button("πŸ” Detect Accent")
94
-
95
- with gr.Column(scale=1):
96
- label_output = gr.Text(label="πŸ—£οΈ Predicted Accent")
97
-
98
- submit_btn.click(fn=predict, inputs=audio_input, outputs=label_output)
99
-
100
- gr.Markdown("---")
101
- gr.Markdown(
102
- "<p style='text-align: center;'>πŸ‘¨β€πŸ’» Created by <a href='https://github.com/creativepurus' target='_blank' style='color:#66CFFF;'>Anand Purushottam</a> | <a href='https://www.linkedin.com/in/creativepurus/' target='_blank' style='color:#66CFFF;'>LinkedIn</a></p>"
103
- )
104
-
105
- if __name__ == "__main__":
106
- demo.launch()
 
 
 
 
 
 
1
  from transformers import Wav2Vec2Processor, Wav2Vec2Model
2
  from safetensors.torch import load_file
3
+ import torch
4
+ import gradio as gr
5
+ import torchaudio
 
6
 
7
# Load the processor (feature extractor + tokenizer) for the fine-tuned
# accent model from the Hugging Face Model Hub (cached after first download).
processor = Wav2Vec2Processor.from_pretrained("creativepurus/accent-wav2vec2")
 
 
9
 
10
# Backbone: pretrained wav2vec2-large encoder (1024-dim hidden states).
# Used below as a frozen feature extractor — AccentClassifier.forward wraps
# it in torch.no_grad().
base_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
12
 
13
# Classification head: average-pool the encoder features over time, then
# project the 1024-dim wav2vec2-large representation down to 2 accent logits.
# Layer order/indices must stay fixed — the safetensors checkpoint is loaded
# into this Sequential by positional key ("0.weight", "3.bias", ...).
classifier_head = torch.nn.Sequential(
    torch.nn.AdaptiveAvgPool1d(output_size=1),
    torch.nn.Flatten(),
    torch.nn.Dropout(p=0.1),
    torch.nn.Linear(in_features=1024, out_features=2),
)
20
 
21
# Load fine-tuned classifier weights from the repo root.
# NOTE(review): load_state_dict is strict by default, so this assumes
# model.safetensors contains ONLY the Sequential head's parameters
# ("0.weight", "3.weight", "3.bias", ...). A full-model checkpoint would
# raise on unexpected keys — confirm what the training script saved.
state_dict = load_file("model.safetensors", device="cpu")
classifier_head.load_state_dict(state_dict)
24
 
25
# Glue module: frozen wav2vec2 encoder followed by the pooled classifier head.
class AccentClassifier(torch.nn.Module):
    """Wraps a feature-extractor backbone and a classifier head.

    The backbone is treated as frozen: the whole forward pass runs under
    ``torch.no_grad()``, so this module is inference-only as written.
    """

    def __init__(self, base, head):
        super().__init__()
        self.base = base
        self.head = head

    def forward(self, input_values):
        with torch.no_grad():
            # (batch, time, hidden) features from the encoder.
            hidden = self.base(input_values).last_hidden_state
            # Head expects channels-first (batch, hidden, time) for pooling.
            logits = self.head(hidden.transpose(1, 2))
        return logits
37
 
38
# Assemble the full pipeline and switch to inference mode
# (eval() disables the Dropout layer in the classifier head).
model = AccentClassifier(base_model, classifier_head)
model.eval()
40
 
41
# Inference function
def predict(audio_path):
    """Classify the accent spoken in an audio file.

    Args:
        audio_path: Path to an audio file readable by torchaudio.

    Returns:
        Dict mapping each accent label to its softmax probability.
    """
    # Load and resample to the 16 kHz rate wav2vec2 was trained on.
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # NOTE(review): .squeeze() assumes a mono clip; a stereo file keeps its
    # channel dimension and may confuse the processor — confirm upstream.
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000,
                       return_tensors="pt", padding=True)

    # Inference only: disable autograd here so the head/softmax don't build
    # a graph (the model's forward already guards the backbone internally).
    with torch.no_grad():
        logits = model(inputs.input_values)
        probs = torch.nn.functional.softmax(logits, dim=1)

    labels = ["Canadian English", "England English"]
    return {labels[i]: float(probs[0][i]) for i in range(2)}
55
+
56
# Gradio Interface: upload an audio clip, see both accent probabilities.
interface = gr.Interface(
    fn=predict,
    # Gradio 4+ renamed the `source=` keyword to `sources=[...]`; the old
    # spelling raises TypeError on current releases, and requirements.txt
    # leaves gradio unpinned (latest gets installed).
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Label(num_top_classes=2),
    title="Accent Classification with Wav2Vec2-Large",
)

interface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,14 +1,5 @@
1
- fastapi==0.116.1
2
- gradio==5.38.2
3
- torch==2.5.1
4
- torchaudio==2.5.1
5
- transformers==4.41.2
6
- datasets==4.0.0
7
- huggingface-hub==0.34.1
8
- safetensors==0.5.3
9
- librosa==0.11.0
10
- soundfile==0.13.1
11
- pandas==2.3.1
12
- numpy==1.26.4
13
- scikit-learn==1.7.0
14
- uvicorn==0.35.0
 
1
+ torch
2
+ transformers
3
+ safetensors
4
+ torchaudio
5
+ gradio