Moremoholo2 committed on
Commit 4a8263d · verified · 1 Parent(s): 3fe421c

Upload 3 files

Files changed (3)
  1. app.py +130 -0
  2. audio_cnn_model.pth +3 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,126 @@
+ import gradio as gr
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torchaudio
+ import os
+
+ # Import the model architecture from model.py
+ # Make sure model.py exists and contains the AudioCNN class definition
+ try:
+     from model import AudioCNN
+ except ImportError:
+     print("Error: model.py not found. Please ensure the model architecture is defined in model.py.")
+
+     # Define a placeholder class to avoid a NameError if model.py is missing
+     class AudioCNN(nn.Module):
+         def __init__(self, num_classes, input_shape):
+             super(AudioCNN, self).__init__()
+             print("Placeholder AudioCNN model used because model.py was not found.")
+             self.dummy_layer = nn.Linear(1, 1)
+
+         def forward(self, x):
+             return self.dummy_layer(x.view(x.size(0), -1)[:, :1])
+
+
+ # Define the input shape and number of classes used during training.
+ # These values must match what was used in the training step.
+ num_classes = 2
+ # Assuming training used MFCC features of shape (batch_size, 1, 40, 156);
+ # adjust input_shape if your training data used different dimensions.
+ input_shape = (1, 40, 156)  # (channels, height, width)
+
+ # Instantiate the model and load the trained state dictionary
+ model = AudioCNN(num_classes, input_shape)
+ model_save_path = "audio_cnn_model.pth"
+
+ # Check that the model file exists before loading
+ if os.path.exists(model_save_path):
+     try:
+         model.load_state_dict(torch.load(model_save_path, map_location=torch.device('cpu')))  # Map to CPU
+         model.eval()  # Set the model to evaluation mode
+         print(f"Model state dictionary loaded from {model_save_path}")
+     except Exception as e:
+         print(f"Error loading model state dictionary: {e}")
+         model = None
+ else:
+     print(f"Error: Model state dictionary not found at {model_save_path}. Please ensure the trained model is saved and available.")
+     model = None  # Set model to None if loading fails
+
+
+ # Define the prediction function
+ def predict_audio(audio_file_path):
+     if model is None:
+         return "Model not loaded. Cannot make predictions."
+     if audio_file_path is None:
+         return "No audio file provided."
+
+     try:
+         # Load the audio file; torchaudio returns (channels, time)
+         waveform, sample_rate = torchaudio.load(audio_file_path)
+
+         # Downmix to mono (single channel) if necessary
+         if waveform.shape[0] > 1:
+             waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+         # Define the MFCC transform (using the same parameters as training).
+         # Note: this uses the file's own sample rate; if training used a
+         # fixed rate, resample the waveform to that rate first.
+         mfcc_transform = torchaudio.transforms.MFCC(
+             sample_rate=sample_rate,
+             n_mfcc=40
+         )
+
+         # Apply the transform: (1, time) -> (1, 40, sequence_length)
+         mfcc_features = mfcc_transform(waveform)
+
+         # Add a channel dimension so the CNN sees (batch_size, 1, 40, sequence_length)
+         mfcc_features = mfcc_features.unsqueeze(1)
+
+         # Pad or truncate the features to match the expected input width
+         target_width = input_shape[2]
+         if mfcc_features.shape[3] < target_width:
+             # Pad with zeros on the right
+             padding = target_width - mfcc_features.shape[3]
+             mfcc_features = F.pad(mfcc_features, (0, padding))
+         elif mfcc_features.shape[3] > target_width:
+             # Truncate
+             mfcc_features = mfcc_features[:, :, :, :target_width]
+
+         # Run inference
+         with torch.no_grad():
+             outputs = model(mfcc_features)
+
+         # Get the predicted class index
+         _, predicted_index = torch.max(outputs, 1)
+         predicted_index = predicted_index.item()
+
+         # Interpret the prediction. Labels were 0 and 1 during training;
+         # replace this mapping with whatever 0 and 1 mean in your dataset.
+         label_map = {0: 'English', 1: 'Code-switched'}
+         predicted_label = label_map.get(predicted_index, "Unknown")
+
+         return predicted_label
+
+     except Exception as e:
+         return f"Error during prediction: {e}"
+
+
+ # Create the Gradio interface
+ if model is not None:
+     interface = gr.Interface(
+         fn=predict_audio,
+         inputs=gr.Audio(type="filepath"),
+         outputs=gr.Label(),
+         title="Audio Code-Switching Detector",
+         description="Upload an audio file to detect if it contains code-switching.",
+     )
+
+     # Launch the interface; Hugging Face Spaces runs app.py directly,
+     # so the __main__ guard keeps imports of this module side-effect free.
+     if __name__ == "__main__":
+         interface.launch()
+ else:
+     print("Gradio interface not created due to model loading error.")
+
+ # This script is saved as app.py for Hugging Face Spaces deployment
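Note: app.py imports AudioCNN from model.py, but model.py is not among the three uploaded files, so the placeholder branch above will run on Spaces. A minimal sketch of what model.py might contain, assuming two small conv/pool blocks over the (1, 40, 156) MFCC input; the layer sizes here are illustrative, and the state dict in audio_cnn_model.pth will only load if the real training architecture matches.

import torch
import torch.nn as nn
import torch.nn.functional as F

class AudioCNN(nn.Module):
    def __init__(self, num_classes, input_shape):
        super().__init__()
        # Two conv + pool blocks over the (channels, n_mfcc, time) input
        self.conv1 = nn.Conv2d(input_shape[0], 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2)
        # Infer the flattened feature size with a dummy forward pass
        with torch.no_grad():
            dummy = torch.zeros(1, *input_shape)
            n_flat = self.pool(self.conv2(self.pool(self.conv1(dummy)))).numel()
        self.fc = nn.Linear(n_flat, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        return self.fc(x.flatten(1))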
audio_cnn_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff465244d30d269ac350eaadf82abae3434daaab14fe00439b191afb121c2fe7
+ size 194384
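audio_cnn_model.pth is stored as a Git LFS pointer: a plain git clone without git lfs pull leaves only the three-line pointer above on disk, and torch.load will then fail. A quick integrity check against the pointer, as a sketch:

import hashlib

# Verify the resolved weights match the LFS pointer's size and oid above
with open("audio_cnn_model.pth", "rb") as f:
    blob = f.read()
print(len(blob) == 194384)
print(hashlib.sha256(blob).hexdigest() ==
      "ff465244d30d269ac350eaadf82abae3434daaab14fe00439b191afb121c2fe7")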
requirements.txt ADDED
@@ -0,0 +1,8 @@
+
+ gradio
+ torch
+ torchaudio
+ datasets
+ # Add any other libraries your model or data loading might need,
+ # e.g., scikit-learn if you used train_test_split in your final app code
+ scikit-learn  # added for train_test_split; the PyPI package is scikit-learn, not sklearn
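Once the dependencies are installed and the LFS weights are resolved, a quick local smoke test (sample.wav is a hypothetical example clip) can call the prediction function directly; the __main__ guard in app.py keeps the import from launching the Gradio server:

# Local smoke test; 'sample.wav' is a hypothetical example file
from app import predict_audio

print(predict_audio("sample.wav"))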