SwitchAlpha committed on
Commit
e4c7073
·
verified ·
1 Parent(s): 4839cef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -32
app.py CHANGED
@@ -5,7 +5,7 @@ import librosa
5
  import traceback
6
  import os
7
 
8
- # ==== Feature params (adjust to match training if you know them) ====
9
  SR = 16000
10
  N_FFT = 1024
11
  HOP_LENGTH = 256
@@ -15,58 +15,110 @@ N_BANDS = 6
15
  FMIN = 20.0
16
  WINDOW = "hann"
17
  N_MFCC = 40
18
- # ====================================================================
19
 
20
- # Lazy-load so startup doesn't crash if files are missing
21
  _model = None
22
  _label = None
23
  _model_err = None
24
 
25
  def load_artifacts():
 
26
  global _model, _label, _model_err
27
  if _model is not None:
28
  return
29
  try:
30
- assert os.path.exists("model.joblib"), "model.joblib not found in working dir"
31
- assert os.path.exists("label.joblib"), "label.joblib not found in working dir"
 
 
32
  _model = joblib.load("model.joblib")
33
  _label = joblib.load("label.joblib")
34
  except Exception as e:
35
  _model_err = f"Model load failed: {e}\n{traceback.format_exc()}"
36
 
 
 
 
 
 
 
37
  def extract_features_from_array(y, sr):
38
- # mono + resample
 
 
 
 
 
 
 
 
39
  y = np.asarray(y, dtype=np.float32)
 
 
40
  if y.ndim > 1:
41
  y = np.mean(y, axis=1)
42
  if sr != SR:
43
  y = librosa.resample(y=y, orig_sr=sr, target_sr=SR)
44
  sr = SR
45
- # pad to at least 1s
 
46
  if len(y) < SR:
47
  y = np.pad(y, (0, SR - len(y)))
48
- # features
49
- mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC,
50
- n_fft=N_FFT, hop_length=HOP_LENGTH,
51
- win_length=WIN_LENGTH, window=WINDOW).T, axis=0)
52
- mel = np.mean(librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT,
53
- hop_length=HOP_LENGTH, win_length=WIN_LENGTH,
54
- n_mels=N_MELS).T, axis=0)
55
- stft = np.abs(librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH,
56
- win_length=WIN_LENGTH, window=WINDOW))
57
- chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
58
- contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr,
59
- n_fft=N_FFT, hop_length=HOP_LENGTH,
60
- win_length=WIN_LENGTH,
61
- n_bands=N_BANDS, fmin=FMIN).T, axis=0)
62
- tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr).T, axis=0)
63
- feats = np.concatenate([mfcc, chroma, mel, contrast, tonnetz]).astype(np.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  return feats
65
 
66
  def predict_from_audio(audio):
67
  """
68
- We set inputs=gr.Audio(type="numpy"), which gives a tuple: (sr, array)
69
- If you prefer filepath, switch inputs to gr.Audio(type="filepath") and keep only the filepath branch.
70
  """
71
  try:
72
  load_artifacts()
@@ -76,7 +128,7 @@ def predict_from_audio(audio):
76
  if audio is None:
77
  return "Lütfen bir ses dosyası yükleyin veya kaydedin."
78
 
79
- # Handle different gradio formats robustly
80
  if isinstance(audio, dict) and "sampling_rate" in audio and "array" in audio:
81
  sr = int(audio["sampling_rate"])
82
  y = np.array(audio["array"], dtype=np.float32)
@@ -85,29 +137,27 @@ def predict_from_audio(audio):
85
  sr = int(sr)
86
  y = np.array(y, dtype=np.float32)
87
  elif isinstance(audio, str):
88
- # if you changed inputs to type="filepath"
89
  y, sr = librosa.load(audio, sr=SR)
90
  else:
91
  return "Beklenmedik ses girdisi formatı."
92
 
93
  feats = extract_features_from_array(y, sr)
94
- X = feats.reshape(1, -1)
95
  pred = _model.predict(X)
96
- # Make sure label is a Python string (not numpy type)
97
  label = _label.inverse_transform(pred)[0]
98
  return f"Tahmin: {str(label)}"
99
 
100
  except Exception as e:
101
- # Show full traceback in the textbox so we see the real error instead of generic “output error”
102
  tb = traceback.format_exc()
103
  return f"❌ Hata oluştu:\n{e}\n\nTraceback:\n{tb}"
104
 
105
  TITLE = "Baby Cry Classification (foduucom)"
106
- DESC = "Bir bebek ağlaması sesini yükleyin veya kaydedin; model sınıf tahmini yapsın."
107
 
108
  demo = gr.Interface(
109
  fn=predict_from_audio,
110
- inputs=gr.Audio(sources=["upload", "microphone"], type="numpy"), # or "filepath"
111
  outputs=gr.Textbox(lines=6),
112
  title=TITLE,
113
  description=DESC,
 
5
  import traceback
6
  import os
7
 
8
+ # ==== Özellik/işleme parametreleri (eğitimdekilerle eşleştirmen önerilir) ====
9
  SR = 16000
10
  N_FFT = 1024
11
  HOP_LENGTH = 256
 
15
  FMIN = 20.0
16
  WINDOW = "hann"
17
  N_MFCC = 40
18
+ # ============================================================================
19
 
 
20
  _model = None
21
  _label = None
22
  _model_err = None
23
 
24
  def load_artifacts():
25
+ """model.joblib ve label.joblib dosyalarını geç yükle (lazy load)."""
26
  global _model, _label, _model_err
27
  if _model is not None:
28
  return
29
  try:
30
+ if not os.path.exists("model.joblib"):
31
+ raise FileNotFoundError("model.joblib not found in working dir")
32
+ if not os.path.exists("label.joblib"):
33
+ raise FileNotFoundError("label.joblib not found in working dir")
34
  _model = joblib.load("model.joblib")
35
  _label = joblib.load("label.joblib")
36
  except Exception as e:
37
  _model_err = f"Model load failed: {e}\n{traceback.format_exc()}"
38
 
39
+ def _mean_std(feat_2d):
40
+ # (time, dim) dizisinden mean ve std çıkar
41
+ m = np.mean(feat_2d, axis=0)
42
+ s = np.std(feat_2d, axis=0)
43
+ return m, s
44
+
45
  def extract_features_from_array(y, sr):
46
+ """
47
+ 194 boyutlu özellik vektörü üret:
48
+ MFCC mean+std = 40*2=80
49
+ Chroma mean+std = 12*2=24
50
+ Mel mean = 64
51
+ Spectral contrast mean+std = 7*2=14
52
+ Tonnetz mean+std = 6*2=12
53
+ Toplam = 194
54
+ """
55
  y = np.asarray(y, dtype=np.float32)
56
+
57
+ # mono + yeniden örnekleme
58
  if y.ndim > 1:
59
  y = np.mean(y, axis=1)
60
  if sr != SR:
61
  y = librosa.resample(y=y, orig_sr=sr, target_sr=SR)
62
  sr = SR
63
+
64
+ # çok kısa kayıtları pad et (>=1 sn)
65
  if len(y) < SR:
66
  y = np.pad(y, (0, SR - len(y)))
67
+
68
+ # MFCC (mean + std) → 80
69
+ mfcc = librosa.feature.mfcc(
70
+ y=y, sr=sr, n_mfcc=N_MFCC,
71
+ n_fft=N_FFT, hop_length=HOP_LENGTH,
72
+ win_length=WIN_LENGTH, window=WINDOW
73
+ ).T
74
+ mfcc_mean, mfcc_std = _mean_std(mfcc)
75
+
76
+ # Mel-spectrogram (sadece mean) → 64
77
+ mel = librosa.feature.melspectrogram(
78
+ y=y, sr=sr, n_fft=N_FFT,
79
+ hop_length=HOP_LENGTH, win_length=WIN_LENGTH,
80
+ n_mels=N_MELS
81
+ ).T
82
+ mel_mean = np.mean(mel, axis=0)
83
+
84
+ # STFT
85
+ S = np.abs(librosa.stft(
86
+ y, n_fft=N_FFT, hop_length=HOP_LENGTH,
87
+ win_length=WIN_LENGTH, window=WINDOW
88
+ ))
89
+
90
+ # Chroma (mean + std) → 24
91
+ chroma = librosa.feature.chroma_stft(S=S, sr=sr).T
92
+ chroma_mean, chroma_std = _mean_std(chroma)
93
+
94
+ # Spectral Contrast (mean + std) → 14
95
+ contrast = librosa.feature.spectral_contrast(
96
+ S=S, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH,
97
+ win_length=WIN_LENGTH, n_bands=N_BANDS, fmin=FMIN
98
+ ).T
99
+ contrast_mean, contrast_std = _mean_std(contrast)
100
+
101
+ # Tonnetz (mean + std) → 12
102
+ y_harm = librosa.effects.harmonic(y)
103
+ tonnetz = librosa.feature.tonnetz(y=y_harm, sr=sr).T
104
+ tonnetz_mean, tonnetz_std = _mean_std(tonnetz)
105
+
106
+ feats = np.concatenate([
107
+ mfcc_mean, mfcc_std, # 80
108
+ chroma_mean, chroma_std, # 24
109
+ mel_mean, # 64
110
+ contrast_mean, contrast_std, # 14
111
+ tonnetz_mean, tonnetz_std # 12
112
+ ]).astype(np.float32)
113
+
114
+ # Güvenlik kontrolü
115
+ # print("feature_dim:", feats.shape[0]) # 194 olmalı
116
  return feats
117
 
118
  def predict_from_audio(audio):
119
  """
120
+ inputs=gr.Audio(type="numpy") (sr, array)
121
+ Dilersen type="filepath" yapıp aşağıdaki string yol dalını kullanabilirsin.
122
  """
123
  try:
124
  load_artifacts()
 
128
  if audio is None:
129
  return "Lütfen bir ses dosyası yükleyin veya kaydedin."
130
 
131
+ # Gradio girdi varyantlarını karşıla
132
  if isinstance(audio, dict) and "sampling_rate" in audio and "array" in audio:
133
  sr = int(audio["sampling_rate"])
134
  y = np.array(audio["array"], dtype=np.float32)
 
137
  sr = int(sr)
138
  y = np.array(y, dtype=np.float32)
139
  elif isinstance(audio, str):
140
+ # inputs=gr.Audio(type="filepath") kullanırsan burası çalışır
141
  y, sr = librosa.load(audio, sr=SR)
142
  else:
143
  return "Beklenmedik ses girdisi formatı."
144
 
145
  feats = extract_features_from_array(y, sr)
146
+ X = feats.reshape(1, -1) # (1, 194)
147
  pred = _model.predict(X)
 
148
  label = _label.inverse_transform(pred)[0]
149
  return f"Tahmin: {str(label)}"
150
 
151
  except Exception as e:
 
152
  tb = traceback.format_exc()
153
  return f"❌ Hata oluştu:\n{e}\n\nTraceback:\n{tb}"
154
 
155
  TITLE = "Baby Cry Classification (foduucom)"
156
+ DESC = "Bebek ağlaması sesini yükleyin veya mikrofondan kaydedin; model sınıf tahmini yapsın."
157
 
158
  demo = gr.Interface(
159
  fn=predict_from_audio,
160
+ inputs=gr.Audio(sources=["upload", "microphone"], type="numpy"),
161
  outputs=gr.Textbox(lines=6),
162
  title=TITLE,
163
  description=DESC,