Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import librosa | |
| import numpy as np | |
| import joblib | |
# Path to the serialized scikit-learn SVM classifier.
MODEL_PATH = "gender_recognition_svm.joblib"

# Maps raw model labels to the human-readable text shown in the UI.
# Several training corpora use different label spellings for the same class.
VALID_LABELS = {
    "male": "Male",
    "male_masculine": "Male",
    "female": "Female",
    "female_feminine": "Female",
}

print("Loading model...")
model = joblib.load(MODEL_PATH)
print("Model loaded successfully.")
def predict_gender(audio):
    """Classify the speaker gender of a recorded/uploaded audio clip.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        Gradio ``type="numpy"`` payload: ``(sample_rate, samples)``.
        Samples may be int PCM (e.g. int16) or float, mono or stereo.

    Returns
    -------
    str
        "Male"/"Female" on success, otherwise a human-readable error message.
    """
    print("Received input to predict_gender:", type(audio))

    # No audio provided at all.
    if audio is None:
        print("No audio input received.")
        return "No voice detected. Please record or upload audio."

    # Gradio (type='numpy') -> (sr, y)
    try:
        sr, y = audio
    except Exception as e:
        print("Unexpected audio format when unpacking:", e, "value:", audio)
        return "Unknown (invalid audio format)."

    if not isinstance(y, np.ndarray):
        print("Audio data is not a numpy array:", type(y))
        return "Unknown (invalid audio data)."

    # Zero-length clip: resampling/MFCC would fail with an opaque error,
    # so report it explicitly up front.
    if y.size == 0:
        print("Empty audio buffer received.")
        return "No voice detected. Please record or upload audio."

    print(f"Original sample rate: {sr}, audio shape: {y.shape}")

    # Convert stereo -> mono if needed.
    if y.ndim == 2:
        print("Converting stereo to mono.")
        try:
            # y shape: (samples, channels) -> transpose so to_mono can handle it
            y = librosa.to_mono(y.T)
        except Exception as e:
            print("Error converting to mono:", e)
            return "Unknown (error converting audio to mono)."

    # Ensure float dtype. BUGFIX: Gradio delivers integer PCM (typically
    # int16); librosa expects float audio in [-1, 1], so integer samples
    # must be rescaled, not just cast — otherwise MFCCs are computed on
    # values in the +/-32768 range and won't match training features.
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / np.iinfo(y.dtype).max
    elif not np.issubdtype(y.dtype, np.floating):
        y = y.astype(np.float32)

    # Normalize the sample rate to 16 kHz (the rate the model expects).
    target_sr = 16000
    if sr != target_sr:
        print(f"Resampling from {sr} Hz to {target_sr} Hz")
        try:
            y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
            sr = target_sr
            print(f"Resampled audio shape: {y.shape}")
        except Exception as e:
            print("Error during resampling:", e)
            return "Unknown (error during resampling)."

    print(f"Preprocessed audio shape: {y.shape}, sample rate: {sr}")

    # Extract MFCC features: mean over time of 13 coefficients -> (1, 13).
    try:
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        features = np.mean(mfcc.T, axis=0).reshape(1, -1)
        print("MFCC features extracted:", features.shape)
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return "Unknown (error processing audio features)."

    # Predict with the SVM.
    try:
        pred = model.predict(features)
        print("Raw prediction:", pred)
        label = str(pred[0])
    except Exception as e:
        print("Error during model prediction:", e)
        return "Unknown (error during prediction)."

    # Validate the label against the known mapping.
    if label not in VALID_LABELS:
        print("Warning: unexpected label from model:", label)
        return f"Unknown (unexpected model label: {label})"

    # Return the normalized display text.
    return VALID_LABELS[label]
# Build the Gradio UI: one audio input (mic or file upload), one text output.
iface = gr.Interface(
    fn=predict_gender,
    inputs=gr.Audio(type="numpy", label="Record or upload voice"),
    outputs=gr.Textbox(label="Predicted gender"),
    title="Gender Recognition For Thai Voices",
    description="Upload or record a short voice clip to classify gender.",
)
# Script entry point: start the web UI with a public share link.
if __name__ == "__main__":
    print("Launching Gradio interface...")
    iface.launch(share=True)