Spaces:

ProfRom
/

TestSpace3

Sleeping

File size: 5,788 Bytes

import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import numpy as np
from datasets import load_dataset


# ---------------------------
# Constants
# ---------------------------
# Audio front-end settings. They are consumed by the module-level
# MelSpectrogram transform and by the preprocessing function in this file.

# Sample rate every clip is resampled to before feature extraction.
TARGET_SR = 44100
# FFT size of the spectrogram; also used as the minimum waveform length,
# since shorter clips are zero-padded up to this many samples.
N_FFT = 1024
# Hop between successive spectrogram frames, in samples.
HOP_LENGTH = 512
# Number of mel filterbank bands.
N_MELS = 64


# ---------------------------
# Load Dataset Metadata for Labels
# ---------------------------
# The dataset is loaded at import time; in the code visible here it is used
# only to recover the class-label names that map a predicted index to text.
# NOTE(review): this presumably downloads/loads the full dataset just to read
# label metadata -- confirm whether a metadata-only load would be sufficient.
dataset = load_dataset("ccmusic-database/pianos", name="8_class")
# Class names, indexed by the integer label id of the "train" split.
label_names = dataset["train"].features["label"].names
# Width of the classification head of the model built below.
num_classes = len(label_names)


# ---------------------------
# Define the Same CNN Model as in Training
# ---------------------------
class PianoCNNMultiTask(nn.Module):
    """Multi-task CNN: a shared convolutional trunk with two output heads.

    The trunk is four Conv -> BatchNorm -> ReLU stages.  The first three are
    each followed by a 2x2 max-pool; the last one by an adaptive average pool
    that always yields a 4x4 map.  The flattened features pass through a
    256-unit shared layer with dropout and then branch into

    * a classification head producing ``num_classes`` logits, and
    * a regression head producing one scalar per sample (quality score).

    Submodule attribute names and the layer order inside ``features`` define
    the ``state_dict`` keys, so they must not change if saved checkpoints are
    to keep loading.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Channel widths: network input first, then each stage's output.
        widths = (3, 16, 32, 64, 128)
        final_stage = len(widths) - 2

        stack = []
        for stage, (c_in, c_out) in enumerate(zip(widths, widths[1:])):
            stack.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
            stack.append(nn.BatchNorm2d(c_out))
            stack.append(nn.ReLU())
            if stage == final_stage:
                # Fixed 4x4 feature map regardless of the incoming size.
                stack.append(nn.AdaptiveAvgPool2d((4, 4)))
            else:
                # Halves each spatial dimension
                # (128 -> 64 -> 32 -> 16 for a 128x128 input).
                stack.append(nn.MaxPool2d(2))
        self.features = nn.Sequential(*stack)

        self.flatten = nn.Flatten()
        self.fc_shared = nn.Linear(widths[-1] * 4 * 4, 256)
        self.dropout = nn.Dropout(0.3)

        # Task-specific heads on top of the shared 256-d representation.
        self.fc_class = nn.Linear(256, num_classes)  # class logits
        self.fc_reg = nn.Linear(256, 1)              # quality score

    def forward(self, x):
        """Return ``(class_logits, quality_pred)`` for a batch of images.

        ``class_logits`` has shape ``[batch, num_classes]``; ``quality_pred``
        has shape ``[batch]`` (the trailing singleton dimension is squeezed).
        """
        flat = self.flatten(self.features(x))
        shared = self.dropout(F.relu(self.fc_shared(flat)))
        return self.fc_class(shared), self.fc_reg(shared).squeeze(1)


# ---------------------------
# Initialize and Load Trained Model (CPU)
# ---------------------------
# Build the network with one output logit per dataset label, then restore the
# trained weights from the checkpoint file next to this script.
model = PianoCNNMultiTask(num_classes=num_classes)
# map_location forces all tensors onto the CPU, so no GPU is needed to load.
# NOTE(review): torch.load unpickles the file; if the installed torch version
# supports it, consider passing weights_only=True for a plain state dict.
state_dict = torch.load("piano_cnn_multitask.pt", map_location=torch.device("cpu"))
model.load_state_dict(state_dict)
model.eval()  # inference mode


# ---------------------------
# Audio Preprocessing
# ---------------------------
# Single shared mel-spectrogram transform, configured from the constants above
# and reused for every request.
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=TARGET_SR,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS,
    center=False  # no implicit padding; short clips are zero-padded to N_FFT by the caller
)


def preprocess_audio_to_mel_image(audio):
    """
    Convert a Gradio audio tuple into a normalized mel-spectrogram "image".

    audio from gradio.Audio(type="numpy") is (sample_rate, data), where data
    is 1-D (samples,) for mono or 2-D (samples, channels) for multi-channel.

    Steps: convert to a (1, samples) mono waveform, resample to TARGET_SR,
    zero-pad to at least N_FFT samples, compute the mel spectrogram in dB,
    min-max normalize to [0, 1], resize to 128x128 and replicate to 3 channels.

    Returns a 3x128x128 float32 tensor ready for the CNN.
    """
    sr, data = audio

    # Convert to tensor
    waveform = torch.tensor(data, dtype=torch.float32)

    if waveform.ndim == 1:
        # Mono (samples,) -> (1, samples)
        waveform = waveform.unsqueeze(0)
    elif waveform.ndim == 2 and waveform.shape[0] > waveform.shape[1]:
        # (samples, channels) -> (channels, samples).  The longer axis is taken
        # to be time.  This must not run on the (1, samples) mono tensor built
        # above: transposing that would make the mono-mix below average over
        # time and collapse the whole clip to a single value.
        waveform = waveform.transpose(0, 1)

    # Mix down to mono by averaging across channels
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to TARGET_SR if needed
    if sr != TARGET_SR:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SR)
        waveform = resampler(waveform)

    # mel_transform uses center=False, so at least one full FFT window of
    # samples is required; right-pad short clips with zeros.
    missing = N_FFT - waveform.shape[-1]
    if missing > 0:
        waveform = F.pad(waveform, (0, missing))

    # Compute Mel-spectrogram and convert to dB
    mel = mel_transform(waveform)          # [1, n_mels, time]
    mel_db = torchaudio.transforms.AmplitudeToDB()(mel)

    # Normalize to 0-1; the epsilon guards against division by zero when the
    # spectrogram is constant (e.g. pure silence).
    lowest = mel_db.min()
    mel_db = (mel_db - lowest) / (mel_db.max() - lowest + 1e-6)

    # Resize to 128x128 (the time axis is stretched or squeezed to fit) and
    # replicate the single channel three times.
    mel_db = mel_db.unsqueeze(0)  # [1, 1, H, W]
    mel_resized = F.interpolate(mel_db, size=(128, 128), mode="bilinear", align_corners=False)
    mel_rgb = mel_resized.repeat(1, 3, 1, 1)  # [1, 3, 128, 128]

    return mel_rgb.squeeze(0)  # [3, 128, 128]


# ---------------------------
# Main Inference Function
# ---------------------------
def analyze_piano(audio):
    """Run the model on one audio clip and format the result as text.

    ``audio`` is the value delivered by the Gradio audio input: ``None`` when
    nothing was provided, otherwise the tuple accepted by
    ``preprocess_audio_to_mel_image``.

    Returns a human-readable string: a prompt to provide audio, the predicted
    piano type plus quality score, or an error message if anything in the
    pipeline raised.
    """
    # Guard clause: nothing uploaded or recorded yet.
    if audio is None:
        return "Please upload or record a piano audio clip (around 1–3 seconds)."

    # UI boundary: any failure is reported to the user instead of crashing.
    try:
        # [3,128,128] image -> [1,3,128,128] single-item batch
        batch = preprocess_audio_to_mel_image(audio).unsqueeze(0)

        with torch.no_grad():
            class_logits, quality = model(batch)
            best_index = torch.argmax(class_logits, dim=1).item()
            raw_score = float(quality.item())

        predicted_type = label_names[best_index]
        shown_score = round(raw_score, 2)

        return (
            f"Piano Type Prediction: {predicted_type}\n"
            f"Estimated Sound Quality Score: {shown_score} / 10"
        )
    except Exception as e:
        return f"An error occurred while processing the audio: {e}"


# ---------------------------
# Gradio Interface
# ---------------------------
# Single-function web UI: one audio input wired to analyze_piano, one text
# output showing the string it returns.
demo = gr.Interface(
    fn=analyze_piano,
    inputs=gr.Audio(
        # Accept either an uploaded file or a microphone recording.
        sources=["upload", "microphone"],
        # Deliver audio to the handler as a (sample_rate, data) tuple.
        type="numpy",
        label="Upload Piano Audio or Record with Microphone"
    ),
    outputs=gr.Textbox(label="AI Analysis Output"),
    title="AI Piano Sound Analyzer 🎹",
    description="Upload a short piano recording to get a predicted piano type and estimated sound-quality score from the trained CNN model."
)


# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()