Spaces:

ProfRom
/

TestSpace3

Sleeping

App Files Files Community

ProfRom committed 30 days ago

Commit

555f48a

verified ·

1 Parent(s): 647e467

House - Unit 8 Assignment

Browse files

Files changed (1) hide show

app.py +199 -43

app.py CHANGED Viewed

@@ -1,54 +1,210 @@
-from transformers import pipeline
-from PIL import Image
 import gradio as gr
-# Visual-question-answering pipeline; built once at import time and
-# called by vqa_answer with an image and a question.
-vqa_pipeline = pipeline(
-    "visual-question-answering",
-    model="dandelin/vilt-b32-finetuned-vqa"
-)
-# English -> Korean translator (reliable alternative); vqa_answer calls it
-# with src_lang="en" and tgt_lang="ko" on the English answer text.
-translator = pipeline(
-    "translation",
-    model="facebook/m2m100_418M"
 )
-def vqa_answer(image: Image.Image, question: str):
-    """Answer a question about an image and append a Korean translation.
-
-    Returns a prompt string when the image is missing or the question is
-    blank; otherwise returns the top VQA answer (with its score, when one
-    is present) followed by the Korean translation of that answer.
-    """
-    # Guard clauses: both inputs are required before any model is called.
-    if image is None:
-        return "Please upload an image."
-    if not (question and question.strip()):
-        return "Please enter a question about the image."
-    # Only the first (top) candidate returned by the VQA pipeline is used.
-    best = vqa_pipeline(image=image, question=question)[0]
-    answer = best["answer"]
-    confidence = best.get("score")
-    if isinstance(confidence, (float, int)):
-        confidence_text = f"{confidence:.3f}"
-    else:
-        confidence_text = "N/A"
-    # Translate the English answer into Korean.
-    translation = translator(answer, src_lang="en", tgt_lang="ko")
-    translated = translation[0]["translation_text"]
-    english_line = f"Answer (EN): {answer} (score: {confidence_text})"
-    korean_line = f"번역 (KO): {translated}"
-    return "\n\n".join([english_line, korean_line])
-# Gradio UI for the VQA demo: an image upload and a question textbox are
-# passed to vqa_answer, whose combined English/Korean text fills one textbox.
 demo = gr.Interface(
-    fn=vqa_answer,
-    inputs=[
-        gr.Image(type="pil", label="Upload an image"),
-        gr.Textbox(lines=2, label="Question about the image")
-    ],
-    outputs=gr.Textbox(label="VQA Result (English + Korean)"),
-    title="Visual Question Answering + Korean Translation",
-    description="Upload an image, ask a question, and see the answer in English and Korean."
 )
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+import numpy as np
+from datasets import load_dataset
+# ---------------------------
+# Constants
+# ---------------------------
+# Sample rate (Hz) given to the Mel transform; input audio with a different
+# rate is resampled to this value during preprocessing.
+TARGET_SR = 44100
+# FFT window size; also used as the minimum waveform length before the STFT.
+N_FFT = 1024
+# Hop between successive STFT frames, in samples.
+HOP_LENGTH = 512
+# Number of Mel bands in the spectrogram.
+N_MELS = 64
+# ---------------------------
+# Load Dataset Metadata for Labels
+# ---------------------------
+# The dataset object is only used here to read the class-label names; the
+# index predicted by the model is mapped back to a name through label_names.
+# NOTE(review): load_dataset presumably fetches the full "8_class" config at
+# startup just to obtain these names - confirm, and consider reading the
+# label names from dataset metadata or a hard-coded list instead.
+dataset = load_dataset("ccmusic-database/pianos", name="8_class")
+label_names = dataset["train"].features["label"].names
+num_classes = len(label_names)
+# ---------------------------
+# Define the Same CNN Model as in Training
+# ---------------------------
+class PianoCNNMultiTask(nn.Module):
+    """Convolutional network with a shared trunk and two output heads.
+
+    The trunk is four Conv-BatchNorm-ReLU stages; the first three are each
+    followed by 2x2 max pooling and the last by adaptive average pooling to
+    a 4x4 map. A shared 256-unit layer then feeds a classification head
+    (``num_classes`` logits) and a regression head (one quality value per
+    sample).
+
+    Layer order and attribute names are kept stable so that the parameter
+    names in ``state_dict()`` do not change.
+    """
+    def __init__(self, num_classes):
+        super().__init__()
+        # (in_channels, out_channels) for each convolutional stage.
+        channel_plan = [(3, 16), (16, 32), (32, 64), (64, 128)]
+        final_stage = len(channel_plan) - 1
+        layers = []
+        for stage, (c_in, c_out) in enumerate(channel_plan):
+            layers.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1))
+            layers.append(nn.BatchNorm2d(c_out))
+            layers.append(nn.ReLU())
+            if stage < final_stage:
+                # Halves height and width (128 -> 64 -> 32 -> 16 for 128x128 input).
+                layers.append(nn.MaxPool2d(2))
+            else:
+                # Fixed 4x4 feature map regardless of the incoming spatial size.
+                layers.append(nn.AdaptiveAvgPool2d((4, 4)))
+        self.features = nn.Sequential(*layers)
+        self.flatten = nn.Flatten()
+        self.fc_shared = nn.Linear(128 * 4 * 4, 256)
+        self.dropout = nn.Dropout(0.3)
+        # Classification head
+        self.fc_class = nn.Linear(256, num_classes)
+        # Regression head (quality score)
+        self.fc_reg = nn.Linear(256, 1)
+    def forward(self, x):
+        """Return ``(class_logits, quality_pred)`` for a batch of inputs."""
+        shared = self.flatten(self.features(x))
+        shared = self.dropout(F.relu(self.fc_shared(shared)))
+        # squeeze(1) drops the singleton output dim of the regression head.
+        return self.fc_class(shared), self.fc_reg(shared).squeeze(1)
+# ---------------------------
+# Initialize and Load Trained Model (CPU)
+# ---------------------------
+# The network must be built with the same num_classes as the checkpoint so
+# that load_state_dict finds matching parameter names and shapes.
+model = PianoCNNMultiTask(num_classes=num_classes)
+# map_location forces all tensors onto the CPU when the checkpoint is read.
+# NOTE(review): torch.load unpickles the file; if the checkpoint could ever
+# come from an untrusted source, consider weights_only=True - confirm the
+# installed torch version supports it.
+state_dict = torch.load("piano_cnn_multitask.pt", map_location=torch.device("cpu"))
+model.load_state_dict(state_dict)
+model.eval()  # inference mode
+# ---------------------------
+# Audio Preprocessing
+# ---------------------------
+# Shared Mel-spectrogram transform, built once and reused for every request.
+mel_transform = torchaudio.transforms.MelSpectrogram(
+    sample_rate=TARGET_SR,
+    n_fft=N_FFT,
+    hop_length=HOP_LENGTH,
+    n_mels=N_MELS,
+    center=False  # no implicit padding; short clips are padded by the caller
 )
+def preprocess_audio_to_mel_image(audio):
+    """
+    Convert a Gradio audio tuple into a normalized Mel-spectrogram "image".
+
+    Args:
+        audio: ``(sample_rate, data)`` as produced by
+            ``gradio.Audio(type="numpy")``; ``data`` is shaped ``(samples,)``
+            for mono or ``(samples, channels)`` for multi-channel audio.
+
+    Returns:
+        A float32 tensor of shape ``[3, 128, 128]`` with values in [0, 1],
+        ready to be batched and fed to the CNN.
+
+    Raises:
+        ValueError: if ``data`` contains no samples.
+    """
+    sr, data = audio
+    samples = np.asarray(data)
+    if samples.size == 0:
+        raise ValueError("audio clip is empty")
+    # Gradio delivers integer PCM (typically int16). Scale it to [-1, 1] so
+    # the dB floor applied later behaves as it does for float waveforms.
+    if np.issubdtype(samples.dtype, np.integer):
+        samples = samples.astype(np.float32) / float(np.iinfo(samples.dtype).max)
+    # Convert to tensor
+    waveform = torch.tensor(samples, dtype=torch.float32)
+    if waveform.ndim == 1:
+        # (samples,) -> (1, samples)
+        waveform = waveform.unsqueeze(0)
+    elif waveform.ndim == 2 and waveform.shape[0] > waveform.shape[1]:
+        # (samples, channels) -> (channels, samples). The longer axis is the
+        # time axis; the previous "<" test transposed the wrong way and
+        # collapsed mono clips to a single sample.
+        waveform = waveform.transpose(0, 1)
+    # Down-mix multi-channel audio to mono by averaging the channels.
+    if waveform.shape[0] > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+    # Resample to TARGET_SR if needed
+    if sr != TARGET_SR:
+        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SR)
+        waveform = resampler(waveform)
+    # mel_transform uses center=False, so at least N_FFT samples are needed
+    # to produce one STFT frame; zero-pad shorter clips on the right.
+    shortfall = N_FFT - waveform.shape[-1]
+    if shortfall > 0:
+        waveform = F.pad(waveform, (0, shortfall))
+    # Compute Mel-spectrogram and convert to dB
+    mel = mel_transform(waveform)          # [1, n_mels, time]
+    mel_db = torchaudio.transforms.AmplitudeToDB()(mel)
+    # Min-max normalize to 0-1; the epsilon guards against a constant input.
+    mel_db = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + 1e-6)
+    # Resize to 128x128 and replicate the single channel three times.
+    mel_db = mel_db.unsqueeze(0)  # [1, 1, H, W]
+    mel_resized = F.interpolate(mel_db, size=(128, 128), mode="bilinear", align_corners=False)
+    mel_rgb = mel_resized.repeat(1, 3, 1, 1)  # [1, 3, 128, 128]
+    return mel_rgb.squeeze(0)  # [3, 128, 128]
+# ---------------------------
+# Main Inference Function
+# ---------------------------
+def analyze_piano(audio):
+    """Run the piano model on one clip and format the result as text.
+
+    Returns a prompt when no audio is supplied, a two-line prediction
+    (piano type and quality score rounded to two decimals) on success, or
+    an error message if any step of preprocessing or inference raises.
+    """
+    if audio is None:
+        return "Please upload or record a piano audio clip (around 1–3 seconds)."
+    try:
+        # [3,128,128] spectrogram image -> batch of one: [1,3,128,128]
+        batch = preprocess_audio_to_mel_image(audio).unsqueeze(0)
+        with torch.no_grad():
+            class_logits, quality = model(batch)
+        predicted = class_logits.argmax(dim=1).item()
+        score = round(float(quality.item()), 2)
+        report = [
+            f"Piano Type Prediction: {label_names[predicted]}",
+            f"Estimated Sound Quality Score: {score} / 10",
+        ]
+    except Exception as e:
+        # UI boundary: show the failure to the user instead of crashing the app.
+        return f"An error occurred while processing the audio: {e}"
+    return "\n".join(report)
+# ---------------------------
+# Gradio Interface
+# ---------------------------
+# Single-input UI: an uploaded or recorded clip is passed to analyze_piano as
+# a (sample_rate, data) tuple (type="numpy"), and the returned text is shown
+# in one textbox.
 demo = gr.Interface(
+    fn=analyze_piano,
+    inputs=gr.Audio(
+        sources=["upload", "microphone"],
+        type="numpy",
+        label="Upload Piano Audio or Record with Microphone"
+    ),
+    outputs=gr.Textbox(label="AI Analysis Output"),
+    title="AI Piano Sound Analyzer 🎹",
+    description="Upload a short piano recording to get a predicted piano type and estimated sound-quality score from the trained CNN model."
 )
 if __name__ == "__main__":
     demo.launch()