Spaces:

ProfRom
/

TestSpace3

Sleeping

App Files Files Community

ProfRom committed 28 days ago

Commit

523547d

verified ·

1 Parent(s): 921baab

Saar - Sanity Check

Browse files

Files changed (1) hide show

app.py +75 -187

app.py CHANGED Viewed

@@ -1,210 +1,98 @@
-import gradio as gr
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torchaudio
-import numpy as np
-from datasets import load_dataset
-# ---------------------------
-# Constants
-# ---------------------------
-TARGET_SR = 44100
-N_FFT = 1024
-HOP_LENGTH = 512
-N_MELS = 64
-# ---------------------------
-# Load Dataset Metadata for Labels
-# ---------------------------
-dataset = load_dataset("ccmusic-database/pianos", name="8_class")
-label_names = dataset["train"].features["label"].names
-num_classes = len(label_names)
-# ---------------------------
-# Define the Same CNN Model as in Training
-# ---------------------------
-class PianoCNNMultiTask(nn.Module):
-    def __init__(self, num_classes):
-        super().__init__()
-        self.features = nn.Sequential(
-            nn.Conv2d(3, 16, kernel_size=3, padding=1),
-            nn.BatchNorm2d(16),
-            nn.ReLU(),
-            nn.MaxPool2d(2),   # 128 -> 64
-            nn.Conv2d(16, 32, kernel_size=3, padding=1),
-            nn.BatchNorm2d(32),
-            nn.ReLU(),
-            nn.MaxPool2d(2),   # 64 -> 32
-            nn.Conv2d(32, 64, kernel_size=3, padding=1),
-            nn.BatchNorm2d(64),
-            nn.ReLU(),
-            nn.MaxPool2d(2),   # 32 -> 16
-            nn.Conv2d(64, 128, kernel_size=3, padding=1),
-            nn.BatchNorm2d(128),
-            nn.ReLU(),
-            nn.AdaptiveAvgPool2d((4, 4))  # 4x4 feature map
-        )
-        self.flatten = nn.Flatten()
-        self.fc_shared = nn.Linear(128 * 4 * 4, 256)
-        self.dropout = nn.Dropout(0.3)
-        # Classification head
-        self.fc_class = nn.Linear(256, num_classes)
-        # Regression head (quality score)
-        self.fc_reg = nn.Linear(256, 1)
-    def forward(self, x):
-        x = self.features(x)
-        x = self.flatten(x)
-        x = F.relu(self.fc_shared(x))
-        x = self.dropout(x)
-        class_logits = self.fc_class(x)
-        quality_pred = self.fc_reg(x).squeeze(1)
-        return class_logits, quality_pred
-# ---------------------------
-# Initialize and Load Trained Model (CPU)
-# ---------------------------
-model = PianoCNNMultiTask(num_classes=num_classes)
-state_dict = torch.load("piano_cnn_multitask.pt", map_location=torch.device("cpu"))
-model.load_state_dict(state_dict)
-model.eval()  # inference mode
-# ---------------------------
-# Audio Preprocessing
-# ---------------------------
-mel_transform = torchaudio.transforms.MelSpectrogram(
-    sample_rate=TARGET_SR,
-    n_fft=N_FFT,
-    hop_length=HOP_LENGTH,
-    n_mels=N_MELS,
-    center=False  # we will handle padding manually
 )
-def preprocess_audio_to_mel_image(audio):
-    """
-    audio from gradio.Audio(type="numpy") is (sample_rate, data)
-    Returns a 3x128x128 tensor ready for the CNN.
-    """
-    sr, data = audio
-    # Convert to tensor
-    waveform = torch.tensor(data, dtype=torch.float32)
-    # If shape is (samples,), make it (1, samples)
-    if waveform.ndim == 1:
-        waveform = waveform.unsqueeze(0)
-    # If shape is (samples, channels), transpose to (channels, samples)
-    if waveform.ndim == 2 and waveform.shape[0] < waveform.shape[1]:
-        waveform = waveform.transpose(0, 1)
-    # Convert to mono if stereo
-    if waveform.shape[0] > 1:
-        waveform = waveform.mean(dim=0, keepdim=True)
-    # Resample to TARGET_SR if needed
-    if sr != TARGET_SR:
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SR)
-        waveform = resampler(waveform)
-    # Ensure minimum length for STFT
-    min_len = N_FFT
-    if waveform.shape[-1] < min_len:
-        pad_amount = min_len - waveform.shape[-1]
-        waveform = F.pad(waveform, (0, pad_amount))
-    # Compute Mel-spectrogram and convert to dB
-    mel = mel_transform(waveform)          # [1, n_mels, time]
-    mel_db = torchaudio.transforms.AmplitudeToDB()(mel)
-    # Normalize to 0–1
-    mel_db = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + 1e-6)
-    # Resize to 128x128 and make 3 channels
-    mel_db = mel_db.unsqueeze(0)  # [1, 1, H, W]
-    mel_resized = F.interpolate(mel_db, size=(128, 128), mode="bilinear", align_corners=False)
-    mel_rgb = mel_resized.repeat(1, 3, 1, 1)  # [1, 3, 128, 128]
-    return mel_rgb.squeeze(0)  # [3, 128, 128]
-# ---------------------------
-# Main Inference Function
-# ---------------------------
-def analyze_piano(audio):
-    if audio is None:
-        return "Please upload or record a piano audio clip (around 1–3 seconds)."
     try:
-        # Preprocess input
-        mel_img = preprocess_audio_to_mel_image(audio)  # [3,128,128]
-        mel_batch = mel_img.unsqueeze(0)  # [1,3,128,128]
-        with torch.no_grad():
-            logits, q_pred = model(mel_batch)
-            class_idx = torch.argmax(logits, dim=1).item()
-            quality_score = float(q_pred.item())
-        piano_type = label_names[class_idx]
-        quality_score_rounded = round(quality_score, 2)
-        output_text = (
-            f"Piano Type Prediction: {piano_type}\n"
-            f"Estimated Sound Quality Score: {quality_score_rounded} / 10"
-        )
-        return output_text
     except Exception as e:
-        return f"An error occurred while processing the audio: {e}"
-# ---------------------------
-# Gradio Interface
-# ---------------------------
 demo = gr.Interface(
-    fn=analyze_piano,
-    inputs=gr.Audio(
-        sources=["upload", "microphone"],
-        type="numpy",
-        label="Upload Piano Audio or Record with Microphone"
     ),
-    outputs=gr.Textbox(label="AI Analysis Output"),
-    title="AI Piano Sound Analyzer 🎹",
-    description="Upload a short piano recording to get a predicted piano type and estimated sound-quality score from the trained CNN model."
 )
 if __name__ == "__main__":
-    demo.launch()

 import torch
+import gradio as gr
+from PIL import Image
+from transformers import (
+    BlipProcessor,
+    BlipForConditionalGeneration,
+    pipeline
 )
+# Select device
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Load BLIP captioning model directly
+processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+caption_model = BlipForConditionalGeneration.from_pretrained(
+    "Salesforce/blip-image-captioning-base",
+    torch_dtype=torch_dtype
+).to(device)
+# Load image classification model
+classifier = pipeline(
+    task="image-classification",
+    model="google/vit-base-patch16-224",
+    device=0 if torch.cuda.is_available() else -1
+)
+print("Models loaded successfully.")
+def generate_caption(image):
+    inputs = processor(images=image, return_tensors="pt").to(device, torch_dtype)
+    output = caption_model.generate(**inputs, max_new_tokens=30)
+    caption = processor.decode(output[0], skip_special_tokens=True)
+    return caption
+def build_summary(caption: str, label: str) -> str:
+    caption = caption.strip() if caption else "No caption available"
+    label = label.strip() if label else "unknown object"
+    return (
+        f"The captioning model describes the image as: {caption}. "
+        f"The image classification model identifies the main subject as: {label}. "
+        f"Taken together, the image appears to focus on this subject or scene."
+    )
+def analyze_image(image):
     try:
+        if image is None:
+            return (
+                "Please upload an image.",
+                "No classification available.",
+                "Please upload an image first."
+            )
+        image = image.convert("RGB")
+        # Captioning
+        caption = generate_caption(image)
+        print("CAPTION RESULT:", caption)
+        # Classification
+        class_result = classifier(image)
+        print("CLASSIFICATION RESULT:", class_result)
+        if isinstance(class_result, list) and len(class_result) > 0:
+            top_label = class_result[0].get("label", "Unknown")
+            top_score = class_result[0].get("score", 0.0)
+            classification_text = f"{top_label} (confidence: {top_score:.4f})"
+        else:
+            top_label = "Unknown"
+            classification_text = "No classification generated."
+        summary = build_summary(caption, top_label)
+        return caption, classification_text, summary
     except Exception as e:
+        print("ERROR:", str(e))
+        error_text = f"Error: {str(e)}"
+        return error_text, error_text, error_text
 demo = gr.Interface(
+    fn=analyze_image,
+    inputs=gr.Image(type="pil", label="Upload an Image"),
+    outputs=[
+        gr.Textbox(label="Generated Caption"),
+        gr.Textbox(label="Top Classification"),
+        gr.Textbox(label="Combined Summary", lines=4)
+    ],
+    title="Image Captioning, Classification, and Summary App",
+    description=(
+        "Upload an image to generate an automatic caption, predict the main image class, "
+        "and produce a short combined summary."
     ),
 )
 if __name__ == "__main__":
+    demo.launch()