laion/captioned-ai-music-snippets
Viewer • Updated • 2.91M • 267 • 12
This model predicts aesthetic quality scores for music audio files. It is a "Mixture of Experts" model that takes embeddings extracted from the https://huggingface.co/laion/music-whisper model and predicts 5 metrics defined by the SongEval dataset.
The model rates audio on a scale of 1.0 to 5.0 on each of the 5 SongEval metrics. In addition, it reports:
Overall Aesthetics Score: This is calculated by taking the average of the 5 individual metric scores.
This model operates on top of the OpenAI Whisper encoder (specifically the fine-tuned version linked above).
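For a 30-second clip, the encoder outputs a hidden-state tensor of shape (1, 1500, 768). The 1500 frames are split into 10 segments of 150 frames, and each segment is summarized with Mean, Max, and Min pooling. To use this model, you need librosa, transformers, torch, and huggingface_hub.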
import os
import torch
import numpy as np
import librosa
from huggingface_hub import hf_hub_download
from transformers import WhisperModel, WhisperProcessor
from model_architecture import MusicAestheticsModel # Downloaded from this repo
# Configuration
# Hugging Face repo of the Music Whisper model used as the audio encoder.
WHISPER_REPO = "laion/music-whisper"
# Hugging Face repo that hosts the aesthetics bottleneck and expert-head weights.
AESTHETICS_REPO = "laion/music-aesthetics"
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
def load_models():
    """Load the Whisper processor and encoder plus the aesthetics model.

    The processor and encoder come from ``WHISPER_REPO``. The aesthetics
    model is instantiated locally and its weights are downloaded from
    ``AESTHETICS_REPO``: one file for the shared bottleneck and one file
    per entry in ``model.metrics`` for the expert heads.

    Returns:
        A ``(processor, whisper_encoder, aesthetics_model)`` tuple. Both
        models are moved to ``DEVICE`` and put in eval mode.
    """

    def _fetch_state(filename):
        # Download one weight file from the aesthetics repo and load it.
        # NOTE(review): torch.load unpickles the downloaded file; consider
        # weights_only=True if the installed torch version supports it.
        local_path = hf_hub_download(repo_id=AESTHETICS_REPO, filename=filename)
        return torch.load(local_path, map_location=DEVICE)

    print("Loading Whisper Encoder...")
    processor = WhisperProcessor.from_pretrained(WHISPER_REPO)
    # Only the encoder half of Whisper is needed for feature extraction.
    full_whisper = WhisperModel.from_pretrained(WHISPER_REPO)
    encoder = full_whisper.encoder.to(DEVICE)
    encoder.eval()

    print("Loading Aesthetics Experts...")
    aesthetics = MusicAestheticsModel().to(DEVICE)

    # Shared bottleneck first, then one expert head per metric.
    aesthetics.bottleneck.load_state_dict(_fetch_state("stage1_bottleneck.pt"))
    for metric in aesthetics.metrics:
        head_state = _fetch_state(f"expert_{metric}.pt")
        aesthetics.heads[metric].load_state_dict(head_state)

    aesthetics.eval()
    return processor, encoder, aesthetics
def predict_score(audio_path, processor, whisper, aesthetic_model):
    """Score a single audio file with the aesthetics model.

    The file is loaded at 16 kHz, centre-cropped or zero-padded to exactly
    30 seconds, encoded with the Whisper encoder, pooled into one flat
    embedding, and passed to ``aesthetic_model``.

    Args:
        audio_path: Path of the audio file to analyse.
        processor: Feature extractor; called on the waveform and expected
            to return an object with ``input_features``.
        whisper: Encoder whose output exposes ``last_hidden_state``.
        aesthetic_model: Model mapping the embedding to a dict of
            per-metric single-value tensors.

    Returns:
        Dict mapping each metric name to a float score, plus an
        ``"Overall_Aesthetics"`` entry holding the mean of those scores.
    """
    sample_rate = 16000
    clip_samples = sample_rate * 30

    # 1. Load the audio and force it to exactly 30 s.
    waveform, _ = librosa.load(audio_path, sr=sample_rate)
    surplus = len(waveform) - clip_samples
    if surplus > 0:
        # Too long: keep the central 30 s window.
        offset = surplus // 2
        waveform = waveform[offset : offset + clip_samples]
    else:
        # Too short (or exact): zero-pad at the end.
        waveform = np.pad(waveform, (0, -surplus))

    # 2. Whisper features -> 3. pooled embedding -> 4. metric predictions.
    encoder_inputs = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    )
    with torch.no_grad():
        encoded = whisper(encoder_inputs.input_features.to(DEVICE))
        hidden = encoded.last_hidden_state  # (1, 1500, 768)

        # Split the 1500 frames into 10 segments of 150 frames each.
        segments = hidden.view(1, 10, 150, 768)
        pooled = [
            segments.mean(dim=2),
            segments.max(dim=2).values,
            segments.min(dim=2).values,
        ]
        # Concatenate along the feature axis, flatten to (23040,), then
        # restore the batch dimension.
        embedding = torch.cat(pooled, dim=2).view(-1).unsqueeze(0)

        raw_scores = aesthetic_model(embedding)

    results = {}
    for name, value in raw_scores.items():
        results[name] = value.item()

    # The overall score is the plain mean of the individual metrics; compute
    # it before inserting the new key so it does not include itself.
    overall = sum(results.values()) / len(results)
    results["Overall_Aesthetics"] = overall
    return results
# Example Usage
if __name__ == "__main__":
    processor, whisper, model = load_models()

    # Replace with your audio file
    audio_file = "test_song.mp3"

    if not os.path.exists(audio_file):
        print("Please provide a valid audio file path.")
    else:
        rule = "-" * 30
        scores = predict_score(audio_file, processor, whisper, model)
        # Print a small report: header, then one line per metric.
        print(rule)
        print(f"Aesthetics Analysis for {audio_file}")
        print(rule)
        for metric, score in scores.items():
            print(f"{metric:<20}: {score:.2f} / 5.0")
        print(rule)