Commit ·
0bbc70a
0
Parent(s):
Duplicate from oriyonay/musicnn-pytorch
Browse filesCo-authored-by: ori yonay <oriyonay@users.noreply.huggingface.co>
- .gitattributes +35 -0
- README.md +83 -0
- config.json +75 -0
- configuration_musicnn.py +18 -0
- inference.py +20 -0
- model.safetensors +3 -0
- modeling_musicnn.py +313 -0
- musicnn.py +406 -0
- musicnn_torch.py +255 -0
- weights/MSD_musicnn.pt +3 -0
- weights/MSD_musicnn_big.pt +3 -0
- weights/MTT_musicnn.pt +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- audio
|
| 5 |
+
- music
|
| 6 |
+
- music-tagging
|
| 7 |
+
- pytorch
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# MusicNN-PyTorch
|
| 11 |
+
|
| 12 |
+
This is a PyTorch reimplementation of the [MusicNN](https://github.com/jordipons/musicnn) library for music audio tagging.
|
| 13 |
+
|
| 14 |
+
It contains the model architecture and converted weights from the original TensorFlow 1.x checkpoints.
|
| 15 |
+
|
| 16 |
+
## Supported Models
|
| 17 |
+
|
| 18 |
+
- `MTT_musicnn`: Trained on MagnaTagATune (50 tags) - **Default model**
|
| 19 |
+
- `MSD_musicnn`: Trained on Million Song Dataset (50 tags)
|
| 20 |
+
- `MSD_musicnn_big`: Larger version trained on MSD (512 filters)
|
| 21 |
+
|
| 22 |
+
## Super Simple Usage (Hugging Face Transformers)
|
| 23 |
+
|
| 24 |
+
```python
|
| 25 |
+
from transformers import AutoModel
|
| 26 |
+
|
| 27 |
+
# Load the model (downloads automatically)
|
| 28 |
+
model = AutoModel.from_pretrained("oriyonay/musicnn-pytorch", trust_remote_code=True)
|
| 29 |
+
|
| 30 |
+
# Use the model
|
| 31 |
+
tags = model.predict_tags("your_audio.mp3", top_k=5)
|
| 32 |
+
print(f"Top 5 tags: {tags}")
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## Embeddings (Optional)
|
| 36 |
+
|
| 37 |
+
```python
|
| 38 |
+
from transformers import AutoModel
|
| 39 |
+
|
| 40 |
+
model = AutoModel.from_pretrained("oriyonay/musicnn-pytorch", trust_remote_code=True)
|
| 41 |
+
|
| 42 |
+
# Extract embeddings from any layer
|
| 43 |
+
emb = model.extract_embeddings("your_audio.mp3", layer="penultimate", pool="mean")
|
| 44 |
+
print(emb.shape)
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
## Colab Example
|
| 48 |
+
|
| 49 |
+
```python
|
| 50 |
+
# Install dependencies
|
| 51 |
+
!pip install transformers torch librosa soundfile
|
| 52 |
+
|
| 53 |
+
# Load with AutoModel
|
| 54 |
+
from transformers import AutoModel
|
| 55 |
+
model = AutoModel.from_pretrained("oriyonay/musicnn-pytorch", trust_remote_code=True)
|
| 56 |
+
|
| 57 |
+
# Use the model
|
| 58 |
+
tags = model.predict_tags("your_audio.mp3", top_k=5)
|
| 59 |
+
print(tags)
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
## Traditional Usage
|
| 63 |
+
|
| 64 |
+
If you prefer to download the code manually:
|
| 65 |
+
|
| 66 |
+
```python
|
| 67 |
+
from musicnn_torch import top_tags
|
| 68 |
+
|
| 69 |
+
# Get top 5 tags for an audio file
|
| 70 |
+
tags = top_tags('path/to/audio.mp3', model='MTT_musicnn', topN=5)
|
| 71 |
+
print(tags)
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
## Installation
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
pip install transformers torch librosa soundfile
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
## Credits
|
| 81 |
+
|
| 82 |
+
Original implementation by [Jordi Pons](https://github.com/jordipons).
|
| 83 |
+
PyTorch port by Gemini.
|
config.json
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_classes": 50,
|
| 3 |
+
"mid_filt": 64,
|
| 4 |
+
"backend_units": 200,
|
| 5 |
+
"dataset": "MTT",
|
| 6 |
+
"return_dict": true,
|
| 7 |
+
"output_hidden_states": false,
|
| 8 |
+
"output_attentions": false,
|
| 9 |
+
"torchscript": false,
|
| 10 |
+
"torch_dtype": "float32",
|
| 11 |
+
"use_bfloat16": false,
|
| 12 |
+
"tf_legacy_loss": false,
|
| 13 |
+
"pruned_heads": {},
|
| 14 |
+
"tie_word_embeddings": true,
|
| 15 |
+
"chunk_size_feed_forward": 0,
|
| 16 |
+
"is_encoder_decoder": false,
|
| 17 |
+
"is_decoder": false,
|
| 18 |
+
"cross_attention_hidden_size": null,
|
| 19 |
+
"add_cross_attention": false,
|
| 20 |
+
"tie_encoder_decoder": false,
|
| 21 |
+
"max_length": 20,
|
| 22 |
+
"min_length": 0,
|
| 23 |
+
"do_sample": false,
|
| 24 |
+
"early_stopping": false,
|
| 25 |
+
"num_beams": 1,
|
| 26 |
+
"num_beam_groups": 1,
|
| 27 |
+
"diversity_penalty": 0.0,
|
| 28 |
+
"temperature": 1.0,
|
| 29 |
+
"top_k": 50,
|
| 30 |
+
"top_p": 1.0,
|
| 31 |
+
"typical_p": 1.0,
|
| 32 |
+
"repetition_penalty": 1.0,
|
| 33 |
+
"length_penalty": 1.0,
|
| 34 |
+
"no_repeat_ngram_size": 0,
|
| 35 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 36 |
+
"bad_words_ids": null,
|
| 37 |
+
"num_return_sequences": 1,
|
| 38 |
+
"output_scores": false,
|
| 39 |
+
"return_dict_in_generate": false,
|
| 40 |
+
"forced_bos_token_id": null,
|
| 41 |
+
"forced_eos_token_id": null,
|
| 42 |
+
"remove_invalid_values": false,
|
| 43 |
+
"exponential_decay_length_penalty": null,
|
| 44 |
+
"suppress_tokens": null,
|
| 45 |
+
"begin_suppress_tokens": null,
|
| 46 |
+
"architectures": [
|
| 47 |
+
"MusicNN"
|
| 48 |
+
],
|
| 49 |
+
"finetuning_task": null,
|
| 50 |
+
"id2label": {
|
| 51 |
+
"0": "LABEL_0",
|
| 52 |
+
"1": "LABEL_1"
|
| 53 |
+
},
|
| 54 |
+
"label2id": {
|
| 55 |
+
"LABEL_0": 0,
|
| 56 |
+
"LABEL_1": 1
|
| 57 |
+
},
|
| 58 |
+
"tokenizer_class": null,
|
| 59 |
+
"prefix": null,
|
| 60 |
+
"bos_token_id": null,
|
| 61 |
+
"pad_token_id": null,
|
| 62 |
+
"eos_token_id": null,
|
| 63 |
+
"sep_token_id": null,
|
| 64 |
+
"decoder_start_token_id": null,
|
| 65 |
+
"task_specific_params": null,
|
| 66 |
+
"problem_type": null,
|
| 67 |
+
"_name_or_path": "oriyonay/musicnn-pytorch",
|
| 68 |
+
"_attn_implementation_autoset": false,
|
| 69 |
+
"transformers_version": "4.48.0",
|
| 70 |
+
"model_type": "musicnn",
|
| 71 |
+
"auto_map": {
|
| 72 |
+
"AutoConfig": "musicnn.MusicNNConfig",
|
| 73 |
+
"AutoModel": "musicnn.MusicNN"
|
| 74 |
+
}
|
| 75 |
+
}
|
configuration_musicnn.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from transformers import PretrainedConfig


class MusicNNConfig(PretrainedConfig):
    """Hugging Face configuration for the MusicNN audio-tagging model.

    Attributes:
        num_classes: Number of output tags produced by the classifier head.
        mid_filt: Number of convolutional filters in the mid-end stack.
        backend_units: Hidden width of the dense backend classifier.
        dataset: Training-dataset identifier ('MTT' or 'MSD'); selects the
            tag vocabulary used at prediction time.
    """

    model_type = 'musicnn'

    def __init__(
        self,
        num_classes=50,
        mid_filt=64,
        backend_units=200,
        dataset='MTT',
        **kwargs
    ):
        # Record model hyper-parameters first, then hand the remaining
        # generic options to the transformers base class.
        self.num_classes = num_classes
        self.mid_filt = mid_filt
        self.backend_units = backend_units
        self.dataset = dataset
        super().__init__(**kwargs)
inference.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

from musicnn_torch import top_tags

# Demo script: print the top-5 MTT_musicnn tags for a few local audio files.
# Paths are absolute and machine-specific; missing files are reported and skipped.
AUDIO_FILES = [
    '/Users/oriyonay/Desktop/CRAZY BEAT.mp3',
    '/Users/oriyonay/Desktop/burn the stage/bounces/02 the type of girl.mp3',
    '/Users/oriyonay/Desktop/burn the stage/extras/jazzy red roses.mp3'
]

for f in AUDIO_FILES:
    if not os.path.exists(f):
        print(f"\nWarning: File not found at {f}")
        continue
    print(f"\n--- Predicting top tags for {os.path.basename(f)} ---")
    try:
        tags = top_tags(f, model='MTT_musicnn', topN=5)
        print(f"Top 5 tags: {tags}")
    except Exception as e:
        print(f"Error processing {f}: {e}")
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc0b9400fcaed6e9ce7fbcfa97ec91e4fcb5f2ab34ca3a0cd6bef4af74753e1a
|
| 3 |
+
size 3175212
|
modeling_musicnn.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import numpy as np
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
import librosa
|
| 7 |
+
from transformers import PreTrainedModel, PretrainedConfig
|
| 8 |
+
|
| 9 |
+
class MusicNNConfig(PretrainedConfig):
    """Configuration object for MusicNN (see also configuration_musicnn.py).

    Holds the hyper-parameters that shape the network: number of output
    tags, mid-end filter count, backend hidden width, and the dataset name
    ('MTT' or 'MSD') that picks the tag vocabulary.
    """

    model_type = 'musicnn'

    def __init__(self, num_classes=50, mid_filt=64, backend_units=200,
                 dataset='MTT', **kwargs):
        # Model-specific fields go on self; everything else is delegated
        # to PretrainedConfig.
        self.num_classes = num_classes
        self.mid_filt = mid_filt
        self.backend_units = backend_units
        self.dataset = dataset
        super().__init__(**kwargs)
| 25 |
+
|
| 26 |
+
# -------------------------
|
| 27 |
+
# Building blocks
|
| 28 |
+
# -------------------------
|
| 29 |
+
class ConvReLUBN(nn.Module):
    """2-D convolution followed by ReLU and batch normalisation.

    Note: the activation is applied *before* batch norm in this port;
    eps/momentum values are kept small to match the converted weights.
    """

    def __init__(self, in_ch, out_ch, kernel_size, padding=0):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, padding=padding)
        self.bn = nn.BatchNorm2d(out_ch, eps=0.001, momentum=0.01)

    def forward(self, x):
        out = self.conv(x)
        out = F.relu(out)
        return self.bn(out)
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class TimbralBlock(nn.Module):
    """Frontend block with a tall (7 x mel_bins) filter for timbral features.

    Consumes a [B, 1, T, M] spectrogram and returns [B, out_ch, T] after
    max-pooling whatever remains of the frequency axis.
    """

    def __init__(self, mel_bins, out_ch):
        super().__init__()
        self.conv_block = ConvReLUBN(1, out_ch, kernel_size=(7, mel_bins), padding=0)

    def forward(self, x):
        # Zero-pad 3 frames on each side of the time axis so the 7-tap
        # temporal extent of the kernel preserves sequence length.
        padded = F.pad(x, (0, 0, 3, 3))
        feats = self.conv_block(padded)
        # Collapse the frequency axis with a max.
        return torch.max(feats, dim=3).values
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class TemporalBlock(nn.Module):
    """Frontend block with a (kernel_size x 1) filter for temporal patterns.

    'same' padding keeps the time dimension unchanged; the frequency axis
    is collapsed with a max, yielding [B, out_ch, T].
    """

    def __init__(self, kernel_size, out_ch):
        super().__init__()
        self.conv_block = ConvReLUBN(1, out_ch, kernel_size=(kernel_size, 1), padding='same')

    def forward(self, x):
        feats = self.conv_block(x)
        return torch.max(feats, dim=3).values
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class MidEnd(nn.Module):
    """Mid-end: three stacked temporal conv layers with residual connections.

    Takes frontend features shaped [B, C, T] and returns a list of four
    tensors shaped [B, T, *]: the input itself plus each layer's output,
    ready for concatenation in the backend.
    """

    def __init__(self, in_ch, num_filt):
        super().__init__()
        self.c1_conv = nn.Conv2d(1, num_filt, kernel_size=(7, in_ch), padding=0)
        self.c1_bn = nn.BatchNorm2d(num_filt, eps=0.001, momentum=0.01)
        self.c2_conv = nn.Conv2d(1, num_filt, kernel_size=(7, num_filt), padding=0)
        self.c2_bn = nn.BatchNorm2d(num_filt, eps=0.001, momentum=0.01)
        self.c3_conv = nn.Conv2d(1, num_filt, kernel_size=(7, num_filt), padding=0)
        self.c3_bn = nn.BatchNorm2d(num_filt, eps=0.001, momentum=0.01)

    def _conv_step(self, feats, conv, bn):
        """Pad 3 frames on each side of time, convolve across the full
        feature width, and return the result in [B, T, F, 1] layout."""
        padded = F.pad(feats.permute(0, 2, 3, 1), (3, 3, 0, 0))
        out = bn(F.relu(conv(padded.permute(0, 2, 3, 1))))
        return out.permute(0, 2, 1, 3)

    def forward(self, x):
        # [B, C, T] -> [B, T, C, 1]
        feats = x.transpose(1, 2).unsqueeze(3)

        layer1 = self._conv_step(feats, self.c1_conv, self.c1_bn)
        # Residual connections: each subsequent layer adds its input back in.
        layer2 = self._conv_step(layer1, self.c2_conv, self.c2_bn) + layer1
        layer3 = self._conv_step(layer2, self.c3_conv, self.c3_bn) + layer2

        # Drop the trailing singleton axis from every tap.
        return [t.squeeze(3) for t in (feats, layer1, layer2, layer3)]
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class Backend(nn.Module):
    """Dense classifier head over temporally pooled mid-end features."""

    def __init__(self, in_ch, num_classes, hidden):
        super().__init__()
        # Input is max-pool and mean-pool interleaved, hence the 2x width.
        self.bn_in = nn.BatchNorm1d(in_ch * 2, eps=0.001, momentum=0.01)
        self.fc1 = nn.Linear(in_ch * 2, hidden)
        self.bn_fc1 = nn.BatchNorm1d(hidden, eps=0.001, momentum=0.01)
        self.fc2 = nn.Linear(hidden, num_classes)

    def forward(self, x):
        # Pool across time (dim 1); x is [B, T, C].
        max_pool = torch.max(x, dim=1).values
        mean_pool = torch.mean(x, dim=1)
        # Interleave (max, mean) per feature, then flatten to [B, 2C].
        pooled = torch.stack([max_pool, mean_pool], dim=2)
        pooled = pooled.view(pooled.size(0), -1)

        h = F.dropout(self.bn_in(pooled), p=0.5, training=self.training)
        h = self.bn_fc1(F.relu(self.fc1(h)))
        h = F.dropout(h, p=0.5, training=self.training)

        return self.fc2(h), mean_pool, max_pool
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class MusicNNModel(PreTrainedModel):
    """MusicNN audio auto-tagging model (PyTorch port of jordipons/musicnn).

    Pipeline: log-mel spectrogram -> frontend (timbral + temporal conv
    blocks) -> mid-end (dense temporal convs) -> backend (pooled dense
    classifier producing one logit per tag).
    """

    config_class = MusicNNConfig

    # Tag vocabularies, index-aligned with the 50 classifier outputs.
    _MTT_LABELS = [
        'guitar', 'classical', 'slow', 'techno', 'strings', 'drums', 'electronic', 'rock',
        'fast', 'piano', 'ambient', 'beat', 'violin', 'vocal', 'synth', 'female', 'indian',
        'opera', 'male', 'singing', 'vocals', 'no vocals', 'harpsichord', 'loud', 'quiet',
        'flute', 'woman', 'male vocal', 'no vocal', 'pop', 'soft', 'sitar', 'solo', 'man',
        'classic', 'choir', 'voice', 'new age', 'dance', 'male voice', 'female vocal',
        'beats', 'harp', 'cello', 'no voice', 'weird', 'country', 'metal', 'female voice',
        'choral'
    ]
    _MSD_LABELS = [
        'rock', 'pop', 'alternative', 'indie', 'electronic', 'female vocalists', 'dance',
        '00s', 'alternative rock', 'jazz', 'beautiful', 'metal', 'chillout', 'male vocalists',
        'classic rock', 'soul', 'indie rock', 'Mellow', 'electronica', '80s', 'folk', '90s',
        'chill', 'instrumental', 'punk', 'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
        'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s', 'party', 'country',
        'easy listening', 'sexy', 'catchy', 'funk', 'electro', 'heavy metal',
        'Progressive rock', '60s', 'rnb', 'indie pop', 'sad', 'House', 'happy'
    ]

    def __init__(self, config):
        super().__init__(config)
        self.bn_input = nn.BatchNorm2d(1, eps=0.001, momentum=0.01)
        # Frontend filter shapes follow the musicnn design: two timbral
        # blocks spanning 40%/70% of the 96 mel bins, three temporal blocks
        # with 128/64/32-frame kernels, all widened by a 1.6x factor.
        self.timbral_1 = TimbralBlock(int(0.4 * 96), int(1.6 * 128))
        self.timbral_2 = TimbralBlock(int(0.7 * 96), int(1.6 * 128))
        self.temp_1 = TemporalBlock(128, int(1.6 * 32))
        self.temp_2 = TemporalBlock(64, int(1.6 * 32))
        self.temp_3 = TemporalBlock(32, int(1.6 * 32))
        # 561 = 2 * int(1.6*128) timbral + 3 * int(1.6*32) temporal channels.
        self.midend = MidEnd(in_ch=561, num_filt=config.mid_filt)
        self.backend = Backend(in_ch=config.mid_filt * 3 + 561, num_classes=config.num_classes, hidden=config.backend_units)

    def forward(self, x):
        """Run the full tagging network.

        Args:
            x: Batch of log-mel spectrograms shaped [B, T, 96].

        Returns:
            Tuple (logits, mean_pool, max_pool) from the backend classifier.
        """
        x = x.unsqueeze(1)  # -> [B, 1, T, 96] for the 2-D convolutions
        x = self.bn_input(x)
        # Frontend: each block yields [B, ch, T]; transpose to [B, T, ch].
        f74 = self.timbral_1(x).transpose(1, 2)
        f77 = self.timbral_2(x).transpose(1, 2)
        s1 = self.temp_1(x).transpose(1, 2)
        s2 = self.temp_2(x).transpose(1, 2)
        s3 = self.temp_3(x).transpose(1, 2)
        frontend_features = torch.cat([f74, f77, s1, s2, s3], dim=2)
        # Mid-end consumes [B, C, T] and returns a list of [B, T, *] taps.
        mid_feats = self.midend(frontend_features.transpose(1, 2))
        z = torch.cat(mid_feats, dim=2)
        logits, mean_pool, max_pool = self.backend(z)
        return logits, mean_pool, max_pool

    @staticmethod
    def preprocess_audio(audio_file, sr=16000):
        """Load an audio file and return its log-scaled mel spectrogram.

        Tries librosa first, falling back to soundfile (better for some
        MP3s), downmixes to mono, and resamples to `sr` if needed.

        Returns:
            np.ndarray of shape [T, 96], float32, log10-compressed.

        Raises:
            ValueError: if the file cannot be loaded or decodes to no samples.
        """
        try:
            audio, file_sr = librosa.load(audio_file, sr=None)
            if len(audio) == 0:
                raise ValueError("Empty audio from librosa")
        except Exception:
            try:
                audio, file_sr = sf.read(audio_file)
                # Convert to mono if stereo
                if len(audio.shape) > 1:
                    audio = np.mean(audio, axis=1)
            except Exception as e:
                raise ValueError(f'Could not load audio file {audio_file}: {e}')

        # Resample to target sample rate if necessary
        if file_sr != sr:
            audio = librosa.resample(audio, orig_sr=file_sr, target_sr=sr)

        if len(audio) == 0:
            raise ValueError(f'Audio file {audio_file} is empty or could not be loaded.')

        # 96-bin mel spectrogram, log-compressed as in the original musicnn.
        audio_rep = librosa.feature.melspectrogram(
            y=audio, sr=sr, hop_length=256, n_fft=512, n_mels=96
        ).T
        audio_rep = audio_rep.astype(np.float32)
        audio_rep = np.log10(10000 * audio_rep + 1)

        return audio_rep

    def predict_tags(self, audio_file, top_k=5):
        """Return the `top_k` most probable tags for `audio_file`.

        The spectrogram is split into consecutive, non-overlapping
        3-second windows; per-window sigmoid probabilities are averaged
        before ranking. Preprocessing is shared with `preprocess_audio`,
        so the librosa-first / soundfile-fallback loading applies here too
        (the previous inline version only used soundfile).
        """
        audio_rep = self.preprocess_audio(audio_file)

        # 187 = librosa.time_to_frames(3, sr=16000, n_fft=512, hop_length=256) + 1
        n_frames = 187
        hop = n_frames  # consecutive windows, no overlap

        last_start = audio_rep.shape[0] - n_frames + 1
        if last_start <= 0:
            # Audio shorter than one window: zero-pad to a single patch.
            patch = np.zeros((n_frames, 96), dtype=np.float32)
            patch[:audio_rep.shape[0], :] = audio_rep
            patches = [patch]
        else:
            patches = [
                audio_rep[start : start + n_frames, :]
                for start in range(0, last_start, hop)
            ]

        batch_tensor = torch.from_numpy(np.stack(patches))

        self.eval()
        all_probs = []
        with torch.no_grad():
            # One window per forward pass keeps memory bounded for long files.
            for i in range(len(patches)):
                logits, _, _ = self(batch_tensor[i : i + 1])
                all_probs.append(torch.sigmoid(logits).squeeze(0).numpy())

        # Average probabilities across all windows
        avg_probs = np.mean(all_probs, axis=0)

        if self.config.dataset == 'MTT':
            labels = self._MTT_LABELS
        elif self.config.dataset == 'MSD':
            labels = self._MSD_LABELS
        else:
            raise ValueError(f"Unknown dataset: {self.config.dataset}")

        # Indices of the top-k probabilities, highest first.
        top_indices = np.argsort(avg_probs)[-top_k:][::-1]
        return [labels[i] for i in top_indices]
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def create_musicnn_model(model_type='MTT_musicnn'):
    """
    Factory function to create MusicNN models with different configurations.

    Args:
        model_type (str): One of 'MTT_musicnn', 'MSD_musicnn', or 'MSD_musicnn_big'

    Returns:
        MusicNNModel: Configured model instance
    """
    from transformers import AutoConfig

    # Per-variant hyper-parameters.
    configs = {
        'MTT_musicnn': {
            'num_classes': 50, 'mid_filt': 64, 'backend_units': 200, 'dataset': 'MTT',
        },
        'MSD_musicnn': {
            'num_classes': 50, 'mid_filt': 64, 'backend_units': 200, 'dataset': 'MSD',
        },
        'MSD_musicnn_big': {
            'num_classes': 50, 'mid_filt': 512, 'backend_units': 500, 'dataset': 'MSD',
        },
    }

    if model_type not in configs:
        raise ValueError(f"Unknown model type: {model_type}. Choose from: {list(configs.keys())}")

    # Load the shared hub config, then overwrite the fields that differ per
    # variant (separate per-variant config files could replace this later).
    config = AutoConfig.from_pretrained("oriyonay/musicnn-pytorch", trust_remote_code=True)
    for key, value in configs[model_type].items():
        setattr(config, key, value)

    return MusicNNModel(config)
musicnn.py
ADDED
|
@@ -0,0 +1,406 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import numpy as np
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
import librosa
|
| 7 |
+
from transformers import PretrainedConfig, PreTrainedModel
|
| 8 |
+
from huggingface_hub import PyTorchModelHubMixin
|
| 9 |
+
|
| 10 |
+
# Suppress warnings
|
| 11 |
+
import warnings
|
| 12 |
+
warnings.filterwarnings('ignore')
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class MusicNNConfig(PretrainedConfig):
    """MusicNN configuration (standalone copy used by musicnn.py).

    num_classes: size of the tag output layer.
    mid_filt: filter count in the mid-end convolutions.
    backend_units: hidden width of the dense backend.
    dataset: 'MTT' or 'MSD'; determines the tag vocabulary.
    """

    model_type = 'musicnn'

    def __init__(self, num_classes=50, mid_filt=64,
                 backend_units=200, dataset='MTT', **kwargs):
        # Keep the model-specific fields on the instance; forward the rest
        # to the transformers base config.
        self.num_classes = num_classes
        self.mid_filt = mid_filt
        self.backend_units = backend_units
        self.dataset = dataset
        super().__init__(**kwargs)
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# -------------------------
|
| 34 |
+
# Building blocks
|
| 35 |
+
# -------------------------
|
| 36 |
+
class ConvReLUBN(nn.Module):
    """Conv2d -> ReLU -> BatchNorm2d (activation deliberately before BN)."""

    def __init__(self, in_ch, out_ch, kernel_size, padding=0):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, padding=padding)
        # Small eps/momentum to match the converted checkpoint statistics.
        self.bn = nn.BatchNorm2d(out_ch, eps=0.001, momentum=0.01)

    def forward(self, x):
        activated = F.relu(self.conv(x))
        return self.bn(activated)
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class TimbralBlock(nn.Module):
    """Frontend timbral filter: (7 x mel_bins) conv over a [B, 1, T, M] input.

    The time axis is padded by 3 on each side so the 7-tap kernel keeps T
    unchanged; the frequency axis is then max-pooled away.
    """

    def __init__(self, mel_bins, out_ch):
        super().__init__()
        self.conv_block = ConvReLUBN(1, out_ch, kernel_size=(7, mel_bins), padding=0)

    def forward(self, x):
        out = self.conv_block(F.pad(x, (0, 0, 3, 3)))
        return torch.max(out, dim=3).values
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class TemporalBlock(nn.Module):
    """Frontend temporal filter: (kernel_size x 1) conv with 'same' padding,
    followed by a max over the frequency axis -> [B, out_ch, T]."""

    def __init__(self, kernel_size, out_ch):
        super().__init__()
        self.conv_block = ConvReLUBN(1, out_ch, kernel_size=(kernel_size, 1), padding='same')

    def forward(self, x):
        out = self.conv_block(x)
        return torch.max(out, dim=3).values
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class MidEnd(nn.Module):
    """Three temporal conv layers with residual connections.

    Input: frontend features [B, C, T]. Output: a list of four [B, T, *]
    tensors — the input plus each layer's (residual-summed) output.
    """

    def __init__(self, in_ch, num_filt):
        super().__init__()
        self.c1_conv = nn.Conv2d(1, num_filt, kernel_size=(7, in_ch), padding=0)
        self.c1_bn = nn.BatchNorm2d(num_filt, eps=0.001, momentum=0.01)
        self.c2_conv = nn.Conv2d(1, num_filt, kernel_size=(7, num_filt), padding=0)
        self.c2_bn = nn.BatchNorm2d(num_filt, eps=0.001, momentum=0.01)
        self.c3_conv = nn.Conv2d(1, num_filt, kernel_size=(7, num_filt), padding=0)
        self.c3_bn = nn.BatchNorm2d(num_filt, eps=0.001, momentum=0.01)

    def forward(self, x):
        # [B, C, T] -> [B, T, C, 1]
        inp = x.transpose(1, 2).unsqueeze(3)

        # Layer 1: pad time by 3 on both sides, convolve over all channels.
        h = F.pad(inp.permute(0, 2, 3, 1), (3, 3, 0, 0)).permute(0, 2, 3, 1)
        h = self.c1_bn(F.relu(self.c1_conv(h)))
        out1 = h.permute(0, 2, 1, 3)

        # Layer 2, plus residual from layer 1.
        h = F.pad(out1.permute(0, 2, 3, 1), (3, 3, 0, 0)).permute(0, 2, 3, 1)
        h = self.c2_bn(F.relu(self.c2_conv(h)))
        out2 = h.permute(0, 2, 1, 3) + out1

        # Layer 3, plus residual from layer 2's sum.
        h = F.pad(out2.permute(0, 2, 3, 1), (3, 3, 0, 0)).permute(0, 2, 3, 1)
        h = self.c3_bn(F.relu(self.c3_conv(h)))
        out3 = h.permute(0, 2, 1, 3) + out2

        # Drop the trailing singleton axis from each tap.
        return [inp.squeeze(3), out1.squeeze(3), out2.squeeze(3), out3.squeeze(3)]
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class Backend(nn.Module):
    """Pooling + MLP classifier head.

    Pools the mid-end features over time with both max and mean, then runs a
    BN/dropout-regularized two-layer MLP to produce tag logits. Also returns
    the pooled vectors so callers can use them as embeddings.
    """

    def __init__(self, in_ch, num_classes, hidden):
        super().__init__()
        # Input width is 2 * in_ch: max- and mean-pooled features are
        # interleaved per channel before the first linear layer.
        self.bn_in = nn.BatchNorm1d(in_ch * 2, eps=0.001, momentum=0.01)
        self.fc1 = nn.Linear(in_ch * 2, hidden)
        self.bn_fc1 = nn.BatchNorm1d(hidden, eps=0.001, momentum=0.01)
        self.fc2 = nn.Linear(hidden, num_classes)

    def forward(self, x):
        max_pool = x.max(dim=1).values
        mean_pool = x.mean(dim=1)

        # (B, C, 2) -> (B, 2C): per-channel [max, mean] pairs, flattened.
        features = torch.stack([max_pool, mean_pool], dim=2)
        features = features.reshape(features.size(0), -1)

        features = self.bn_in(features)
        features = F.dropout(features, p=0.5, training=self.training)
        features = self.bn_fc1(F.relu(self.fc1(features)))
        features = F.dropout(features, p=0.5, training=self.training)

        logits = self.fc2(features)
        return logits, mean_pool, max_pool
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class MusicNN(PreTrainedModel, PyTorchModelHubMixin):
    """musicnn music auto-tagger with Hugging Face Hub integration.

    Pipeline: input batch-norm -> five parallel front-end conv branches
    (two timbral, three temporal) -> residual mid-end -> pooling back-end
    that emits per-tag logits plus mean-/max-pooled embeddings.
    """

    config_class = MusicNNConfig

    # Mel frames per 3-second analysis window:
    # librosa.time_to_frames(3, sr=16000, n_fft=512, hop_length=256) + 1
    N_FRAMES = 187

    # Tag vocabularies, indexed by output position of the classifier.
    MTT_LABELS = [
        'guitar', 'classical', 'slow', 'techno', 'strings', 'drums', 'electronic', 'rock',
        'fast', 'piano', 'ambient', 'beat', 'violin', 'vocal', 'synth', 'female', 'indian',
        'opera', 'male', 'singing', 'vocals', 'no vocals', 'harpsichord', 'loud', 'quiet',
        'flute', 'woman', 'male vocal', 'no vocal', 'pop', 'soft', 'sitar', 'solo', 'man',
        'classic', 'choir', 'voice', 'new age', 'dance', 'male voice', 'female vocal',
        'beats', 'harp', 'cello', 'no voice', 'weird', 'country', 'metal', 'female voice',
        'choral'
    ]
    MSD_LABELS = [
        'rock', 'pop', 'alternative', 'indie', 'electronic', 'female vocalists', 'dance',
        '00s', 'alternative rock', 'jazz', 'beautiful', 'metal', 'chillout', 'male vocalists',
        'classic rock', 'soul', 'indie rock', 'Mellow', 'electronica', '80s', 'folk', '90s',
        'chill', 'instrumental', 'punk', 'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
        'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s', 'party', 'country',
        'easy listening', 'sexy', 'catchy', 'funk', 'electro', 'heavy metal',
        'Progressive rock', '60s', 'rnb', 'indie pop', 'sad', 'House', 'happy'
    ]

    def __init__(self, config):
        super().__init__(config)
        self.bn_input = nn.BatchNorm2d(1, eps=0.001, momentum=0.01)
        # Timbral branches span 40% / 70% of the 96 mel bins.
        self.timbral_1 = TimbralBlock(int(0.4 * 96), int(1.6 * 128))
        self.timbral_2 = TimbralBlock(int(0.7 * 96), int(1.6 * 128))
        # Temporal branches with decreasing kernel lengths.
        self.temp_1 = TemporalBlock(128, int(1.6 * 32))
        self.temp_2 = TemporalBlock(64, int(1.6 * 32))
        self.temp_3 = TemporalBlock(32, int(1.6 * 32))
        # 561 = total channel count produced by the five front-end branches.
        self.midend = MidEnd(in_ch=561, num_filt=config.mid_filt)
        self.backend = Backend(
            in_ch=config.mid_filt * 3 + 561,
            num_classes=config.num_classes,
            hidden=config.backend_units,
        )

    def forward(self, x):
        """x: (batch, time_frames, 96) log-mel patches.
        Returns (logits, mean_pool, max_pool)."""
        x = x.unsqueeze(1)
        x = self.bn_input(x)
        f74 = self.timbral_1(x).transpose(1, 2)
        f77 = self.timbral_2(x).transpose(1, 2)
        s1 = self.temp_1(x).transpose(1, 2)
        s2 = self.temp_2(x).transpose(1, 2)
        s3 = self.temp_3(x).transpose(1, 2)
        frontend_features = torch.cat([f74, f77, s1, s2, s3], dim=2)
        mid_feats = self.midend(frontend_features.transpose(1, 2))
        z = torch.cat(mid_feats, dim=2)
        logits, mean_pool, max_pool = self.backend(z)
        return logits, mean_pool, max_pool

    @staticmethod
    def _load_audio(audio_file, sr=16000):
        """Load an audio file as a mono waveform resampled to `sr`.

        Tries librosa first, falling back to soundfile (more reliable for
        some MP3s). Raises ValueError if nothing usable could be read.
        """
        try:
            audio, file_sr = librosa.load(audio_file, sr=None)
            if len(audio) == 0:
                raise ValueError("Empty audio from librosa")
        except Exception:
            try:
                audio, file_sr = sf.read(audio_file)
                # Down-mix multi-channel audio to mono.
                if len(audio.shape) > 1:
                    audio = np.mean(audio, axis=1)
            except Exception as e:
                raise ValueError(f'Could not load audio file {audio_file}: {e}')

        # Resample to target sample rate if necessary.
        if file_sr != sr:
            audio = librosa.resample(audio, orig_sr=file_sr, target_sr=sr)

        if len(audio) == 0:
            raise ValueError(f'Audio file {audio_file} is empty or could not be loaded.')
        return audio

    @staticmethod
    def preprocess_audio(audio_file, sr=16000):
        """Return the log-mel spectrogram of `audio_file`, shape (frames, 96)."""
        audio = MusicNN._load_audio(audio_file, sr=sr)
        audio_rep = librosa.feature.melspectrogram(
            y=audio, sr=sr, hop_length=256, n_fft=512, n_mels=96
        ).T
        audio_rep = audio_rep.astype(np.float32)
        # Log compression, same as the original musicnn preprocessing.
        return np.log10(10000 * audio_rep + 1)

    @staticmethod
    def _batch_patches(audio_rep, n_frames=187):
        """Slice a (frames, mels) spectrogram into non-overlapping windows of
        `n_frames`, zero-padding when the clip is shorter than one window.
        Returns an array of shape (num_windows, n_frames, mels)."""
        last_frame = audio_rep.shape[0] - n_frames + 1
        if last_frame <= 0:
            patch = np.zeros((n_frames, audio_rep.shape[1]), dtype=np.float32)
            patch[:audio_rep.shape[0], :] = audio_rep
            return np.stack([patch])
        return np.stack([
            audio_rep[t : t + n_frames, :]
            for t in range(0, last_frame, n_frames)
        ])

    def _window_outputs(self, audio_file):
        """Run the model over every 3-second window of `audio_file`.

        Returns a list of (logits, mean_pool, max_pool) tuples, one per
        window, computed in eval mode on the best available device.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)
        audio_rep = self.preprocess_audio(audio_file)
        batch = torch.from_numpy(self._batch_patches(audio_rep, self.N_FRAMES)).to(device)
        self.eval()
        outputs = []
        with torch.no_grad():
            # One window at a time keeps peak memory low for long tracks.
            for i in range(batch.size(0)):
                outputs.append(self(batch[i:i + 1]))
        return outputs

    def _labels(self):
        """Tag vocabulary for the configured training dataset."""
        if self.config.dataset == 'MTT':
            return self.MTT_LABELS
        if self.config.dataset == 'MSD':
            return self.MSD_LABELS
        raise ValueError(f"Unknown dataset: {self.config.dataset}")

    def predict_tags(self, audio_file, top_k=5):
        """Return the `top_k` tag names, ranked by sigmoid probability
        averaged over all analysis windows."""
        all_probs = [
            torch.sigmoid(logits).squeeze(0).cpu().numpy()
            for logits, _, _ in self._window_outputs(audio_file)
        ]
        avg_probs = np.mean(all_probs, axis=0)
        labels = self._labels()
        top_indices = np.argsort(avg_probs)[-top_k:][::-1]
        return [labels[i] for i in top_indices]

    def extract_embeddings(self, audio_file, layer=None, pool='mean'):
        """
        Extract embeddings from audio file.
        Args:
            audio_file: path to audio file
            layer: which layer to extract from (ignored for simplicity, uses final embeddings)
            pool: pooling method ('mean', 'max', or 'both')
        Returns:
            embeddings as numpy array, averaged over all analysis windows
        """
        all_embeddings = []
        for _, mean_pool, max_pool in self._window_outputs(audio_file):
            if pool == 'max':
                emb = max_pool.squeeze(0).cpu().numpy()
            elif pool == 'both':
                emb = torch.cat([mean_pool, max_pool], dim=1).squeeze(0).cpu().numpy()
            else:
                # 'mean' and any unrecognized value fall back to mean pooling,
                # matching the original behavior.
                emb = mean_pool.squeeze(0).cpu().numpy()
            all_embeddings.append(emb)
        return np.mean(all_embeddings, axis=0)
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
# For uploading to Hugging Face Hub
if __name__ == '__main__':
    import json
    import os
    import shutil

    from huggingface_hub import HfApi

    # Create the model with MTT config
    config = MusicNNConfig(
        num_classes=50,
        mid_filt=64,
        backend_units=200,
        dataset='MTT'
    )

    model = MusicNN(config)

    # Load the weights on CPU so the script also runs on machines without
    # the device the checkpoint was saved from.
    state_dict = torch.load('weights/MTT_musicnn.pt', map_location='cpu')
    model.load_state_dict(state_dict)

    # Save locally, bundling the modeling code alongside the weights
    save_dir = 'musicnn-pytorch'
    os.makedirs(save_dir, exist_ok=True)

    model.save_pretrained(save_dir)
    shutil.copy('musicnn.py', save_dir)

    # Create config.json with the custom-code auto_map so AutoModel works
    config_dict = config.to_dict()
    config_dict.update({
        '_name_or_path': 'oriyonay/musicnn-pytorch',
        'architectures': ['MusicNN'],
        'auto_map': {
            'AutoConfig': 'musicnn.MusicNNConfig',
            'AutoModel': 'musicnn.MusicNN'
        },
        'model_type': 'musicnn'
    })

    with open(os.path.join(save_dir, 'config.json'), 'w') as f:
        json.dump(config_dict, f, indent=4)

    # Push to Hugging Face
    api = HfApi()
    api.upload_folder(
        folder_path=save_dir,
        repo_id='oriyonay/musicnn-pytorch',
        repo_type='model'
    )

    print("✅ Model uploaded to Hugging Face!")
    print("Usage: model = MusicNN.from_pretrained('oriyonay/musicnn-pytorch')")
|
musicnn_torch.py
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
import librosa
|
| 7 |
+
import soundfile as sf
|
| 8 |
+
import warnings
|
| 9 |
+
|
| 10 |
+
# Suppress the PyTorch padding warning and other user warnings
|
| 11 |
+
warnings.filterwarnings('ignore', category=UserWarning)
|
| 12 |
+
|
| 13 |
+
# hyperparams
SR = 16000      # target sample rate (Hz) used for all preprocessing
N_MELS = 96     # number of mel bands in the input spectrogram
FFT_HOP = 256   # hop length (samples) between STFT frames
FFT_SIZE = 512  # STFT window / FFT size (samples)

# Tag vocabulary for the 'MTT' model checkpoints; index i corresponds to
# output position i of the classifier.
MTT_LABELS = [
    'guitar', 'classical', 'slow', 'techno', 'strings', 'drums', 'electronic', 'rock',
    'fast', 'piano', 'ambient', 'beat', 'violin', 'vocal', 'synth', 'female', 'indian',
    'opera', 'male', 'singing', 'vocals', 'no vocals', 'harpsichord', 'loud', 'quiet',
    'flute', 'woman', 'male vocal', 'no vocal', 'pop', 'soft', 'sitar', 'solo', 'man',
    'classic', 'choir', 'voice', 'new age', 'dance', 'male voice', 'female vocal',
    'beats', 'harp', 'cello', 'no voice', 'weird', 'country', 'metal', 'female voice',
    'choral'
]

# Tag vocabulary for the 'MSD' model checkpoints, same index convention.
MSD_LABELS = [
    'rock', 'pop', 'alternative', 'indie', 'electronic', 'female vocalists', 'dance',
    '00s', 'alternative rock', 'jazz', 'beautiful', 'metal', 'chillout', 'male vocalists',
    'classic rock', 'soul', 'indie rock', 'Mellow', 'electronica', '80s', 'folk', '90s',
    'chill', 'instrumental', 'punk', 'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
    'experimental', 'female vocalist', 'guitar', 'Hip-Hop', '70s', 'party', 'country',
    'easy listening', 'sexy', 'catchy', 'funk', 'electro', 'heavy metal',
    'Progressive rock', '60s', 'rnb', 'indie pop', 'sad', 'House', 'happy'
]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# -------------------------
|
| 41 |
+
# Building blocks
|
| 42 |
+
# -------------------------
|
| 43 |
+
class ConvReLUBN(nn.Module):
    """Conv2d -> ReLU -> BatchNorm2d building block (note: the batch-norm
    is applied after the activation)."""

    def __init__(self, in_ch, out_ch, kernel_size, padding=0):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, padding=padding)
        self.bn = nn.BatchNorm2d(out_ch, eps=0.001, momentum=0.01)

    def forward(self, x):
        activated = F.relu(self.conv(x))
        return self.bn(activated)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class TimbralBlock(nn.Module):
    """Timbral front-end branch: a (7 x mel_bins) conv spanning `mel_bins`
    frequency bins, with the time axis zero-padded so its length is kept,
    then max-pooled over the remaining frequency axis."""

    def __init__(self, mel_bins, out_ch):
        super().__init__()
        self.conv_block = ConvReLUBN(1, out_ch, kernel_size=(7, mel_bins), padding=0)

    def forward(self, x):
        # Pad 3 frames before and after along the time axis ('same' padding
        # for a height-7 kernel).
        padded = F.pad(x, (0, 0, 3, 3))
        features = self.conv_block(padded)
        pooled, _ = features.max(dim=3)
        return pooled
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class TemporalBlock(nn.Module):
    """Temporal front-end branch: a (kernel_size x 1) conv over the time
    axis (with 'same' padding), max-pooled over the last axis."""

    def __init__(self, kernel_size, out_ch):
        super().__init__()
        self.conv_block = ConvReLUBN(1, out_ch, kernel_size=(kernel_size, 1), padding='same')

    def forward(self, x):
        out = self.conv_block(x)
        return out.max(dim=3).values
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class MidEnd(nn.Module):
    # Mid-end: three temporal conv layers (kernel height 7) with residual
    # connections after layers 2 and 3. Returns the input plus the output of
    # every layer so the back-end can pool multi-level features.
    def __init__(self, in_ch, num_filt):
        # in_ch: feature dimension of the incoming front-end features.
        # num_filt: number of filters in each of the three conv layers.
        super().__init__()
        self.c1_conv = nn.Conv2d(1, num_filt, kernel_size=(7, in_ch), padding=0)
        self.c1_bn = nn.BatchNorm2d(num_filt, eps=0.001, momentum=0.01)
        self.c2_conv = nn.Conv2d(1, num_filt, kernel_size=(7, num_filt), padding=0)
        self.c2_bn = nn.BatchNorm2d(num_filt, eps=0.001, momentum=0.01)
        self.c3_conv = nn.Conv2d(1, num_filt, kernel_size=(7, num_filt), padding=0)
        self.c3_bn = nn.BatchNorm2d(num_filt, eps=0.001, momentum=0.01)

    def forward(self, x):
        # x: (batch, in_ch, time) -> (batch, time, in_ch, 1)
        x = x.transpose(1, 2).unsqueeze(3)

        # Each permute/pad/permute triple zero-pads the time axis by 3 on both
        # sides (so the height-7 conv preserves the time length) and presents
        # the tensor to Conv2d as (batch, 1, time + 6, features).
        x_perm = x.permute(0, 2, 3, 1)
        x1_pad = F.pad(x_perm, (3, 3, 0, 0))
        x1 = x1_pad.permute(0, 2, 3, 1)
        x1 = self.c1_bn(F.relu(self.c1_conv(x1)))
        # Back to (batch, time, num_filt, 1) for the residual additions below.
        x1_t = x1.permute(0, 2, 1, 3)

        x2_perm = x1_t.permute(0, 2, 3, 1)
        x2_pad = F.pad(x2_perm, (3, 3, 0, 0))
        x2 = x2_pad.permute(0, 2, 3, 1)
        x2 = self.c2_bn(F.relu(self.c2_conv(x2)))
        x2_t = x2.permute(0, 2, 1, 3)
        res_conv2 = x2_t + x1_t  # first residual connection

        x3_perm = res_conv2.permute(0, 2, 3, 1)
        x3_pad = F.pad(x3_perm, (3, 3, 0, 0))
        x3 = x3_pad.permute(0, 2, 3, 1)
        x3 = self.c3_bn(F.relu(self.c3_conv(x3)))
        x3_t = x3.permute(0, 2, 1, 3)
        res_conv3 = x3_t + res_conv2  # second residual connection

        # Four feature maps, each squeezed to (batch, time, features): the raw
        # input and the (residual) output of every conv layer.
        return [x.squeeze(3), x1_t.squeeze(3), res_conv2.squeeze(3), res_conv3.squeeze(3)]
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class Backend(nn.Module):
    """Classification back-end: temporal max+mean pooling followed by a
    two-layer MLP with batch-norm and dropout."""

    def __init__(self, in_ch, num_classes, hidden):
        super().__init__()
        self.bn_in = nn.BatchNorm1d(in_ch * 2, eps=0.001, momentum=0.01)
        self.fc1 = nn.Linear(in_ch * 2, hidden)
        self.bn_fc1 = nn.BatchNorm1d(hidden, eps=0.001, momentum=0.01)
        self.fc2 = nn.Linear(hidden, num_classes)

    def forward(self, x):
        # Pool every feature channel over the time axis (dim 1).
        max_pool, _ = x.max(dim=1)
        mean_pool = x.mean(dim=1)
        # Interleave (max, mean) per channel, matching the 2*C bn_in width.
        pooled = torch.stack([max_pool, mean_pool], dim=2).flatten(1)

        out = F.dropout(self.bn_in(pooled), p=0.5, training=self.training)
        out = self.bn_fc1(F.relu(self.fc1(out)))
        out = F.dropout(out, p=0.5, training=self.training)

        return self.fc2(out), mean_pool, max_pool
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# -------------------------
|
| 134 |
+
# MusicNN
|
| 135 |
+
# -------------------------
|
| 136 |
+
class MusicNN(nn.Module):
    """Full musicnn network: timbral + temporal front-end branches, a
    residual mid-end, and a pooled MLP back-end.

    Input: (batch, time_frames, N_MELS) log-mel patches.
    Output: (logits, mean_pool, max_pool).
    """

    def __init__(self, num_classes, mid_filt=64, backend_units=200):
        super().__init__()
        self.bn_input = nn.BatchNorm2d(1, eps=0.001, momentum=0.01)
        # Two timbral branches spanning 40% / 70% of the mel bins.
        self.timbral_1 = TimbralBlock(int(0.4 * N_MELS), int(1.6 * 128))
        self.timbral_2 = TimbralBlock(int(0.7 * N_MELS), int(1.6 * 128))
        # Three temporal branches with decreasing kernel lengths.
        self.temp_1 = TemporalBlock(128, int(1.6 * 32))
        self.temp_2 = TemporalBlock(64, int(1.6 * 32))
        self.temp_3 = TemporalBlock(32, int(1.6 * 32))
        # 561 = total channel count produced by the five front-end branches.
        self.midend = MidEnd(in_ch=561, num_filt=mid_filt)
        self.backend = Backend(in_ch=mid_filt * 3 + 561, num_classes=num_classes, hidden=backend_units)

    def forward(self, x):
        # (B, T, mels) -> (B, 1, T, mels), normalized.
        spec = self.bn_input(x.unsqueeze(1))
        # Five parallel front-end branches, each transposed to (B, T, channels).
        branches = [
            self.timbral_1(spec),
            self.timbral_2(spec),
            self.temp_1(spec),
            self.temp_2(spec),
            self.temp_3(spec),
        ]
        frontend = torch.cat([b.transpose(1, 2) for b in branches], dim=2)
        mid_feats = self.midend(frontend.transpose(1, 2))
        # Concatenate the mid-end's multi-level features and classify.
        return self.backend(torch.cat(mid_feats, dim=2))
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# inference utils
|
| 164 |
+
def batch_data(audio_file, n_frames, overlap):
    """Load an audio file, compute its log-mel spectrogram, and slice it into
    windows of `n_frames` spaced `overlap` frames apart.

    Returns (batch, spectrogram): batch has shape (num_windows, n_frames,
    N_MELS); spectrogram is the full (frames, N_MELS) representation.
    Raises ValueError when the file yields no audio.
    """
    # soundfile handles some MP3s more reliably than librosa locally.
    audio, sr = sf.read(audio_file)

    # Down-mix multi-channel audio to mono.
    if len(audio.shape) > 1:
        audio = np.mean(audio, axis=1)

    # Resample to the model's sample rate when needed.
    if sr != SR:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=SR)

    if len(audio) == 0:
        raise ValueError(f'Audio file {audio_file} is empty or could not be loaded.')

    mel = librosa.feature.melspectrogram(
        y=audio, sr=SR, hop_length=FFT_HOP, n_fft=FFT_SIZE, n_mels=N_MELS
    ).T
    # Log compression on the float32 spectrogram.
    audio_rep = np.log10(10000 * mel.astype(np.float32) + 1)

    last_frame = audio_rep.shape[0] - n_frames + 1
    if last_frame <= 0:
        # Clip shorter than one window: zero-pad a single patch.
        patch = np.zeros((n_frames, N_MELS), dtype=np.float32)
        patch[:audio_rep.shape[0], :] = audio_rep
        batches = [patch]
    else:
        batches = [
            audio_rep[start : start + n_frames, :]
            for start in range(0, last_frame, overlap)
        ]

    return np.stack(batches), audio_rep
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def extractor(file_name, model='MTT_musicnn', input_length=3, input_overlap=False, device=None):
    """Compute per-window tag probabilities for an audio file.

    Returns (probs, labels): probs is a (num_windows, num_classes) array of
    sigmoid activations, labels the tag vocabulary of the chosen model.
    """
    # Auto-detect device if not specified.
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Select the label set and architecture hyperparameters from the name.
    if 'MTT' in model:
        labels = MTT_LABELS
        config = {'num_classes': 50, 'mid_filt': 64, 'backend_units': 200}
    elif 'MSD' in model:
        labels = MSD_LABELS
        if 'big' in model:
            config = {'num_classes': 50, 'mid_filt': 512, 'backend_units': 500}
        else:
            config = {'num_classes': 50, 'mid_filt': 64, 'backend_units': 200}
    else:
        raise ValueError('Model not supported')

    net = MusicNN(**config)

    # Look for the checkpoint next to the script first, then under weights/.
    weight_path = f'{model}.pt'
    if not os.path.exists(weight_path):
        weight_path = os.path.join('weights', f'{model}.pt')

    if os.path.exists(weight_path):
        net.load_state_dict(torch.load(weight_path, map_location=device))
    else:
        # Proceed with random weights so callers can still smoke-test.
        print(f'Warning: Weights not found at {weight_path}')

    net.to(device)
    net.eval()

    # Window length in mel frames; windows tile the clip without overlap
    # unless input_overlap (seconds) is given.
    n_frames = librosa.time_to_frames(input_length, sr=SR, n_fft=FFT_SIZE, hop_length=FFT_HOP) + 1
    if not input_overlap:
        overlap = n_frames
    else:
        overlap = librosa.time_to_frames(input_overlap, sr=SR, n_fft=FFT_SIZE, hop_length=FFT_HOP)

    batch, _ = batch_data(file_name, n_frames, overlap)
    with torch.no_grad():
        logits, _, _ = net(torch.from_numpy(batch).to(device))
        probs = torch.sigmoid(logits).cpu().numpy()

    return probs, labels
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def top_tags(file_name, model='MTT_musicnn', topN=3, device=None):
    """Return the topN tag names for an audio file, ranked by probability
    averaged over all analysis windows."""
    # Auto-detect device if not specified.
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    probs, labels = extractor(file_name, model=model, device=device)
    ranking = np.mean(probs, axis=0).argsort()[::-1]
    return [labels[i] for i in ranking[:topN]]
|
weights/MSD_musicnn.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6db4c22908da50888d6a259d41980988d3b9cecc5f96fd725ede09166996dd00
|
| 3 |
+
size 3191473
|
weights/MSD_musicnn_big.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8312eddea265984e0315ecbc87a88b6fe2ab6c341a692741390880a4d1f9abe
|
| 3 |
+
size 31998829
|
weights/MTT_musicnn.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:32cb8bc12786302edc7dde58be340082c06559d979bec06615d1035fa2474f8d
|
| 3 |
+
size 3191473
|