Upload folder using huggingface_hub
Browse files- README.md +80 -0
- config.json +49 -0
- model.safetensors +3 -0
- preprocessor_config.json +11 -0
README.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language: en
|
| 4 |
+
tags:
|
| 5 |
+
- audio-classification
|
| 6 |
+
- carnatic-music
|
| 7 |
+
- raga-classification
|
| 8 |
+
- indian-classical-music
|
| 9 |
+
datasets:
|
| 10 |
+
- sarayusapa/carnatic-ragas
|
| 11 |
+
metrics:
|
| 12 |
+
- accuracy
|
| 13 |
+
- f1
|
| 14 |
+
pipeline_tag: audio-classification
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
# SAM-Audio: Carnatic Raga Classifier
|
| 18 |
+
|
| 19 |
+
A CNN + Segment Attention model for classifying Carnatic ragas from audio.
|
| 20 |
+
|
| 21 |
+
## Model Details
|
| 22 |
+
|
| 23 |
+
- **Architecture**: SAM-Audio (CNN mel-spectrogram encoder + latent segmentation tokens + masked segment prediction + contrastive learning)
|
| 24 |
+
- **Parameters**: 2.6M
|
| 25 |
+
- **Training data**: [sarayusapa/carnatic-ragas](https://huggingface.co/datasets/sarayusapa/carnatic-ragas) with 3x pitch-shift augmentation
|
| 26 |
+
- **Best validation accuracy**: 99.62%
|
| 27 |
+
- **Best epoch**: 17
|
| 28 |
+
|
| 29 |
+
## Supported Ragas
|
| 30 |
+
|
| 31 |
+
| ID | Raga |
|
| 32 |
+
|----|------|
|
| 33 |
+
| 0 | Amritavarshini |
|
| 34 |
+
| 1 | Hamsanaadam |
|
| 35 |
+
| 2 | Kalyani |
|
| 36 |
+
| 3 | Kharaharapriya |
|
| 37 |
+
| 4 | Mayamalavagoulai |
|
| 38 |
+
| 5 | Sindhubhairavi |
|
| 39 |
+
| 6 | Todi |
|
| 40 |
+
| 7 | Varali |
|
| 41 |
+
|
| 42 |
+
## Usage
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
import json
import torch
|
| 46 |
+
import librosa
|
| 47 |
+
from safetensors.torch import load_file
|
| 48 |
+
|
| 49 |
+
# Load model
|
| 50 |
+
from train import SAMAudioModel
|
| 51 |
+
|
| 52 |
+
with open("config.json") as f:
    config = json.load(f)
|
| 53 |
+
model = SAMAudioModel(
|
| 54 |
+
encoder_config=config["encoder"],
|
| 55 |
+
num_classes=config["num_classes"],
|
| 56 |
+
num_segments=config["num_segments"],
|
| 57 |
+
)
|
| 58 |
+
state_dict = load_file("model.safetensors")
|
| 59 |
+
model.load_state_dict(state_dict)
|
| 60 |
+
model.eval()
|
| 61 |
+
|
| 62 |
+
# Load audio
|
| 63 |
+
y, sr = librosa.load("audio.mp3", sr=16000, mono=True)
|
| 64 |
+
waveform = torch.from_numpy(y[:320000]).float().unsqueeze(0)
|
| 65 |
+
|
| 66 |
+
# Predict
|
| 67 |
+
with torch.no_grad():
|
| 68 |
+
outputs = model(input_audio=waveform)
|
| 69 |
+
probs = torch.softmax(outputs["raga_logits"], dim=-1)
|
| 70 |
+
pred = probs.argmax(dim=-1).item()
|
| 71 |
+
print(f"Predicted: {config['id2label'][str(pred)]} ({probs[0][pred]:.1%})")
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
## Training
|
| 75 |
+
|
| 76 |
+
- 3x pitch-shift augmentation (original + random up [1-4 semitones] + random down [1-4 semitones])
|
| 77 |
+
- Tanpura reference pitch shifts with audio, forcing the model to learn relative intervals
|
| 78 |
+
- BFloat16 mixed precision on RTX 4090
|
| 79 |
+
- Cosine annealing LR with warmup
|
| 80 |
+
- Early stopping with patience=5
|
config.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "sam-audio",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"SAMAudioModel"
|
| 5 |
+
],
|
| 6 |
+
"num_classes": 8,
|
| 7 |
+
"id2label": {
|
| 8 |
+
"0": "Amritavarshini",
|
| 9 |
+
"1": "Hamsanaadam",
|
| 10 |
+
"2": "Kalyani",
|
| 11 |
+
"3": "Kharaharapriya",
|
| 12 |
+
"4": "Mayamalavagoulai",
|
| 13 |
+
"5": "Sindhubhairavi",
|
| 14 |
+
"6": "Todi",
|
| 15 |
+
"7": "Varali"
|
| 16 |
+
},
|
| 17 |
+
"label2id": {
|
| 18 |
+
"Amritavarshini": 0,
|
| 19 |
+
"Hamsanaadam": 1,
|
| 20 |
+
"Kalyani": 2,
|
| 21 |
+
"Kharaharapriya": 3,
|
| 22 |
+
"Mayamalavagoulai": 4,
|
| 23 |
+
"Sindhubhairavi": 5,
|
| 24 |
+
"Todi": 6,
|
| 25 |
+
"Varali": 7
|
| 26 |
+
},
|
| 27 |
+
"encoder": {
|
| 28 |
+
"input_dim": 1,
|
| 29 |
+
"hidden_dims": [
|
| 30 |
+
64,
|
| 31 |
+
128,
|
| 32 |
+
256,
|
| 33 |
+
512
|
| 34 |
+
],
|
| 35 |
+
"kernel_size": 3,
|
| 36 |
+
"stride": 2,
|
| 37 |
+
"dropout_rate": 0.25,
|
| 38 |
+
"use_layer_norm": true,
|
| 39 |
+
"use_mel": true,
|
| 40 |
+
"n_mels": 80,
|
| 41 |
+
"sample_rate": 16000
|
| 42 |
+
},
|
| 43 |
+
"num_segments": 64,
|
| 44 |
+
"mask_ratio": 0.0,
|
| 45 |
+
"contrastive_temperature": 0.07,
|
| 46 |
+
"hidden_size": 512,
|
| 47 |
+
"best_val_accuracy": 0.9961977186311787,
|
| 48 |
+
"best_epoch": 17
|
| 49 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3b3928ed89212e8f272945fffa9d77e8977f7d9d667ab06f145693bdacc05af
|
| 3 |
+
size 10597768
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"processor_type": "AudioPreprocessor",
|
| 3 |
+
"sample_rate": 16000,
|
| 4 |
+
"max_length": 320000,
|
| 5 |
+
"chunk_duration_s": 20,
|
| 6 |
+
"feature_extractor_type": "MelSpectrogram",
|
| 7 |
+
"n_fft": 1024,
|
| 8 |
+
"hop_length": 256,
|
| 9 |
+
"n_mels": 80,
|
| 10 |
+
"normalize": true
|
| 11 |
+
}
|