Upload folder using huggingface_hub
- .gitattributes +3 -33
- README.md +209 -0
- audio_preprocessor.py +191 -0
- best_model.pth +3 -0
- config.json +39 -0
- modeling_vit_emotion.py +135 -0
- requirements.txt +5 -0
.gitattributes
CHANGED
@@ -1,35 +1,5 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,209 @@
---
language: en
license: mit
tags:
- audio
- emotion-recognition
- valence-arousal
- vision-transformer
- pytorch
- music-emotion-recognition
datasets:
- custom
metrics:
- mse
- mae
pipeline_tag: audio-classification
---

# ViT for Audio Emotion Recognition (Valence-Arousal)

This model is a fine-tuned Vision Transformer (ViT) for audio emotion recognition, predicting valence and arousal values in the continuous range of -1 to 1.

## Model Description

- **Base Model**: google/vit-base-patch16-224-in21k
- **Task**: Audio emotion recognition (regression)
- **Output**: Valence and Arousal predictions (2D continuous emotion space)
- **Range**: [-1, 1] for both dimensions
- **Input**: Mel spectrogram images (224x224 RGB)

## Architecture

```
ViT Base (86M parameters)
        ↓
CLS Token Output (768-dim)
        ↓
LayerNorm + Dropout
        ↓
Linear (768 → 512) + GELU + Dropout
        ↓
Linear (512 → 128) + GELU + Dropout
        ↓
Linear (128 → 2) + Tanh
        ↓
[Valence, Arousal] ∈ [-1, 1]²
```

## Usage

### Prerequisites

```bash
pip install torch torchvision transformers librosa numpy pillow
```

### Loading the Model

```python
import torch
from transformers import ViTModel
import torch.nn as nn

class ViTForEmotionRegression(nn.Module):
    def __init__(self, model_name='google/vit-base-patch16-224-in21k', num_emotions=2, dropout=0.1):
        super().__init__()
        self.vit = ViTModel.from_pretrained(model_name)
        hidden_size = self.vit.config.hidden_size

        self.head = nn.Sequential(
            nn.LayerNorm(hidden_size),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 512),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_emotions),
            nn.Tanh()
        )

    def forward(self, pixel_values):
        outputs = self.vit(pixel_values)
        cls_output = outputs.last_hidden_state[:, 0]
        return self.head(cls_output)

# Load the model
model = ViTForEmotionRegression()
model.load_state_dict(torch.load('best_model.pth', map_location='cpu'))
model.eval()
```

### Audio Preprocessing

```python
import librosa
import numpy as np
from PIL import Image
import torch
from torchvision import transforms

def preprocess_audio(audio_path):
    # Load audio
    y, sr = librosa.load(audio_path, sr=22050, duration=30)

    # Generate mel spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=128, hop_length=512, n_fft=2048
    )
    mel_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Normalize to 0-255 for RGB conversion
    mel_normalized = ((mel_db - mel_db.min()) / (mel_db.max() - mel_db.min()) * 255).astype(np.uint8)

    # Convert to RGB image
    image = Image.fromarray(mel_normalized).convert('RGB')
    image = image.resize((224, 224))

    # Apply ImageNet normalization
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    return transform(image).unsqueeze(0)

# Process audio
audio_tensor = preprocess_audio('your_audio.mp3')

# Predict emotions
with torch.no_grad():
    predictions = model(audio_tensor)
    valence, arousal = predictions[0].tolist()

print(f"Valence: {valence:.3f}, Arousal: {arousal:.3f}")
```

### Emotion Quadrant Mapping

```python
def classify_emotion(valence, arousal):
    if valence >= 0 and arousal >= 0:
        return "HAPPY" if valence > arousal else "EXCITED"
    elif valence >= 0 and arousal < 0:
        return "CALM" if abs(arousal) > valence else "CONTENT"
    elif valence < 0 and arousal < 0:
        return "SAD" if abs(valence) > abs(arousal) else "BORED"
    else:  # valence < 0 and arousal >= 0
        return "TENSE" if arousal > abs(valence) else "ANGRY"
```

## Model Details

- **Parameters**: ~86.8M
- **Model Size**: ~331 MB
- **Framework**: PyTorch
- **Base Architecture**: ViT-Base (12 layers, 768 hidden, 12 heads)
- **Custom Head**: 3-layer MLP with GELU activations
- **Training Data**: Custom audio emotion dataset
- **Training**: Fine-tuned with MSE loss on valence-arousal targets
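
The parameter count and checkpoint size listed above can be sanity-checked directly from a loaded model; a minimal sketch, assuming `model` is the `ViTForEmotionRegression` instance created in the Usage section:

```python
# Rough sanity check of the figures above (illustrative, not part of the repo).
num_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {num_params / 1e6:.1f}M")  # expected around 86-87M

# A float32 state_dict costs ~4 bytes per parameter on disk.
print(f"Approx. checkpoint size: {num_params * 4 / 1024**2:.0f} MB")
```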

## Emotion Space

The model predicts emotions in the 2D circumplex model:

```
          High Arousal
               |
   Angry     Tense     Excited
               |
Sad -------- + -------- Happy
               |
   Bored     Calm      Content
               |
          Low Arousal
```

- **Valence**: Negative (unpleasant) ↔ Positive (pleasant)
- **Arousal**: Low (calm) ↔ High (energetic)

## Performance

The model outputs continuous predictions that can be:
- Used directly for emotion intensity analysis
- Mapped to discrete emotion categories
- Visualized on emotion quadrant plots
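
As a quick illustration of the quadrant-plot option above, here is a minimal plotting sketch; it assumes matplotlib (not listed in the requirements) and the `classify_emotion` helper defined earlier, and the helper name and example values are purely illustrative:

```python
import matplotlib.pyplot as plt

def plot_valence_arousal(valence, arousal, label=None):
    """Scatter a single prediction on the valence-arousal plane."""
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.axhline(0, color="gray", linewidth=0.8)
    ax.axvline(0, color="gray", linewidth=0.8)
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)
    ax.set_xlabel("Valence")
    ax.set_ylabel("Arousal")
    ax.scatter([valence], [arousal], color="crimson")
    if label:
        ax.annotate(label, (valence, arousal), textcoords="offset points", xytext=(5, 5))
    plt.show()

# Example with made-up prediction values
plot_valence_arousal(0.42, 0.17, label=classify_emotion(0.42, 0.17))
```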

## Limitations

- Trained on music/audio; performance may vary on speech
- Requires mel spectrogram preprocessing
- Fixed 30-second audio duration (only the first 30 s of longer files is used)
- Possible cultural bias depending on the training data

## Citation

```bibtex
@misc{sentio-vit-emotion,
  title={Vision Transformer for Audio Emotion Recognition},
  author={SentioApp Team},
  year={2025},
  publisher={HuggingFace}
}
```

## License

MIT License
audio_preprocessor.py
ADDED
@@ -0,0 +1,191 @@
"""
Audio Preprocessing for Model Inference

This module handles loading audio files and converting them to spectrograms
for input to the emotion prediction models.
"""

import librosa
import numpy as np
import torch
from PIL import Image


class AudioPreprocessor:
    """
    Preprocessor for converting audio files to model-ready spectrograms.
    """

    def __init__(self,
                 sample_rate=22050,
                 duration=30,
                 n_mels=128,
                 hop_length=512,
                 n_fft=2048,
                 fmin=20,
                 fmax=8000,
                 image_size=224):
        """
        Initialize audio preprocessor.

        Args:
            sample_rate: Audio sampling rate (Hz)
            duration: Audio clip duration (seconds)
            n_mels: Number of mel-frequency bins
            hop_length: Hop length for STFT
            n_fft: FFT window size
            fmin: Minimum frequency
            fmax: Maximum frequency
            image_size: Target image size for model input (224 for ViT)
        """
        self.sample_rate = sample_rate
        self.duration = duration
        self.n_mels = n_mels
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.fmin = fmin
        self.fmax = fmax
        self.image_size = image_size

        # ImageNet normalization (used by ViT)
        self.imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.imagenet_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def load_audio(self, audio_path):
        """
        Load audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            audio: Audio waveform
            sr: Sample rate
        """
        try:
            audio, sr = librosa.load(
                audio_path,
                sr=self.sample_rate,
                duration=self.duration,
                mono=True
            )

            # Pad or truncate to exact duration
            target_length = self.sample_rate * self.duration
            if len(audio) < target_length:
                audio = np.pad(audio, (0, target_length - len(audio)))
            else:
                audio = audio[:target_length]

            return audio, sr

        except Exception as e:
            raise RuntimeError(f"Failed to load audio from {audio_path}: {e}")

    def audio_to_melspectrogram(self, audio):
        """
        Convert audio waveform to mel spectrogram.

        Args:
            audio: Audio waveform

        Returns:
            mel_spec: Mel spectrogram in dB scale
        """
        # Compute mel spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.sample_rate,
            n_mels=self.n_mels,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            fmin=self.fmin,
            fmax=self.fmax
        )

        # Convert to dB scale
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        return mel_spec_db

    def spectrogram_to_image(self, mel_spec):
        """
        Convert mel spectrogram to RGB image tensor for ViT input.

        Args:
            mel_spec: Mel spectrogram (n_mels, time_steps)

        Returns:
            image_tensor: Tensor of shape (3, 224, 224) normalized for ViT
        """
        # Normalize to [0, 1]
        spec_min = mel_spec.min()
        spec_max = mel_spec.max()
        spec_norm = (mel_spec - spec_min) / (spec_max - spec_min + 1e-8)

        # Resize to 224x224 using PIL
        spec_pil = Image.fromarray((spec_norm * 255).astype(np.uint8))
        spec_resized = spec_pil.resize(
            (self.image_size, self.image_size),
            Image.Resampling.BILINEAR
        )

        # Convert back to numpy and normalize
        spec_array = np.array(spec_resized).astype(np.float32) / 255.0

        # Convert grayscale to RGB by replicating channels
        spec_rgb = np.stack([spec_array, spec_array, spec_array], axis=0)

        # Convert to torch tensor
        image_tensor = torch.from_numpy(spec_rgb).float()

        # Apply ImageNet normalization
        image_tensor = (image_tensor - self.imagenet_mean) / self.imagenet_std

        return image_tensor

    def preprocess(self, audio_path):
        """
        Complete preprocessing pipeline: audio file -> model-ready tensor.

        Args:
            audio_path: Path to audio file

        Returns:
            image_tensor: Tensor of shape (3, 224, 224) ready for model input
            mel_spec: Raw mel spectrogram (for visualization)
        """
        # Load audio
        audio, _ = self.load_audio(audio_path)

        # Convert to mel spectrogram
        mel_spec = self.audio_to_melspectrogram(audio)

        # Convert to image tensor
        image_tensor = self.spectrogram_to_image(mel_spec)

        return image_tensor, mel_spec

    def preprocess_batch(self, audio_paths):
        """
        Preprocess multiple audio files.

        Args:
            audio_paths: List of audio file paths

        Returns:
            batch_tensor: Tensor of shape (batch_size, 3, 224, 224)
            mel_specs: List of mel spectrograms
        """
        tensors = []
        mel_specs = []

        for audio_path in audio_paths:
            tensor, mel_spec = self.preprocess(audio_path)
            tensors.append(tensor)
            mel_specs.append(mel_spec)

        # Stack into batch
        batch_tensor = torch.stack(tensors, dim=0)

        return batch_tensor, mel_specs
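
A possible end-to-end use of this preprocessor together with the model class defined further below in modeling_vit_emotion.py; the module, class, and checkpoint names come from this repository, while the audio path and the wiring itself are illustrative:

```python
# Illustrative inference sketch; "song.mp3" is a placeholder path.
import torch
from audio_preprocessor import AudioPreprocessor
from modeling_vit_emotion import ViTForEmotionRegression

preprocessor = AudioPreprocessor()
model = ViTForEmotionRegression()
model.load_state_dict(torch.load("best_model.pth", map_location="cpu"))
model.eval()

image_tensor, mel_spec = preprocessor.preprocess("song.mp3")
with torch.no_grad():
    valence, arousal = model(image_tensor.unsqueeze(0))[0].tolist()
print(f"valence={valence:.3f}, arousal={arousal:.3f}")
```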
best_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d63bd6d99e635cb349a509259e5d20c9645135eac728f685d7204caee8890c6f
size 347485262
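
Since the checkpoint is stored via Git LFS, it can be fetched with `huggingface_hub` once the repository is published; a minimal sketch, where the repo id is a placeholder:

```python
from huggingface_hub import hf_hub_download

# Placeholder repo id; substitute the actual repository name.
checkpoint_path = hf_hub_download(
    repo_id="your-username/vit-audio-emotion",
    filename="best_model.pth",
)
print(checkpoint_path)
```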
config.json
ADDED
@@ -0,0 +1,39 @@
{
  "architectures": [
    "ViTForEmotionRegression"
  ],
  "model_type": "vit-emotion",
  "task": "audio-emotion-recognition",
  "base_model": "google/vit-base-patch16-224-in21k",
  "num_emotions": 2,
  "emotion_dimensions": ["valence", "arousal"],
  "output_range": [-1, 1],
  "input_size": [224, 224],
  "num_channels": 3,
  "patch_size": 16,
  "hidden_size": 768,
  "num_hidden_layers": 12,
  "num_attention_heads": 12,
  "intermediate_size": 3072,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "attention_probs_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "image_size": 224,
  "qkv_bias": true,
  "audio_processing": {
    "sample_rate": 22050,
    "n_mels": 128,
    "hop_length": 512,
    "n_fft": 2048,
    "mel_spectrogram_format": "RGB",
    "normalization": "imagenet"
  },
  "regression_head": {
    "architecture": "768 -> 512 -> 128 -> 2",
    "activation": "gelu",
    "dropout": 0.1,
    "output_activation": "tanh"
  }
}
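
This config.json is a custom, descriptive file rather than a `transformers`-compatible config, so it has to be read manually; one possible sketch for wiring its `audio_processing` block into the `AudioPreprocessor` from audio_preprocessor.py (the mapping of fields is illustrative):

```python
# Illustrative only: map the custom config fields onto AudioPreprocessor arguments.
import json
from audio_preprocessor import AudioPreprocessor

with open("config.json") as f:
    cfg = json.load(f)

audio_cfg = cfg["audio_processing"]
preprocessor = AudioPreprocessor(
    sample_rate=audio_cfg["sample_rate"],
    n_mels=audio_cfg["n_mels"],
    hop_length=audio_cfg["hop_length"],
    n_fft=audio_cfg["n_fft"],
    image_size=cfg["image_size"],
)
```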
modeling_vit_emotion.py
ADDED
@@ -0,0 +1,135 @@
"""
Vision Transformer (ViT) Model Definition for Emotion Regression

This file defines the ViT model architecture used for valence-arousal prediction.
"""

import torch
import torch.nn as nn
from transformers import ViTModel, ViTConfig


class ViTForEmotionRegression(nn.Module):
    """
    Vision Transformer for emotion regression (valence and arousal prediction).

    Architecture:
    - Pre-trained ViT backbone (google/vit-base-patch16-224-in21k)
    - Custom regression head for 2D emotion prediction
    - Dropout for regularization
    """

    def __init__(self, model_name='google/vit-base-patch16-224-in21k',
                 num_emotions=2, freeze_backbone=False, dropout=0.1):
        super().__init__()

        # Load pre-trained ViT model
        try:
            self.vit = ViTModel.from_pretrained(model_name)
            print(f"✅ Loaded pre-trained ViT from {model_name}")
        except Exception as e:
            print(f"⚠️ Could not load pre-trained model: {e}")
            print("   Initializing with random weights...")
            config = ViTConfig()
            self.vit = ViTModel(config)

        # Freeze backbone if specified
        if freeze_backbone:
            for param in self.vit.parameters():
                param.requires_grad = False
            print("❄️ Frozen ViT backbone")

        # Get hidden size from ViT config
        hidden_size = self.vit.config.hidden_size

        # Regression head for emotion prediction (named 'head' to match saved checkpoint)
        # Architecture: 768 -> 512 -> 128 -> 2
        self.head = nn.Sequential(
            nn.LayerNorm(hidden_size),        # [0] weight: [768], bias: [768]
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 512),      # [2] weight: [512, 768], bias: [512]
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),              # [5] weight: [128, 512], bias: [128]
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_emotions),     # [8] weight: [2, 128], bias: [2]
            nn.Tanh()                         # Output in range [-1, 1]
        )

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the model.

        Args:
            pixel_values: Input images tensor of shape (batch_size, 3, 224, 224)

        Returns:
            Emotion predictions tensor of shape (batch_size, 2) [valence, arousal]
        """
        # Get ViT outputs
        outputs = self.vit(pixel_values)
        cls_output = outputs.last_hidden_state[:, 0]

        # Predict emotions
        emotion_predictions = self.head(cls_output)
        return emotion_predictions


class MobileViTStudent(nn.Module):
    """
    Lightweight MobileViT student model for emotion regression.
    Used in distilled version for faster inference.
    """

    def __init__(self, num_emotions=2, dropout=0.1):
        super().__init__()

        # Lightweight CNN backbone
        self.conv_stem = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )

        # Mobile inverted bottleneck blocks
        self.blocks = nn.Sequential(
            self._make_mb_block(32, 64, stride=2),
            self._make_mb_block(64, 128, stride=2),
            self._make_mb_block(128, 256, stride=2),
        )

        # Global pooling
        self.global_pool = nn.AdaptiveAvgPool2d(1)

        # Regression head (named 'head' to match saved checkpoint)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, num_emotions),
            nn.Tanh()
        )

    def _make_mb_block(self, in_channels, out_channels, stride=1):
        """Create Mobile Inverted Bottleneck block"""
        return nn.Sequential(
            # Depthwise
            nn.Conv2d(in_channels, in_channels, kernel_size=3,
                      stride=stride, padding=1, groups=in_channels),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            # Pointwise
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Forward pass"""
        x = self.conv_stem(x)
        x = self.blocks(x)
        x = self.global_pool(x)
        emotions = self.head(x)
        return emotions
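
No distilled weights ship in this commit, but the student model can be smoke-tested with random input; a minimal shape check, purely illustrative:

```python
import torch
from modeling_vit_emotion import MobileViTStudent

student = MobileViTStudent()
student.eval()
with torch.no_grad():
    out = student(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 2]); values bounded to [-1, 1] by the Tanh head
```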
requirements.txt
ADDED
@@ -0,0 +1,5 @@
torch>=2.0.0
transformers>=4.30.0
librosa>=0.10.0
numpy>=1.24.0
pillow>=10.0.0