Commit c3418e9
Vietnamese Speaker Profiling with wav2vec2-base-vi-vlsp2020
Files changed:
- .gitattributes +2 -0
- README.md +13 -0
- app.py +316 -0
- configs/eval.yaml +60 -0
- configs/eval.yaml.example +165 -0
- configs/finetune.yaml +89 -0
- configs/finetune.yaml.example +186 -0
- configs/infer.yaml +40 -0
- configs/infer.yaml.example +80 -0
- configs/train_ecapa.yaml +90 -0
- model/vulehuubinh/model.safetensors +3 -0
- model/vulehuubinh/preprocessor_config.json +10 -0
- model/vulehuubinh/training_args.bin +3 -0
- requirements.txt +11 -0
- src/__init__.py +42 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/models.cpython-311.pyc +0 -0
- src/__pycache__/utils.cpython-311.pyc +0 -0
- src/models.py +648 -0
- src/utils.py +261 -0
.gitattributes
ADDED
@@ -0,0 +1,2 @@
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Vietnamese Speaker Profiling
emoji: 📈
colorFrom: indigo
colorTo: yellow
sdk: gradio
sdk_version: 6.0.2
app_file: app.py
pinned: false
license: mit
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,316 @@
"""
Gradio Web Interface for Speaker Profiling

Usage:
    python app.py
    python app.py --config configs/infer.yaml --share
"""

import os
import argparse
import tempfile
import time
import numpy as np
import torch
import librosa
import gradio as gr
from pathlib import Path

from src.models import MultiTaskSpeakerModel
from src.utils import (
    setup_logging,
    get_logger,
    load_config,
    get_device,
    load_model_checkpoint,
    preprocess_audio
)


class SpeakerProfilerApp:
    """Gradio application for speaker profiling"""

    def __init__(self, config_path: str):
        self.logger = setup_logging(name="gradio_app")
        self.config = load_config(config_path)
        self.device = get_device(self.config['inference']['device'])

        self.sampling_rate = self.config['audio']['sampling_rate']
        self.max_duration = self.config['audio']['max_duration']

        self.gender_labels = self.config['labels']['gender']
        self.dialect_labels = self.config['labels']['dialect']

        self._load_model()

    def _load_model(self):
        """Load model and feature extractor"""
        from transformers import Wav2Vec2FeatureExtractor, WhisperFeatureExtractor

        self.logger.info("Loading model...")

        model_name = self.config['model']['name']
        is_ecapa = 'ecapa' in model_name.lower() or 'speechbrain' in model_name.lower()

        # Check if this is a Whisper/PhoWhisper model
        self.is_whisper = 'whisper' in model_name.lower() or 'phowhisper' in model_name.lower()

        if is_ecapa:
            # ECAPA-TDNN: use Wav2Vec2 feature extractor for audio normalization
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
                "facebook/wav2vec2-base"
            )
        elif self.is_whisper:
            # Whisper/PhoWhisper: use WhisperFeatureExtractor
            self.feature_extractor = WhisperFeatureExtractor.from_pretrained(
                model_name
            )
        else:
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
                self.config['model']['checkpoint']
            )

        self.model = MultiTaskSpeakerModel(model_name)
        self.model = load_model_checkpoint(
            self.model,
            self.config['model']['checkpoint'],
            str(self.device)
        )

        self.model.to(self.device)
        self.model.eval()

        self.logger.info(f"Model loaded on {self.device}")

    def predict(self, audio_input):
        """
        Predict gender and dialect from audio

        Args:
            audio_input: Tuple of (sample_rate, audio_array) from Gradio

        Returns:
            Tuple of (gender_result, dialect_result, details)
        """
        if audio_input is None:
            return "No audio", "No audio", "Please upload or record audio"

        try:
            sr, audio = audio_input

            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

            audio = audio.astype(np.float32)
            if audio.max() > 1.0:
                audio = audio / 32768.0

            if sr != self.sampling_rate:
                audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sampling_rate)

            # Calculate original audio duration BEFORE preprocessing
            audio_duration = len(audio) / self.sampling_rate

            # Whisper requires 30 seconds of audio
            if self.is_whisper:
                max_duration = 30
            else:
                max_duration = self.max_duration

            audio = preprocess_audio(
                audio,
                sampling_rate=self.sampling_rate,
                max_duration=max_duration
            )

            # Whisper needs exactly 30 seconds - pad if necessary
            if self.is_whisper:
                target_len = self.sampling_rate * 30
                if len(audio) < target_len:
                    audio = np.pad(audio, (0, target_len - len(audio)))

            inputs = self.feature_extractor(
                audio,
                sampling_rate=self.sampling_rate,
                return_tensors="pt",
                padding=True
            )

            # Whisper uses 'input_features', WavLM/HuBERT/Wav2Vec2 use 'input_values'
            if self.is_whisper:
                input_values = inputs.input_features.to(self.device)
            else:
                input_values = inputs.input_values.to(self.device)

            # Measure inference time
            start_time = time.perf_counter()

            with torch.no_grad():
                outputs = self.model(input_values)
                gender_logits = outputs['gender_logits']
                dialect_logits = outputs['dialect_logits']

            # Calculate inference time
            infer_time = (time.perf_counter() - start_time) * 1000  # Convert to ms

            gender_probs = torch.softmax(gender_logits, dim=-1).cpu().numpy()[0]
            dialect_probs = torch.softmax(dialect_logits, dim=-1).cpu().numpy()[0]

            gender_pred = int(np.argmax(gender_probs))
            dialect_pred = int(np.argmax(dialect_probs))

            gender_name = self.gender_labels[gender_pred]
            dialect_name = self.dialect_labels[dialect_pred]

            gender_conf = gender_probs[gender_pred] * 100
            dialect_conf = dialect_probs[dialect_pred] * 100

            gender_result = f"{gender_name} ({gender_conf:.1f}%)"
            dialect_result = f"{dialect_name} ({dialect_conf:.1f}%)"

            details = self._format_details(gender_probs, dialect_probs, infer_time, audio_duration)

            self.logger.info(f"Prediction: Gender={gender_name}, Dialect={dialect_name} | Inference time: {infer_time:.2f}ms | Audio: {audio_duration:.2f}s")

            return gender_result, dialect_result, details

        except Exception as e:
            self.logger.error(f"Prediction error: {e}")
            return "Error", "Error", f"Error: {str(e)}"

    def _format_details(self, gender_probs: np.ndarray, dialect_probs: np.ndarray, infer_time: float = None, audio_duration: float = None) -> str:
        """Format detailed prediction results"""
        # Gender label names
        gender_names = ['Female', 'Male']
        # Dialect label names
        dialect_names = ['North', 'Central', 'South']

        lines = []
        lines.append("Gender Probabilities:")
        for i, name in enumerate(gender_names):
            lines.append(f"  {name}: {gender_probs[i]*100:.2f}%")

        lines.append("")
        lines.append("Dialect Probabilities:")
        for i, name in enumerate(dialect_names):
            lines.append(f"  {name}: {dialect_probs[i]*100:.2f}%")

        lines.append("")
        lines.append("─" * 30)

        if audio_duration is not None:
            lines.append(f"Audio Duration: {audio_duration:.2f} s")

        if infer_time is not None:
            lines.append(f"Inference Time: {infer_time:.2f} ms")

        return "\n".join(lines)

    def create_interface(self) -> gr.Blocks:
        """Create Gradio interface"""

        # Gradio < 4.0 doesn't support theme in Blocks
        with gr.Blocks(title="Vietnamese Speaker Profiling") as demo:

            gr.Markdown(
                """
                # Vietnamese Speaker Profiling

                Identify gender and dialect from Vietnamese speech audio.

                **Model:** Encoder + Attentive Pooling + LayerNorm + MultiHead Classifier

                **Supported dialects:** North, Central, South
                """
            )

            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(
                        label="Input Audio",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )

                    submit_btn = gr.Button("Analyze", variant="primary")
                    clear_btn = gr.Button("Clear")

                with gr.Column(scale=1):
                    gender_output = gr.Textbox(
                        label="Gender",
                        interactive=False
                    )
                    dialect_output = gr.Textbox(
                        label="Dialect",
                        interactive=False
                    )
                    details_output = gr.Textbox(
                        label="Details",
                        lines=8,
                        interactive=False
                    )

            gr.Markdown(
                """
                ---
                **Notes:**
                - Supported formats: WAV, MP3
                - Recommended duration: 3-10 seconds
                """
            )

            submit_btn.click(
                fn=self.predict,
                inputs=[audio_input],
                outputs=[gender_output, dialect_output, details_output]
            )

            clear_btn.click(
                fn=lambda: (None, "", "", ""),
                inputs=[],
                outputs=[audio_input, gender_output, dialect_output, details_output]
            )

        return demo


def main():
    """Main function"""
    parser = argparse.ArgumentParser(description="Speaker Profiling Web Interface")
    parser.add_argument(
        "--config",
        type=str,
        default="configs/infer.yaml",
        help="Path to config file"
    )
    parser.add_argument(
        "--share",
        action="store_true",
        help="Create public link"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Port number (default: 7860)"
    )
    parser.add_argument(
        "--server_name",
        type=str,
        default="0.0.0.0",
        help="Server name (default: 0.0.0.0)"
    )
    args = parser.parse_args()

    app = SpeakerProfilerApp(args.config)
    demo = app.create_interface()

    demo.launch(
        server_name=args.server_name,
        server_port=args.port,
        share=args.share
    )


if __name__ == "__main__":
    main()
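For reference, the profiler can also be driven without the Gradio UI by calling SpeakerProfilerApp.predict directly with the same (sample_rate, waveform) tuple Gradio would pass. The sketch below is a minimal, hedged example; the config path and sample.wav are assumptions, not files shipped in this commit.

# Minimal sketch: call the profiler outside the Gradio UI.
# Assumes configs/infer.yaml points at a valid checkpoint and that sample.wav exists locally.
import soundfile as sf

from app import SpeakerProfilerApp

app = SpeakerProfilerApp("configs/infer.yaml")
audio, sr = sf.read("sample.wav", dtype="float32")    # mono or stereo waveform in [-1, 1]
gender, dialect, details = app.predict((sr, audio))   # same tuple shape Gradio passes
print(gender)   # "<label> (<confidence>%)"
print(dialect)  # "<label> (<confidence>%)"
print(details)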
configs/eval.yaml
ADDED
@@ -0,0 +1,60 @@
# Evaluation Configuration
# Architecture: Encoder + Attentive Pooling + LayerNorm

# Model
model:
  checkpoint: "output/speaker-profiling/best_model"
  name: "microsoft/wavlm-base-plus"
  head_hidden_dim: 256

# Audio Processing
audio:
  sampling_rate: 16000
  max_duration: 5

# Evaluation
evaluation:
  batch_size: 32
  dataloader_num_workers: 2

# Data Paths (relative to repo root)
data:
  # === ViSpeech (CSV format) ===
  clean_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/clean_testset"
  noisy_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filling/vispeech_data/ViSpeech/noisy_testset"

  # === ViMD (HuggingFace format) ===
  vimd_path: "/kaggle/input/vimd-dataset"

# Output
output:
  dir: "output/evaluation"
  save_predictions: true
  save_confusion_matrix: true

# Label Mappings
labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  region_to_dialect:
    North: 0
    Central: 1
    South: 2

# Baseline Comparison (PACLIC 2024 - ResNet34)
baseline:
  gender:
    clean: 98.73
    noisy: 98.14
  dialect:
    clean: 81.47
    noisy: 74.80
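The gender map above intentionally carries both string keys (ViSpeech metadata) and integer keys (ViMD labels), so either label style resolves to the same class index. A minimal sketch of how such a mapping can be consumed is shown below; this is an assumption about usage, the actual dataset loader lives in src/ and is not part of this commit.

# Sketch: resolve raw metadata labels through the YAML mapping (assumed usage).
import yaml

with open("configs/eval.yaml") as f:
    cfg = yaml.safe_load(f)

gender_map = cfg["labels"]["gender"]    # {"Male": 0, "Female": 1, 0: 0, 1: 1}
dialect_map = cfg["labels"]["dialect"]  # {"North": 0, "Central": 1, "South": 2}

def encode_gender(raw):
    # Works for ViSpeech strings ("Male"/"Female") and ViMD ints (0/1) alike.
    return gender_map[raw]

print(encode_gender("Female"), encode_gender(1))  # -> 1 1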
configs/eval.yaml.example
ADDED
@@ -0,0 +1,165 @@
# Evaluation Configuration
# Evaluate model on test sets from raw audio
# Architecture: WavLM + Attentive Pooling + LayerNorm + Deeper Heads
# Copy this file to eval.yaml and update paths

# Model
model:
  checkpoint: "path/to/best_model"
  name: "microsoft/wavlm-base-plus"
  head_hidden_dim: 256

# Audio Processing
audio:
  sampling_rate: 16000
  max_duration: 5

# Evaluation
evaluation:
  batch_size: 32
  dataloader_num_workers: 2

# Data Paths (UPDATE THESE PATHS)
data:
  # === ViSpeech (CSV format) ===
  clean_test_meta: "path/to/metadata/clean_testset.csv"
  clean_test_audio: "path/to/clean_testset"
  noisy_test_meta: "path/to/metadata/noisy_testset.csv"
  noisy_test_audio: "path/to/noisy_testset"

  # === ViMD (HuggingFace format) ===
  vimd_path: "/path/to/vimd-dataset"

# Output
output:
  dir: "output/evaluation"
  save_predictions: true
  save_confusion_matrix: true

# Label Mappings
labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  region_to_dialect:
    North: 0
    Central: 1
    South: 2

# Baseline Comparison (PACLIC 2024 - ResNet34)
baseline:
  gender:
    clean: 98.73
    noisy: 98.14
  dialect:
    clean: 81.47
    noisy: 74.80
configs/finetune.yaml
ADDED
@@ -0,0 +1,89 @@
# Model (for classification heads only - features are pre-extracted)
model:
  name: "microsoft/wavlm-base-plus"  # Used for hidden_size reference
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  head_hidden_dim: 256

# Audio processing
audio:
  sampling_rate: 16000
  max_duration: 5  # seconds

# Training
training:
  batch_size: 32
  learning_rate: 5e-5
  num_epochs: 15
  warmup_ratio: 0.125
  weight_decay: 0.0125
  gradient_clip: 0.5
  lr_scheduler: "linear"
  fp16: true
  dataloader_num_workers: 4

# Data Augmentation
augmentation:
  enabled: true
  prob: 0.8

# Loss
loss:
  dialect_weight: 3.0

# WandB Configuration
wandb:
  enabled: true
  api_key: "f05e29c3466ec288e97041e0e3d541c4087096a6"
  project: "speaker-profiling"
  run_name: null

# Dataset paths
# source: "vispeech" (CSV format) or "vimd" (HuggingFace format)
data:
  source: "vispeech"  # Options: vispeech, vimd

  # === ViSpeech (CSV format) ===
  vispeech_root: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech"
  train_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/trainset.csv"
  train_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/trainset"
  clean_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/clean_testset"
  noisy_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/noisy_testset"
  val_split: 0.15

  # === ViMD (HuggingFace format) ===
  vimd_path: "/kaggle/input/vimd-dataset"

# Output
output:
  dir: "output/speaker-profiling"
  save_total_limit: 3
  metric_for_best_model: "dialect_acc"

# Early Stopping
early_stopping:
  patience: 3
  threshold: 0.0025

# Label Mappings
labels:
  gender:
    Male: 0
    Female: 1
    0: 0  # Support int labels (ViMD)
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  # ViMD uses 'region' column
  region_to_dialect:
    North: 0
    Central: 1
    South: 2

# Reproducibility
seed: 42
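The loss.dialect_weight: 3.0 setting implies the dialect term is up-weighted relative to the gender term in the multi-task objective. A hedged sketch of such a combined loss is shown below; the actual training script is not part of this commit and may combine the terms differently.

# Sketch of the weighted multi-task objective implied by loss.dialect_weight (assumed form).
import torch
import torch.nn.functional as F

def multitask_loss(gender_logits, dialect_logits, gender_y, dialect_y, dialect_weight=3.0):
    # Cross-entropy per task, with the dialect task weighted more heavily.
    gender_loss = F.cross_entropy(gender_logits, gender_y)
    dialect_loss = F.cross_entropy(dialect_logits, dialect_y)
    return gender_loss + dialect_weight * dialect_loss

# Example with random logits for a batch of 4 utterances:
g, d = torch.randn(4, 2), torch.randn(4, 3)
loss = multitask_loss(g, d, torch.tensor([0, 1, 1, 0]), torch.tensor([2, 0, 1, 1]))
print(loss.item())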
configs/finetune.yaml.example
ADDED
@@ -0,0 +1,186 @@
# Finetune Configuration
# Architecture: WavLM + Attentive Pooling + LayerNorm + Deeper Heads
# Full model finetuning from raw audio; a variant uses pre-extracted features from prepare_data.py
# Supports: ViSpeech (CSV) and ViMD (HuggingFace)
# Copy this file to finetune.yaml and update paths

# Model
model:
  name: "microsoft/wavlm-base-plus"  # Used for hidden_size reference
  hidden_size: 768  # WavLM base hidden dimension
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  head_hidden_dim: 256

# Audio processing
audio:
  sampling_rate: 16000
  max_duration: 5  # seconds

# Training
training:
  batch_size: 32
  learning_rate: 5e-5
  num_epochs: 15
  warmup_ratio: 0.125
  weight_decay: 0.0125
  gradient_clip: 1.0
  lr_scheduler: "linear"
  fp16: true
  dataloader_num_workers: 4

# Data Augmentation
augmentation:
  enabled: true
  prob: 0.8

# Loss
loss:
  dialect_weight: 3.0

# MLflow Configuration
mlflow:
  enabled: true
  tracking_uri: "mlruns"
  experiment_name: "speaker-profiling"
  run_name: null
  registered_model_name: null

# Dataset paths
# source: "vispeech" (CSV format) or "vimd" (HuggingFace format)
# ============================================================
# STEP 1: Update RAW DATASET PATHS to your local ViSpeech location
# STEP 2: Run prepare_data.py to extract features
# STEP 3: Features will be saved to train_dir/val_dir folders
# ============================================================
data:
  source: "vispeech"  # Options: vispeech, vimd

  # === RAW DATASET PATHS (for prepare_data.py) ===
  # Download ViSpeech: https://drive.google.com/file/d/1-BbOHf42o6eBje2WqQiiRKMtNxmZiRf9
  # Update these paths to match your local dataset location
  vispeech_root: "/path/to/ViSpeech"  # <-- UPDATE THIS

  # Training data
  train_meta: "/path/to/ViSpeech/metadata/trainset.csv"  # <-- UPDATE
  train_audio: "/path/to/ViSpeech/trainset"  # <-- UPDATE

  # Test data
  clean_test_meta: "/path/to/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/path/to/ViSpeech/clean_testset"
  noisy_test_meta: "/path/to/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/path/to/ViSpeech/noisy_testset"

  val_split: 0.15  # Validation split ratio (extracted from trainset)

  # === ViMD (HuggingFace format) ===
  vimd_path: "/path/to/vimd-dataset"

  # === EXTRACTED FEATURES PATHS (for finetune.py) ===
  # After running prepare_data.py, features will be saved here
  # These paths are relative to project root
  train_dir: "datasets/ViSpeech/train"
  val_dir: "datasets/ViSpeech/val"

# Output
output:
  dir: "output/speaker-profiling"
  save_total_limit: 3
  metric_for_best_model: "dialect_acc"

# Early Stopping
early_stopping:
  patience: 3
  threshold: 0.0025

# Label Mappings (must match prepare_data.py)
labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  region_to_dialect:
    North: 0
    Central: 1
    South: 2

# Reproducibility
seed: 42
configs/infer.yaml
ADDED
@@ -0,0 +1,40 @@
# Inference Configuration

# Model
model:
  checkpoint: "model/vulehuubinh"
  name: "nguyenvulebinh/wav2vec2-base-vi-vlsp2020"
  head_hidden_dim: 256

# Audio Processing
audio:
  sampling_rate: 16000
  max_duration: 5

# Inference
inference:
  batch_size: 1
  device: "cuda"

# Input
input:
  audio_path: null
  audio_dir: null

# Output
output:
  dir: "output/predictions"
  save_results: true
  format: "json"

# Label Mappings
# NOTE: Model was trained with Female=0, Male=1 (opposite of finetune.yaml order)
# This is because pandas .map() may have processed labels in different order
labels:
  gender:
    0: "Female"
    1: "Male"
  dialect:
    0: "North"
    1: "Central"
    2: "South"
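Because the deployed checkpoint uses the Female=0 / Male=1 ordering noted above, predicted indices should be decoded through this config's labels block rather than a hard-coded list. The sketch below illustrates that lookup; the probability arrays are placeholders, not real model output.

# Sketch: decode predicted class indices via configs/infer.yaml so the
# Female=0 / Male=1 ordering documented in the NOTE above is respected.
import numpy as np
import yaml

with open("configs/infer.yaml") as f:
    labels = yaml.safe_load(f)["labels"]

gender_probs = np.array([0.91, 0.09])         # placeholder softmax output
dialect_probs = np.array([0.10, 0.15, 0.75])  # placeholder softmax output

gender = labels["gender"][int(np.argmax(gender_probs))]     # -> "Female"
dialect = labels["dialect"][int(np.argmax(dialect_probs))]  # -> "South"
print(gender, dialect)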
configs/infer.yaml.example
ADDED
@@ -0,0 +1,80 @@
# Inference Configuration
# Predict gender and dialect from audio
# Architecture: WavLM + Attentive Pooling + LayerNorm + Deeper Heads
# Copy this file to infer.yaml and update paths

# Model
model:
  checkpoint: "path/to/best_model"
  name: "microsoft/wavlm-base-plus"
  head_hidden_dim: 256

# Audio Processing
audio:
  sampling_rate: 16000
  max_duration: 5

# Inference
inference:
  batch_size: 1
  device: "cuda"

# Input
input:
  audio_path: null
  audio_dir: null

# Output
output:
  dir: "output/predictions"
  save_results: true
  format: "json"

# Label Mappings
labels:
  gender:
    0: "Male"
    1: "Female"
  dialect:
    0: "North"
    1: "Central"
    2: "South"
configs/train_ecapa.yaml
ADDED
@@ -0,0 +1,90 @@
# Config for ECAPA-TDNN (SpeechBrain)
# Model: speechbrain/spkrec-ecapa-voxceleb

# Model
model:
  name: "speechbrain/spkrec-ecapa-voxceleb"
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  head_hidden_dim: 128  # Smaller head for 192-dim embeddings

# Audio processing
audio:
  sampling_rate: 16000
  max_duration: 5  # seconds

# Training
training:
  batch_size: 32
  learning_rate: 1e-4  # Higher LR since only training heads
  num_epochs: 15
  warmup_ratio: 0.1
  weight_decay: 0.01
  gradient_clip: 1.0
  lr_scheduler: "linear"
  fp16: false  # ECAPA-TDNN does not support fp16
  dataloader_num_workers: 4

# Data Augmentation
augmentation:
  enabled: true
  prob: 0.8

# Loss
loss:
  dialect_weight: 3.0

# WandB Configuration
wandb:
  enabled: true
  api_key: "f05e29c3466ec288e97041e0e3d541c4087096a6"
  project: "vispeech-speaker-profiling"
  run_name: "ecapa-tdnn"

# Dataset paths
data:
  source: "vispeech"  # Options: vispeech, vimd

  # === ViSpeech (CSV format) ===
  vispeech_root: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech"
  train_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/trainset.csv"
  train_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/trainset"
  clean_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/clean_testset.csv"
  clean_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/clean_testset"
  noisy_test_meta: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/home/ubuntu/DataScience/Voice_Pro_filing/vispeech_data/ViSpeech/noisy_testset"
  val_split: 0.15

  # === ViMD (HuggingFace format) ===
  vimd_path: "/kaggle/input/vimd-dataset"

# Output
output:
  dir: "output/ecapa-tdnn"
  save_total_limit: 3
  metric_for_best_model: "dialect_acc"

# Early Stopping
early_stopping:
  patience: 3
  threshold: 0.0025

# Label Mappings
labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2
  region_to_dialect:
    North: 0
    Central: 1
    South: 2

# Reproducibility
seed: 42
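This config pairs the frozen 192-dim ECAPA-TDNN embeddings with a deliberately small head (head_hidden_dim: 128) and a higher head-only learning rate. The sketch below shows the shape of such a head on a batch of pooled embeddings; it is an assumption for illustration, the actual heads live in src/models.py and may differ in depth and normalization.

# Sketch: a small classification head over 192-dim ECAPA embeddings (illustrative only).
import torch
import torch.nn as nn

embedding_dim, head_hidden_dim = 192, 128
gender_head = nn.Sequential(
    nn.LayerNorm(embedding_dim),
    nn.Dropout(0.1),
    nn.Linear(embedding_dim, head_hidden_dim),
    nn.ReLU(),
    nn.Linear(head_hidden_dim, 2),
)

emb = torch.randn(8, embedding_dim)  # a batch of pooled speaker embeddings
print(gender_head(emb).shape)        # torch.Size([8, 2])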
model/vulehuubinh/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3a5b4a3417c2d783e44b7cfd701b083b979c076fde257fc0ea80c12fab5705ad
size 381595388
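The three lines above are only the Git LFS pointer; the 381 MB weight file itself is stored as an LFS object and fetched on checkout. Once pulled, the weights can be inspected with the safetensors library already listed in requirements.txt, as in this small sketch.

# Sketch: inspect the downloaded weights (assumes the LFS object has been pulled
# so model/vulehuubinh/model.safetensors is the real weight file, not the pointer).
from safetensors.torch import load_file

state_dict = load_file("model/vulehuubinh/model.safetensors")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))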
model/vulehuubinh/preprocessor_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2ProcessorWithLM",
  "return_attention_mask": false,
  "sampling_rate": 16000
}
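This JSON is what Wav2Vec2FeatureExtractor.from_pretrained reads when app.py loads the checkpoint directory: normalization on, 16 kHz sampling, no attention mask returned. A minimal sketch of exercising it on a dummy waveform follows; the one-second silent input is just a placeholder.

# Sketch: apply the checkpoint's feature extractor to a dummy 16 kHz waveform.
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained("model/vulehuubinh")
audio = np.zeros(16000, dtype=np.float32)  # 1 second of silence at 16 kHz (placeholder)
inputs = fe(audio, sampling_rate=16000, return_tensors="pt")
print(inputs.input_values.shape)           # torch.Size([1, 16000])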
model/vulehuubinh/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a059e8720c9e406f538f14e191d903e9efad04de1f27661fc918fefecbd6bea1
size 5176
requirements.txt
ADDED
@@ -0,0 +1,11 @@
# HuggingFace Spaces requirements
torch>=2.0.0
torchaudio>=2.0.0
transformers==4.44.0
librosa>=0.10.0
soundfile>=0.12.0
numpy<2.0
safetensors>=0.4.0
gradio>=4.0.0
pyyaml>=6.0
omegaconf
src/__init__.py
ADDED
@@ -0,0 +1,42 @@
"""
Speaker Profiling Source Package
"""

from .models import (
    AttentivePooling,
    MultiTaskSpeakerModel,
    MultiTaskSpeakerModelFromConfig
)

from .utils import (
    setup_logging,
    get_logger,
    load_config,
    set_seed,
    load_audio,
    preprocess_audio,
    load_and_preprocess_audio,
    load_model_checkpoint,
    get_device,
    count_parameters,
    format_number
)

__all__ = [
    # Models
    'AttentivePooling',
    'MultiTaskSpeakerModel',
    'MultiTaskSpeakerModelFromConfig',
    # Utils
    'setup_logging',
    'get_logger',
    'load_config',
    'set_seed',
    'load_audio',
    'preprocess_audio',
    'load_and_preprocess_audio',
    'load_model_checkpoint',
    'get_device',
    'count_parameters',
    'format_number'
]
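These re-exports let callers import the model classes and helpers straight from the src package, which is how app.py assembles its inference path. A short hedged sketch of that usage, with signatures assumed to match how app.py calls them, is shown below.

# Sketch: build the inference objects from the package exports (signatures assumed from app.py).
from src import MultiTaskSpeakerModel, load_config, get_device, load_model_checkpoint

cfg = load_config("configs/infer.yaml")
device = get_device(cfg["inference"]["device"])
model = MultiTaskSpeakerModel(cfg["model"]["name"])
model = load_model_checkpoint(model, cfg["model"]["checkpoint"], str(device))
model.to(device).eval()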
src/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (925 Bytes).
src/__pycache__/models.cpython-311.pyc
ADDED
Binary file (28.2 kB).
src/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (11.1 kB).
src/models.py
ADDED
|
@@ -0,0 +1,648 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Model Architecture for Speaker Profiling
|
| 3 |
+
Supports multiple encoders: WavLM, HuBERT, Wav2Vec2, Whisper, ECAPA-TDNN
|
| 4 |
+
Architecture: Encoder + Attentive Pooling + LayerNorm + Classification Heads
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
from transformers import (
|
| 12 |
+
WavLMModel,
|
| 13 |
+
HubertModel,
|
| 14 |
+
Wav2Vec2Model,
|
| 15 |
+
WhisperModel,
|
| 16 |
+
AutoConfig
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# SpeechBrain ECAPA-TDNN support - lazy import to avoid torchaudio issues
|
| 20 |
+
SPEECHBRAIN_AVAILABLE = None # Will be set on first use
|
| 21 |
+
EncoderClassifier = None # Will be imported lazily
|
| 22 |
+
|
| 23 |
+
def _check_speechbrain():
|
| 24 |
+
"""Lazily check and import SpeechBrain"""
|
| 25 |
+
global SPEECHBRAIN_AVAILABLE, EncoderClassifier
|
| 26 |
+
if SPEECHBRAIN_AVAILABLE is None:
|
| 27 |
+
try:
|
| 28 |
+
from speechbrain.inference.speaker import EncoderClassifier as _EncoderClassifier
|
| 29 |
+
EncoderClassifier = _EncoderClassifier
|
| 30 |
+
SPEECHBRAIN_AVAILABLE = True
|
| 31 |
+
except (ImportError, AttributeError) as e:
|
| 32 |
+
SPEECHBRAIN_AVAILABLE = False
|
| 33 |
+
logger.warning(f"SpeechBrain not available: {e}")
|
| 34 |
+
return SPEECHBRAIN_AVAILABLE
|
| 35 |
+
|
| 36 |
+
logger = logging.getLogger("speaker_profiling")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ECAPA-TDNN wrapper class for consistent interface
|
| 40 |
+
class ECAPATDNNEncoder(nn.Module):
|
| 41 |
+
"""
|
| 42 |
+
Wrapper for SpeechBrain ECAPA-TDNN encoder.
|
| 43 |
+
|
| 44 |
+
ECAPA-TDNN outputs fixed-size embeddings (192 or 512 dim) instead of
|
| 45 |
+
frame-level features like WavLM/HuBERT. This wrapper handles the difference.
|
| 46 |
+
|
| 47 |
+
Supported models:
|
| 48 |
+
- speechbrain/spkrec-ecapa-voxceleb: 192-dim embeddings
|
| 49 |
+
- speechbrain/spkrec-xvect-voxceleb: 512-dim embeddings (x-vector)
|
| 50 |
+
"""
|
| 51 |
+
|
| 52 |
+
def __init__(self, model_name: str = "speechbrain/spkrec-ecapa-voxceleb"):
|
| 53 |
+
super().__init__()
|
| 54 |
+
|
| 55 |
+
# Lazy import SpeechBrain
|
| 56 |
+
if not _check_speechbrain():
|
| 57 |
+
raise ImportError(
|
| 58 |
+
"SpeechBrain is required for ECAPA-TDNN. "
|
| 59 |
+
"Install with: pip install speechbrain"
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
self.model_name = model_name
|
| 63 |
+
|
| 64 |
+
# Detect if CUDA is available
|
| 65 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 66 |
+
|
| 67 |
+
self.encoder = EncoderClassifier.from_hparams(
|
| 68 |
+
source=model_name,
|
| 69 |
+
savedir=f"pretrained_models/{model_name.split('/')[-1]}",
|
| 70 |
+
run_opts={"device": device}
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
# Force float32 for all encoder parameters
|
| 74 |
+
self.encoder.mods.float()
|
| 75 |
+
|
| 76 |
+
# Determine embedding size
|
| 77 |
+
if "ecapa" in model_name.lower():
|
| 78 |
+
self.embedding_size = 192
|
| 79 |
+
elif "xvect" in model_name.lower():
|
| 80 |
+
self.embedding_size = 512
|
| 81 |
+
else:
|
| 82 |
+
self.embedding_size = 192 # default
|
| 83 |
+
|
| 84 |
+
# Config-like object for compatibility
|
| 85 |
+
class Config:
|
| 86 |
+
def __init__(self, hidden_size):
|
| 87 |
+
self.hidden_size = hidden_size
|
| 88 |
+
|
| 89 |
+
self.config = Config(self.embedding_size)
|
| 90 |
+
|
| 91 |
+
# Track current device
|
| 92 |
+
self._current_device = device
|
| 93 |
+
|
| 94 |
+
def forward(self, input_values: torch.Tensor, attention_mask: torch.Tensor = None):
|
| 95 |
+
"""
|
| 96 |
+
Extract embeddings from audio.
|
| 97 |
+
|
| 98 |
+
Args:
|
| 99 |
+
input_values: Audio waveform [B, T]
|
| 100 |
+
attention_mask: Not used for ECAPA-TDNN
|
| 101 |
+
|
| 102 |
+
Returns:
|
| 103 |
+
Object with last_hidden_state attribute [B, 1, H]
|
| 104 |
+
"""
|
| 105 |
+
# Get device from input
|
| 106 |
+
device = input_values.device
|
| 107 |
+
|
| 108 |
+
# Move encoder to same device as input if needed
|
| 109 |
+
if str(device) != str(self._current_device):
|
| 110 |
+
self.encoder.to(device)
|
| 111 |
+
self.encoder.mods.float() # Ensure float32 after move
|
| 112 |
+
self._current_device = device
|
| 113 |
+
|
| 114 |
+
# Ensure input is float32 and on correct device
|
| 115 |
+
input_values = input_values.float().to(device)
|
| 116 |
+
|
| 117 |
+
# SpeechBrain expects [B, T] audio at 16kHz
|
| 118 |
+
# encode_batch handles feature extraction internally
|
| 119 |
+
with torch.no_grad():
|
| 120 |
+
# Set encoder to eval mode to handle BatchNorm properly
|
| 121 |
+
self.encoder.eval()
|
| 122 |
+
embeddings = self.encoder.encode_batch(input_values) # [B, 1, H]
|
| 123 |
+
|
| 124 |
+
# Ensure output is float32
|
| 125 |
+
embeddings = embeddings.float()
|
| 126 |
+
|
| 127 |
+
# Return object compatible with HuggingFace models
|
| 128 |
+
class Output:
|
| 129 |
+
def __init__(self, hidden_state):
|
| 130 |
+
self.last_hidden_state = hidden_state
|
| 131 |
+
|
| 132 |
+
return Output(embeddings)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# Encoder registry - maps model type to class and hidden size
|
| 136 |
+
ENCODER_REGISTRY = {
|
| 137 |
+
# WavLM variants
|
| 138 |
+
"microsoft/wavlm-base": {"class": WavLMModel, "hidden_size": 768},
|
| 139 |
+
"microsoft/wavlm-base-plus": {"class": WavLMModel, "hidden_size": 768},
|
| 140 |
+
"microsoft/wavlm-large": {"class": WavLMModel, "hidden_size": 1024},
|
| 141 |
+
|
| 142 |
+
# HuBERT variants
|
| 143 |
+
"facebook/hubert-base-ls960": {"class": HubertModel, "hidden_size": 768},
|
| 144 |
+
"facebook/hubert-large-ls960-ft": {"class": HubertModel, "hidden_size": 1024},
|
| 145 |
+
"facebook/hubert-xlarge-ls960-ft": {"class": HubertModel, "hidden_size": 1280},
|
| 146 |
+
|
| 147 |
+
# Wav2Vec2 variants
|
| 148 |
+
"facebook/wav2vec2-base": {"class": Wav2Vec2Model, "hidden_size": 768},
|
| 149 |
+
"facebook/wav2vec2-base-960h": {"class": Wav2Vec2Model, "hidden_size": 768},
|
| 150 |
+
"facebook/wav2vec2-large": {"class": Wav2Vec2Model, "hidden_size": 1024},
|
| 151 |
+
"facebook/wav2vec2-large-960h": {"class": Wav2Vec2Model, "hidden_size": 1024},
|
| 152 |
+
"facebook/wav2vec2-xls-r-300m": {"class": Wav2Vec2Model, "hidden_size": 1024},
|
| 153 |
+
|
| 154 |
+
# Vietnamese Wav2Vec2 (VLSP2020)
|
| 155 |
+
"nguyenvulebinh/wav2vec2-base-vi-vlsp2020": {"class": Wav2Vec2Model, "hidden_size": 768},
|
| 156 |
+
|
| 157 |
+
# Whisper variants (encoder only)
|
| 158 |
+
"openai/whisper-tiny": {"class": WhisperModel, "hidden_size": 384, "is_whisper": True},
|
| 159 |
+
"openai/whisper-base": {"class": WhisperModel, "hidden_size": 512, "is_whisper": True},
|
| 160 |
+
"openai/whisper-small": {"class": WhisperModel, "hidden_size": 768, "is_whisper": True},
|
| 161 |
+
"openai/whisper-medium": {"class": WhisperModel, "hidden_size": 1024, "is_whisper": True},
|
| 162 |
+
"openai/whisper-large": {"class": WhisperModel, "hidden_size": 1280, "is_whisper": True},
|
| 163 |
+
"openai/whisper-large-v2": {"class": WhisperModel, "hidden_size": 1280, "is_whisper": True},
|
| 164 |
+
"openai/whisper-large-v3": {"class": WhisperModel, "hidden_size": 1280, "is_whisper": True},
|
| 165 |
+
|
| 166 |
+
# PhoWhisper - Vietnamese fine-tuned Whisper (VinAI)
|
| 167 |
+
"vinai/PhoWhisper-tiny": {"class": WhisperModel, "hidden_size": 384, "is_whisper": True},
|
| 168 |
+
"vinai/PhoWhisper-base": {"class": WhisperModel, "hidden_size": 512, "is_whisper": True},
|
| 169 |
+
"vinai/PhoWhisper-small": {"class": WhisperModel, "hidden_size": 768, "is_whisper": True},
|
| 170 |
+
"vinai/PhoWhisper-medium": {"class": WhisperModel, "hidden_size": 1024, "is_whisper": True},
|
| 171 |
+
"vinai/PhoWhisper-large": {"class": WhisperModel, "hidden_size": 1280, "is_whisper": True},
|
| 172 |
+
|
| 173 |
+
# ECAPA-TDNN (SpeechBrain)
|
| 174 |
+
"speechbrain/spkrec-ecapa-voxceleb": {
|
| 175 |
+
"class": ECAPATDNNEncoder,
|
| 176 |
+
"hidden_size": 192,
|
| 177 |
+
"is_ecapa": True
|
| 178 |
+
},
|
| 179 |
+
"speechbrain/spkrec-xvect-voxceleb": {
|
| 180 |
+
"class": ECAPATDNNEncoder,
|
| 181 |
+
"hidden_size": 512,
|
| 182 |
+
"is_ecapa": True
|
| 183 |
+
},
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def get_encoder_info(model_name: str) -> dict:
    """Get encoder class and hidden size for a model name"""
    if model_name in ENCODER_REGISTRY:
        return ENCODER_REGISTRY[model_name]

    # Check for ECAPA-TDNN / SpeechBrain models
    # Note: We don't check SPEECHBRAIN_AVAILABLE here - the actual import
    # will happen lazily in ECAPATDNNEncoder.__init__() when the model is used
    if 'ecapa' in model_name.lower() or 'speechbrain' in model_name.lower():
        hidden_size = 512 if 'xvect' in model_name.lower() else 192
        return {"class": ECAPATDNNEncoder, "hidden_size": hidden_size, "is_ecapa": True}

    # Try to auto-detect from config
    try:
        config = AutoConfig.from_pretrained(model_name)
        hidden_size = getattr(config, 'hidden_size', 768)

        if 'wavlm' in model_name.lower():
            return {"class": WavLMModel, "hidden_size": hidden_size}
        elif 'hubert' in model_name.lower():
            return {"class": HubertModel, "hidden_size": hidden_size}
        elif 'wav2vec2' in model_name.lower():
            return {"class": Wav2Vec2Model, "hidden_size": hidden_size}
        elif 'whisper' in model_name.lower() or 'phowhisper' in model_name.lower():
            return {"class": WhisperModel, "hidden_size": hidden_size, "is_whisper": True}
        else:
            # Default to Wav2Vec2 architecture
            return {"class": Wav2Vec2Model, "hidden_size": hidden_size}
    except Exception as e:
        logger.warning(f"Could not auto-detect encoder for {model_name}: {e}")
        return {"class": WavLMModel, "hidden_size": 768}


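# Illustrative lookup sketch (the helper name below is ours, not part of the
# original file): registered names resolve directly from ENCODER_REGISTRY,
# anything else falls through to the AutoConfig-based detection above.
def _example_encoder_lookup():
    info = get_encoder_info("nguyenvulebinh/wav2vec2-base-vi-vlsp2020")
    # Registered entry -> {"class": Wav2Vec2Model, "hidden_size": 768}
    return info["class"].__name__, info["hidden_size"]

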
class AttentivePooling(nn.Module):
    """
    Attention-based pooling for temporal aggregation

    Takes a sequence of hidden states and produces a single vector
    by computing attention weights and performing a weighted sum.
    """

    def __init__(self, hidden_size: int):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1, bias=False)
        )

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None):
        """
        Args:
            x: Hidden states [B, T, H]
            mask: Attention mask [B, T]

        Returns:
            pooled: Pooled representation [B, H]
            attn_weights: Attention weights [B, T]
        """
        attn_weights = self.attention(x)  # [B, T, 1]

        if mask is not None:
            mask = mask.unsqueeze(-1)
            attn_weights = attn_weights.masked_fill(mask == 0, -1e9)

        attn_weights = F.softmax(attn_weights, dim=1)
        pooled = torch.sum(x * attn_weights, dim=1)

        return pooled, attn_weights.squeeze(-1)


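# Shape sketch for AttentivePooling (illustrative only; the tensors below are
# dummy data, and the helper name is ours).
def _example_attentive_pooling():
    pool = AttentivePooling(hidden_size=768)
    x = torch.randn(2, 120, 768)        # [B, T, H] dummy hidden states
    mask = torch.ones(2, 120)           # [B, T] all frames valid
    pooled, weights = pool(x, mask)
    return pooled.shape, weights.shape  # torch.Size([2, 768]), torch.Size([2, 120])

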
class MultiTaskSpeakerModel(nn.Module):
    """
    Multi-task model for gender and dialect classification

    Architecture:
        Audio -> Encoder (WavLM/HuBERT/Wav2Vec2/Whisper/ECAPA-TDNN) -> Last Hidden [B,T,H]
                                      |
                     Attentive Pooling [B,H] (skipped for ECAPA-TDNN)
                                      |
                           Layer Normalization
                                      |
                              Dropout(0.1)
                                      |
                      +---------------+---------------+
                      |                               |
            Gender Head (2 layers)          Dialect Head (3 layers)
                      |                               |
                    [B,2]                           [B,3]

    Supported encoders:
        - WavLM: microsoft/wavlm-base-plus, microsoft/wavlm-large
        - HuBERT: facebook/hubert-base-ls960, facebook/hubert-large-ls960-ft
        - Wav2Vec2: facebook/wav2vec2-base, facebook/wav2vec2-large-960h
        - Whisper: openai/whisper-base, openai/whisper-small, openai/whisper-medium
        - ECAPA-TDNN: speechbrain/spkrec-ecapa-voxceleb (192-dim embeddings)

    Args:
        model_name: Pretrained encoder model name or path
        num_genders: Number of gender classes (default: 2)
        num_dialects: Number of dialect classes (default: 3)
        dropout: Dropout probability (default: 0.1)
        head_hidden_dim: Hidden dimension for classification heads (default: 256)
        freeze_encoder: Whether to freeze encoder (default: False)
        dialect_loss_weight: Weight for dialect loss in multi-task learning (default: 3.0)
    """

    def __init__(
        self,
        model_name: str,
        num_genders: int = 2,
        num_dialects: int = 3,
        dropout: float = 0.1,
        head_hidden_dim: int = 256,
        freeze_encoder: bool = False,
        dialect_loss_weight: float = 3.0
    ):
        super().__init__()

        self.model_name = model_name
        self.dialect_loss_weight = dialect_loss_weight

        # Get encoder info and load model
        encoder_info = get_encoder_info(model_name)
        encoder_class = encoder_info["class"]
        self.is_whisper = encoder_info.get("is_whisper", False)
        self.is_ecapa = encoder_info.get("is_ecapa", False)

        logger.info(f"Loading encoder: {model_name}")
        logger.info(f"Encoder class: {encoder_class.__name__}")

        # Load pretrained encoder
        if self.is_ecapa:
            # ECAPA-TDNN uses a different loading mechanism
            self.encoder = encoder_class(model_name)
        else:
            self.encoder = encoder_class.from_pretrained(model_name)

        hidden_size = self.encoder.config.hidden_size
        self.hidden_size = hidden_size

        logger.info(f"Hidden size: {hidden_size}")

        # Optionally freeze encoder
        if freeze_encoder:
            for param in self.encoder.parameters():
                param.requires_grad = False
            logger.info("Encoder weights frozen")

        # Pooling and normalization (ECAPA-TDNN already outputs pooled embeddings)
        self.attentive_pooling = AttentivePooling(hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)

        # Gender classification head (2 layers)
        self.gender_head = nn.Sequential(
            nn.Linear(hidden_size, head_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_dim, num_genders)
        )

        # Dialect classification head (3 layers - deeper for the harder task)
        self.dialect_head = nn.Sequential(
            nn.Linear(hidden_size, head_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_dim, head_hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_dim // 2, num_dialects)
        )

    def forward(
        self,
        input_values: torch.Tensor = None,
        input_features: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        gender_labels: torch.Tensor = None,
        dialect_labels: torch.Tensor = None
    ):
        """
        Forward pass - supports both raw audio and pre-extracted features

        Args:
            input_values: Audio waveform [B, T] (for raw audio mode)
            input_features: Pre-extracted features [B, T, H] or [B, 1, H] for ECAPA
            attention_mask: Attention mask [B, T]
            gender_labels: Gender labels [B] (optional, for training)
            dialect_labels: Dialect labels [B] (optional, for training)

        Returns:
            dict with keys:
            - loss: Combined loss (if labels provided)
            - gender_logits: Gender predictions [B, num_genders]
            - dialect_logits: Dialect predictions [B, num_dialects]
            - attention_weights: Attention weights from pooling [B, T] (None for ECAPA)
        """
        # Get hidden states from either raw audio or pre-extracted features
        if input_features is not None:
            # Use pre-extracted features directly
            hidden_states = input_features
        elif input_values is not None:
            # Extract features from encoder
            hidden_states = self._encode(input_values, attention_mask)
        else:
            raise ValueError("Either input_values or input_features must be provided")

        # Handle ECAPA-TDNN (outputs [B, 1, H] - already pooled embeddings)
        if self.is_ecapa or hidden_states.shape[1] == 1:
            # ECAPA-TDNN outputs already pooled embeddings
            pooled = hidden_states.squeeze(1)  # [B, H]
            attn_weights = None
        else:
            # Create a proper attention mask for hidden states (the encoder downsamples audio)
            # Hidden states have a different sequence length than the input audio
            if attention_mask is not None and hidden_states.shape[1] != attention_mask.shape[1]:
                # Create new mask based on hidden states length
                batch_size, seq_len, _ = hidden_states.shape
                pooled_mask = torch.ones(batch_size, seq_len, device=hidden_states.device)
            else:
                pooled_mask = attention_mask

            # Attentive pooling
            pooled, attn_weights = self.attentive_pooling(hidden_states, pooled_mask)

        # Normalization and dropout
        pooled = self.layer_norm(pooled)
        pooled = self.dropout(pooled)

        # Classification heads
        gender_logits = self.gender_head(pooled)
        dialect_logits = self.dialect_head(pooled)

        # Compute loss if labels provided
        loss = None
        if gender_labels is not None and dialect_labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            gender_loss = loss_fct(gender_logits, gender_labels)
            dialect_loss = loss_fct(dialect_logits, dialect_labels)
            loss = gender_loss + self.dialect_loss_weight * dialect_loss

        return {
            'loss': loss,
            'gender_logits': gender_logits,
            'dialect_logits': dialect_logits,
            'attention_weights': attn_weights
        }

    def _encode(
        self,
        input_values: torch.Tensor,
        attention_mask: torch.Tensor = None
    ) -> torch.Tensor:
        """
        Extract hidden states from encoder

        Args:
            input_values: Audio waveform [B, T]
            attention_mask: Attention mask [B, T]

        Returns:
            hidden_states: Hidden states [B, T, H] or [B, 1, H] for ECAPA-TDNN
        """
        if self.is_ecapa:
            # ECAPA-TDNN outputs fixed-size embeddings [B, 1, H]
            outputs = self.encoder(input_values, attention_mask)
            hidden_states = outputs.last_hidden_state
        elif self.is_whisper:
            # Whisper is an encoder-decoder model; we only use the encoder
            outputs = self.encoder.encoder(input_values)
            hidden_states = outputs.last_hidden_state
        else:
            # WavLM, HuBERT, Wav2Vec2
            outputs = self.encoder(input_values, attention_mask=attention_mask)
            hidden_states = outputs.last_hidden_state

        return hidden_states

    def get_embeddings(
        self,
        input_values: torch.Tensor,
        attention_mask: torch.Tensor = None
    ) -> torch.Tensor:
        """
        Extract speaker embeddings (pooled representations)

        Args:
            input_values: Audio waveform [B, T]
            attention_mask: Attention mask [B, T]

        Returns:
            embeddings: Speaker embeddings [B, H]
        """
        hidden_states = self._encode(input_values, attention_mask)

        if self.is_ecapa or hidden_states.shape[1] == 1:
            # ECAPA-TDNN already outputs pooled embeddings
            pooled = hidden_states.squeeze(1)
        else:
            pooled, _ = self.attentive_pooling(hidden_states, attention_mask)

        pooled = self.layer_norm(pooled)
        return pooled


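# End-to-end inference sketch (illustrative; wrapped in a helper of ours so the
# encoder download only happens when the function is called, and the waveform
# is random noise standing in for real 16 kHz audio).
def _example_raw_audio_inference():
    model = MultiTaskSpeakerModel("nguyenvulebinh/wav2vec2-base-vi-vlsp2020")
    model.eval()
    waveform = torch.randn(1, 16000 * 5)  # [B, T]: 5 seconds at 16 kHz
    with torch.no_grad():
        outputs = model(input_values=waveform)
    gender_pred = outputs['gender_logits'].argmax(dim=-1)    # [B]
    dialect_pred = outputs['dialect_logits'].argmax(dim=-1)  # [B]
    return gender_pred, dialect_pred

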
class MultiTaskSpeakerModelFromConfig(MultiTaskSpeakerModel):
    """
    Multi-task model initialized from OmegaConf config

    Supports multiple encoders: WavLM, HuBERT, Wav2Vec2, Whisper
    Use this for inference with raw audio input.

    Usage:
        config = OmegaConf.load('configs/finetune.yaml')
        model = MultiTaskSpeakerModelFromConfig(config)
    """

    def __init__(self, config):
        model_config = config['model']

        super().__init__(
            model_name=model_config['name'],
            num_genders=model_config.get('num_genders', 2),
            num_dialects=model_config.get('num_dialects', 3),
            dropout=model_config.get('dropout', 0.1),
            head_hidden_dim=model_config.get('head_hidden_dim', 256),
            freeze_encoder=model_config.get('freeze_encoder', False),
            dialect_loss_weight=config.get('loss', {}).get('dialect_weight', 3.0)
        )

        logger.info(f"Architecture: {model_config['name']} + Attentive Pooling + LayerNorm")
        logger.info(f"Hidden size: {self.hidden_size}")
        logger.info(f"Head hidden dim: {model_config.get('head_hidden_dim', 256)}")
        logger.info(f"Dropout: {model_config.get('dropout', 0.1)}")


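# Config-driven construction sketch (illustrative; the inline dict mirrors the
# 'model' and 'loss' sections this class reads, it is not a copy of
# configs/finetune.yaml).
def _example_model_from_config():
    from omegaconf import OmegaConf
    config = OmegaConf.create({
        "model": {
            "name": "nguyenvulebinh/wav2vec2-base-vi-vlsp2020",
            "num_genders": 2,
            "num_dialects": 3,
            "dropout": 0.1,
            "head_hidden_dim": 256,
            "freeze_encoder": False,
        },
        "loss": {"dialect_weight": 3.0},
    })
    return MultiTaskSpeakerModelFromConfig(config)

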
class ClassificationHeadModel(nn.Module):
    """
    Lightweight model with only classification heads (no encoder).

    Use this for training with pre-extracted features to save memory.
    hidden_size depends on the encoder: WavLM-base=768, WavLM-large=1024, etc.

    Usage:
        model = ClassificationHeadModel(hidden_size=768)
        output = model(input_features=features, gender_labels=y_gender, dialect_labels=y_dialect)
    """

    def __init__(
        self,
        hidden_size: int = 768,
        num_genders: int = 2,
        num_dialects: int = 3,
        dropout: float = 0.1,
        head_hidden_dim: int = 256,
        dialect_loss_weight: float = 3.0
    ):
        super().__init__()

        self.hidden_size = hidden_size
        self.dialect_loss_weight = dialect_loss_weight

        # Pooling and normalization
        self.attentive_pooling = AttentivePooling(hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)

        # Gender classification head (2 layers)
        self.gender_head = nn.Sequential(
            nn.Linear(hidden_size, head_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_dim, num_genders)
        )

        # Dialect classification head (3 layers - deeper for the harder task)
        self.dialect_head = nn.Sequential(
            nn.Linear(hidden_size, head_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_dim, head_hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_dim // 2, num_dialects)
        )

        logger.info(f"ClassificationHeadModel initialized (hidden_size={hidden_size})")

    def forward(
        self,
        input_features: torch.Tensor,
        attention_mask: torch.Tensor = None,
        gender_labels: torch.Tensor = None,
        dialect_labels: torch.Tensor = None
    ):
        """
        Forward pass for pre-extracted features

        Args:
            input_features: Pre-extracted WavLM features [B, T, H]
            attention_mask: Attention mask [B, T]
            gender_labels: Gender labels [B] (optional, for training)
            dialect_labels: Dialect labels [B] (optional, for training)

        Returns:
            dict with keys:
            - loss: Combined loss (if labels provided)
            - gender_logits: Gender predictions [B, num_genders]
            - dialect_logits: Dialect predictions [B, num_dialects]
            - attention_weights: Attention weights from pooling [B, T]
        """
        # Attentive pooling
        pooled, attn_weights = self.attentive_pooling(input_features, attention_mask)

        # Normalization and dropout
        pooled = self.layer_norm(pooled)
        pooled = self.dropout(pooled)

        # Classification heads
        gender_logits = self.gender_head(pooled)
        dialect_logits = self.dialect_head(pooled)

        # Compute loss if labels provided
        loss = None
        if gender_labels is not None and dialect_labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            gender_loss = loss_fct(gender_logits, gender_labels)
            dialect_loss = loss_fct(dialect_logits, dialect_labels)
            loss = gender_loss + self.dialect_loss_weight * dialect_loss

        return {
            'loss': loss,
            'gender_logits': gender_logits,
            'dialect_logits': dialect_logits,
            'attention_weights': attn_weights
        }


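# Pre-extracted feature sketch (illustrative; the random tensors stand in for
# features dumped by a 768-dim encoder such as WavLM-base, and the helper name
# is ours).
def _example_head_only_training_step():
    model = ClassificationHeadModel(hidden_size=768)
    feats = torch.randn(4, 249, 768)     # [B, T, H] pre-extracted features
    gender = torch.randint(0, 2, (4,))   # [B]
    dialect = torch.randint(0, 3, (4,))  # [B]
    outputs = model(input_features=feats, gender_labels=gender, dialect_labels=dialect)
    return outputs['loss']               # gender CE + dialect_loss_weight * dialect CE

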
class ClassificationHeadModelFromConfig(ClassificationHeadModel):
    """
    Lightweight classification model initialized from OmegaConf config.

    Use this for training with pre-extracted features.
    """

    def __init__(self, config):
        model_config = config['model']

        super().__init__(
            hidden_size=model_config.get('hidden_size', 768),  # WavLM base hidden size
            num_genders=model_config.get('num_genders', 2),
            num_dialects=model_config.get('num_dialects', 3),
            dropout=model_config.get('dropout', 0.1),
            head_hidden_dim=model_config.get('head_hidden_dim', 256),
            dialect_loss_weight=config.get('loss', {}).get('dialect_weight', 3.0)
        )

        logger.info("Architecture: Attentive Pooling + LayerNorm + Classification Heads")
        logger.info(f"Hidden size: {self.hidden_size}")
        logger.info(f"Head hidden dim: {model_config.get('head_hidden_dim', 256)}")
        logger.info(f"Dropout: {model_config.get('dropout', 0.1)}")

src/utils.py
ADDED
@@ -0,0 +1,261 @@
"""
Utility functions for Speaker Profiling
"""

import os
import logging
import random
import numpy as np
import torch
import librosa
from pathlib import Path
from omegaconf import OmegaConf
from typing import Union, Optional, Tuple


def setup_logging(
    name: str = "speaker_profiling",
    level: int = logging.INFO,
    log_file: Optional[str] = None
) -> logging.Logger:
    """
    Set up logging configuration

    Args:
        name: Logger name
        level: Logging level
        log_file: Optional path to log file

    Returns:
        Configured logger instance
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    if logger.handlers:
        logger.handlers.clear()

    formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    console_handler = logging.StreamHandler()
    console_handler.setLevel(level)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    if log_file:
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        file_handler = logging.FileHandler(log_file, encoding='utf-8')
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


def get_logger(name: str = "speaker_profiling") -> logging.Logger:
    """Get an existing logger or create a new one"""
    logger = logging.getLogger(name)
    if not logger.handlers:
        return setup_logging(name)
    return logger


def load_config(config_path: str) -> OmegaConf:
    """
    Load configuration from yaml file

    Args:
        config_path: Path to yaml config file

    Returns:
        OmegaConf configuration object
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Config file not found: {config_path}")
    return OmegaConf.load(config_path)


def set_seed(seed: int) -> None:
    """
    Set random seed for reproducibility

    Args:
        seed: Random seed value
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


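# Typical call-order sketch (illustrative; assumes a top-level 'seed' key, which
# may or may not exist in the shipped configs - the default covers its absence).
def _example_setup():
    config = load_config("configs/finetune.yaml")
    set_seed(int(config.get("seed", 42)))
    return config

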
def load_audio(
    audio_path: Union[str, Path],
    sampling_rate: int = 16000,
    mono: bool = True
) -> Tuple[np.ndarray, int]:
    """
    Load audio file

    Args:
        audio_path: Path to audio file
        sampling_rate: Target sampling rate
        mono: Whether to convert to mono

    Returns:
        Tuple of (audio array, sampling rate)
    """
    audio, sr = librosa.load(audio_path, sr=sampling_rate, mono=mono)
    return audio, sr


def preprocess_audio(
    audio: np.ndarray,
    sampling_rate: int = 16000,
    max_duration: float = 10.0,
    trim_db: int = 20,
    normalize: bool = True,
    center_crop: bool = True
) -> np.ndarray:
    """
    Preprocess audio for model input

    Args:
        audio: Raw audio array
        sampling_rate: Audio sampling rate
        max_duration: Maximum duration in seconds
        trim_db: Threshold for silence trimming
        normalize: Whether to normalize audio
        center_crop: If True, center crop; else random crop (for training)

    Returns:
        Preprocessed audio array
    """
    max_length = int(sampling_rate * max_duration)

    audio, _ = librosa.effects.trim(audio, top_db=trim_db)

    if normalize:
        audio = audio / (np.max(np.abs(audio)) + 1e-8)

    if len(audio) < max_length:
        audio = np.pad(audio, (0, max_length - len(audio)))
    elif len(audio) > max_length:
        if center_crop:
            start = (len(audio) - max_length) // 2
        else:
            start = np.random.randint(0, len(audio) - max_length + 1)
        audio = audio[start:start + max_length]

    return audio


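# Fixed-length sketch (illustrative): at 16 kHz with max_duration=10.0 the target
# is 160,000 samples, so a 3-second clip comes back zero-padded to that length.
def _example_fixed_length():
    audio = np.random.randn(16000 * 3).astype(np.float32)  # 3 s of dummy audio
    processed = preprocess_audio(audio, sampling_rate=16000, max_duration=10.0)
    return processed.shape  # (160000,)

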
def load_and_preprocess_audio(
    audio_path: Union[str, Path],
    sampling_rate: int = 16000,
    max_duration: float = 10.0,
    trim_db: int = 20,
    normalize: bool = True,
    center_crop: bool = True
) -> np.ndarray:
    """
    Load and preprocess audio file in one step

    Args:
        audio_path: Path to audio file
        sampling_rate: Target sampling rate
        max_duration: Maximum duration in seconds
        trim_db: Threshold for silence trimming
        normalize: Whether to normalize audio
        center_crop: If True, center crop; else random crop

    Returns:
        Preprocessed audio array
    """
    audio, _ = load_audio(audio_path, sampling_rate)
    return preprocess_audio(
        audio,
        sampling_rate,
        max_duration,
        trim_db,
        normalize,
        center_crop
    )


def load_model_checkpoint(
    model: torch.nn.Module,
    checkpoint_path: str,
    device: str = 'cpu'
) -> torch.nn.Module:
    """
    Load model from checkpoint

    Args:
        model: PyTorch model instance
        checkpoint_path: Path to checkpoint directory
        device: Device to load model on

    Returns:
        Model with loaded weights
    """
    logger = get_logger()

    safetensors_path = os.path.join(checkpoint_path, 'model.safetensors')
    pytorch_path = os.path.join(checkpoint_path, 'pytorch_model.bin')

    if os.path.exists(safetensors_path):
        from safetensors.torch import load_file
        state_dict = load_file(safetensors_path)
        logger.info(f"Loading checkpoint from {safetensors_path}")
    elif os.path.exists(pytorch_path):
        state_dict = torch.load(pytorch_path, map_location=device)
        logger.info(f"Loading checkpoint from {pytorch_path}")
    else:
        raise FileNotFoundError(
            f"No checkpoint found in {checkpoint_path}. "
            f"Expected 'model.safetensors' or 'pytorch_model.bin'"
        )

    model.load_state_dict(state_dict)
    return model


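# Checkpoint-loading sketch (illustrative; the import path and checkpoint
# directory below are placeholders for this repo's layout, not verified values).
def _example_load_checkpoint():
    from src.models import MultiTaskSpeakerModelFromConfig  # assumed module path
    config = load_config("configs/finetune.yaml")
    model = MultiTaskSpeakerModelFromConfig(config)
    # The directory must contain 'model.safetensors' or 'pytorch_model.bin'
    return load_model_checkpoint(model, "path/to/checkpoint_dir", device="cpu")

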
def get_device(device_str: str = 'cuda') -> torch.device:
    """
    Get torch device, falling back to CPU if CUDA is not available

    Args:
        device_str: Desired device string ('cuda' or 'cpu')

    Returns:
        torch.device instance
    """
    if device_str == 'cuda' and torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')


def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
    """
    Count model parameters

    Args:
        model: PyTorch model

    Returns:
        Tuple of (total_params, trainable_params)
    """
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable


def format_number(num: int) -> str:
    """Format large numbers with commas"""
    return f"{num:,}"
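

# Parameter-count sketch (illustrative; the tiny linear layer is a stand-in model).
def _example_param_report():
    model = torch.nn.Linear(768, 3)  # 768*3 weights + 3 biases = 2,307 params
    total, trainable = count_parameters(model)
    return f"{format_number(total)} total / {format_number(trainable)} trainable"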