Spaces:

Thanh-Lam
/

vietnamese-speaker-profiling-v2

Running

App Files Community

Thanh-Lam committed 6 days ago

Commit

ec8293e

1 Parent(s): 8f4a2bc

Fix: correct gender/dialect label mapping (Female=0, Male=1) and remove trim-causing sources param

Browse files

Files changed (1) hide show

app.py +37 -20

app.py CHANGED Viewed

@@ -27,9 +27,17 @@ MODELS_CONFIG = {
     }
 }
-# Labels
-GENDER_LABELS = ["Male", "Female"]
-DIALECT_LABELS = ["Northern", "Central", "Southern"]
 class MultiModelProfiler:
@@ -38,6 +46,7 @@ class MultiModelProfiler:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.sampling_rate = 16000
         self.models = {}
         self.processors = {}
         self.current_model = None
@@ -129,20 +138,29 @@ class MultiModelProfiler:
             processor = self.processors[model_name]
             is_whisper = MODELS_CONFIG[model_name]["is_whisper"]
-            # Load audio using librosa (more compatible)
             waveform, sr = librosa.load(audio_path, sr=self.sampling_rate, mono=True)
             # Process based on model type
             if is_whisper:
-                # Whisper requires exactly 30 seconds of audio
-                whisper_length = self.sampling_rate * 30  # 480000 samples
                 if len(waveform) < whisper_length:
-                    waveform_padded = np.pad(waveform, (0, whisper_length - len(waveform)))
-                else:
-                    waveform_padded = waveform[:whisper_length]
                 inputs = processor(
-                    waveform_padded,
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt"
                 )
@@ -163,14 +181,14 @@ class MultiModelProfiler:
                 gender_logits = outputs['gender_logits']
                 dialect_logits = outputs['dialect_logits']
-                gender_probs = torch.softmax(gender_logits, dim=-1)
-                dialect_probs = torch.softmax(dialect_logits, dim=-1)
-                gender_idx = gender_probs.argmax(dim=-1).item()
-                dialect_idx = dialect_probs.argmax(dim=-1).item()
-                gender_conf = gender_probs[0, gender_idx].item() * 100
-                dialect_conf = dialect_probs[0, dialect_idx].item() * 100
             gender_result = f"{GENDER_LABELS[gender_idx]} ({gender_conf:.1f}%)"
             dialect_result = f"{DIALECT_LABELS[dialect_idx]} ({dialect_conf:.1f}%)"
@@ -223,8 +241,7 @@ def create_interface():
                 gr.Markdown("### Input")
                 audio_input = gr.Audio(
                     label="Upload or Record Audio",
-                    type="filepath",
-                    sources=["upload", "microphone"]
                 )
                 model_dropdown = gr.Dropdown(
@@ -247,9 +264,9 @@ def create_interface():
                 gr.Markdown(
                     """
                     ### Dialect Regions
-                    - **Northern**: Hanoi and surrounding areas
                     - **Central**: Hue, Da Nang, and Central Vietnam
-                    - **Southern**: Ho Chi Minh City and Southern Vietnam
                     """
                 )

     }
 }
+# Labels - IMPORTANT: Must match training order!
+# Model was trained with Female=0, Male=1
+GENDER_LABELS = {
+    0: "Female",
+    1: "Male"
+}
+DIALECT_LABELS = {
+    0: "North",
+    1: "Central",
+    2: "South"
+}
 class MultiModelProfiler:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.sampling_rate = 16000
+        self.max_duration = 5  # seconds for non-whisper models
         self.models = {}
         self.processors = {}
         self.current_model = None
             processor = self.processors[model_name]
             is_whisper = MODELS_CONFIG[model_name]["is_whisper"]
+            # Set max duration based on model type
+            if is_whisper:
+                max_duration = 30  # Whisper requires 30 seconds
+            else:
+                max_duration = self.max_duration
+            # Load audio using librosa
             waveform, sr = librosa.load(audio_path, sr=self.sampling_rate, mono=True)
+            # Trim to max duration
+            max_samples = int(max_duration * self.sampling_rate)
+            if len(waveform) > max_samples:
+                waveform = waveform[:max_samples]
             # Process based on model type
             if is_whisper:
+                # Whisper requires exactly 30 seconds - pad if needed
+                whisper_length = self.sampling_rate * 30
                 if len(waveform) < whisper_length:
+                    waveform = np.pad(waveform, (0, whisper_length - len(waveform)))
                 inputs = processor(
+                    waveform,
                     sampling_rate=self.sampling_rate,
                     return_tensors="pt"
                 )
                 gender_logits = outputs['gender_logits']
                 dialect_logits = outputs['dialect_logits']
+                gender_probs = torch.softmax(gender_logits, dim=-1).cpu().numpy()[0]
+                dialect_probs = torch.softmax(dialect_logits, dim=-1).cpu().numpy()[0]
+                gender_idx = int(np.argmax(gender_probs))
+                dialect_idx = int(np.argmax(dialect_probs))
+                gender_conf = float(gender_probs[gender_idx]) * 100
+                dialect_conf = float(dialect_probs[dialect_idx]) * 100
             gender_result = f"{GENDER_LABELS[gender_idx]} ({gender_conf:.1f}%)"
             dialect_result = f"{DIALECT_LABELS[dialect_idx]} ({dialect_conf:.1f}%)"
                 gr.Markdown("### Input")
                 audio_input = gr.Audio(
                     label="Upload or Record Audio",
+                    type="filepath"
                 )
                 model_dropdown = gr.Dropdown(
                 gr.Markdown(
                     """
                     ### Dialect Regions
+                    - **North**: Hanoi and surrounding areas
                     - **Central**: Hue, Da Nang, and Central Vietnam
+                    - **South**: Ho Chi Minh City and Southern Vietnam
                     """
                 )