Update prosody_preprocessor.py
Browse files- prosody_preprocessor.py +41 -4
prosody_preprocessor.py
CHANGED
|
@@ -9,6 +9,8 @@ import dataclasses
|
|
| 9 |
import parselmouth
|
| 10 |
from transformers import PreTrainedModel,PretrainedConfig, FeatureExtractionMixin
|
| 11 |
from datasets import Dataset
|
|
|
|
|
|
|
| 12 |
|
| 13 |
@dataclass
|
| 14 |
class SpeakerStats:
|
|
@@ -75,6 +77,10 @@ class ProsodyPreprocessor(FeatureExtractionMixin):
|
|
| 75 |
|
| 76 |
def extract_features(self, audio):
|
| 77 |
"""Extract F0 and intensity features"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
audio = torch.Tensor(audio)
|
| 80 |
|
|
@@ -114,7 +120,6 @@ class ProsodyPreprocessor(FeatureExtractionMixin):
|
|
| 114 |
def collect_stats(self, dataset: Dataset, num_proc: int = 4, batch_size: int = 32) -> Dict[str, SpeakerStats]:
|
| 115 |
"""First pass: collect speaker statistics using dataset.map"""
|
| 116 |
|
| 117 |
-
# Step 1: Extract features using map
|
| 118 |
def extract_features_batch(examples):
|
| 119 |
features_list = []
|
| 120 |
for audio in examples['audio']:
|
|
@@ -127,7 +132,6 @@ class ProsodyPreprocessor(FeatureExtractionMixin):
|
|
| 127 |
'speaker_id': examples['speaker_id']
|
| 128 |
}
|
| 129 |
|
| 130 |
-
# Extract features for all samples
|
| 131 |
features_dataset = dataset.map(
|
| 132 |
extract_features_batch,
|
| 133 |
batched=True,
|
|
@@ -138,7 +142,6 @@ class ProsodyPreprocessor(FeatureExtractionMixin):
|
|
| 138 |
)
|
| 139 |
|
| 140 |
|
| 141 |
-
# Step 2: Group features by speaker
|
| 142 |
speaker_features = {}
|
| 143 |
for item in features_dataset:
|
| 144 |
|
|
@@ -149,7 +152,6 @@ class ProsodyPreprocessor(FeatureExtractionMixin):
|
|
| 149 |
speaker_features[speaker_id]['f0'].append(item['f0'])
|
| 150 |
speaker_features[speaker_id]['intensity'].append(item['intensity'])
|
| 151 |
|
| 152 |
-
# Step 3: Calculate stats per speaker
|
| 153 |
self.speaker_stats = {
|
| 154 |
spk: SpeakerStats.from_features(
|
| 155 |
feats['f0'],
|
|
@@ -217,3 +219,38 @@ class ProsodyPreprocessor(FeatureExtractionMixin):
|
|
| 217 |
# # spk: SpeakerStats(**stats)
|
| 218 |
# # for spk, stats in state_dict.items()
|
| 219 |
# # }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
import parselmouth
|
| 10 |
from transformers import PreTrainedModel,PretrainedConfig, FeatureExtractionMixin
|
| 11 |
from datasets import Dataset
|
| 12 |
+
from scipy.signal import medfilt
|
| 13 |
+
import scipy.interpolate as scipy_interp
|
| 14 |
|
| 15 |
@dataclass
|
| 16 |
class SpeakerStats:
|
|
|
|
| 77 |
|
| 78 |
def extract_features(self, audio):
|
| 79 |
"""Extract F0 and intensity features"""
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# Override the original method to fix a bug
|
| 83 |
+
pYAAPT.PitchObj.interpolate = interpolate
|
| 84 |
|
| 85 |
audio = torch.Tensor(audio)
|
| 86 |
|
|
|
|
| 120 |
def collect_stats(self, dataset: Dataset, num_proc: int = 4, batch_size: int = 32) -> Dict[str, SpeakerStats]:
|
| 121 |
"""First pass: collect speaker statistics using dataset.map"""
|
| 122 |
|
|
|
|
| 123 |
def extract_features_batch(examples):
|
| 124 |
features_list = []
|
| 125 |
for audio in examples['audio']:
|
|
|
|
| 132 |
'speaker_id': examples['speaker_id']
|
| 133 |
}
|
| 134 |
|
|
|
|
| 135 |
features_dataset = dataset.map(
|
| 136 |
extract_features_batch,
|
| 137 |
batched=True,
|
|
|
|
| 142 |
)
|
| 143 |
|
| 144 |
|
|
|
|
| 145 |
speaker_features = {}
|
| 146 |
for item in features_dataset:
|
| 147 |
|
|
|
|
| 152 |
speaker_features[speaker_id]['f0'].append(item['f0'])
|
| 153 |
speaker_features[speaker_id]['intensity'].append(item['intensity'])
|
| 154 |
|
|
|
|
| 155 |
self.speaker_stats = {
|
| 156 |
spk: SpeakerStats.from_features(
|
| 157 |
feats['f0'],
|
|
|
|
| 219 |
# # spk: SpeakerStats(**stats)
|
| 220 |
# # for spk, stats in state_dict.items()
|
| 221 |
# # }
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def interpolate(self):
    """Fill unvoiced (zero-F0) frames of a pYAAPT ``PitchObj`` by interpolation.

    Intended as a monkey-patch replacement for ``pYAAPT.PitchObj.interpolate``:
    the upstream version mishandles the extrapolated points before the first
    voiced frame and after the last voiced frame, which could flatten the
    whole F0 contour.  Reads ``self.samp_values`` (frame-wise F0, 0 means
    unvoiced) and writes the interpolated contour to ``self.samp_interp``.
    """
    # Work on a copy of the raw per-frame pitch values.
    pitch = np.zeros(self.nframes)
    pitch[:] = self.samp_values
    # Median-filter first so isolated spurious voiced frames do not anchor
    # the interpolant.
    pitch2 = medfilt(self.samp_values, self.SMOOTH_FACTOR)

    # Voiced-region boundaries, plus the raw endpoint values used below to
    # decide whether leading/trailing unvoiced stretches need back-filling.
    edges = self.edges_finder(pitch)
    first_sample = pitch[0]
    last_sample = pitch[-1]

    if len(np.nonzero(pitch2)[0]) < 2:
        # Fewer than two voiced frames: nothing to interpolate between, so
        # fall back to the speaker-typical pitch value everywhere.
        pitch[pitch == 0] = self.PTCH_TYP
    else:
        # Shape-preserving (PCHIP) interpolation through the voiced frames;
        # only unvoiced positions are overwritten — voiced frames keep their
        # raw F0 values.
        nz_pitch = pitch2[pitch2 > 0]
        pitch2 = scipy_interp.pchip(np.nonzero(pitch2)[0],
                                    nz_pitch)(range(self.nframes))
        pitch[pitch == 0] = pitch2[pitch == 0]
    if self.SMOOTH > 0:
        pitch = medfilt(pitch, self.SMOOTH_FACTOR)
    try:
        if first_sample == 0:
            # Guard against edges[0] == 0: previously that caused the whole
            # F0 contour to be flattened to a single value.
            if edges[0] == 0:
                edges[0] = 1
            pitch[:edges[0] - 1] = pitch[edges[0]]
        if last_sample == 0:
            pitch[edges[-1] + 1:] = pitch[edges[-1]]
    except IndexError:
        # `edges` can be empty when no voiced region was detected; the
        # PTCH_TYP fallback above already filled the contour, so back-filling
        # the ends is safely skipped.  (Was a bare `except:` — narrowed so
        # real bugs are no longer swallowed.)
        pass
    self.samp_interp = pitch
|