Commit ·
f9c55bd
1
Parent(s): 8093841
Updated to pass the WAV file as direct input
Browse files- feature_extractor.py +10 -3
feature_extractor.py
CHANGED
|
@@ -40,24 +40,31 @@ def change_sample_rate(y, sample_rate, new_sample_rate):
|
|
| 40 |
value = librosa.resample(y, sample_rate, new_sample_rate)
|
| 41 |
return value
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
def get_wav2vecembeddings_from_audiofile(wav_file):
|
| 44 |
print("the file is", wav_file)
|
| 45 |
speech, sample_rate = sf.read(wav_file)
|
|
|
|
|
|
|
|
|
|
| 46 |
# change sample rate to 16 000 hertz
|
| 47 |
resampled = change_sample_rate(speech, sample_rate, new_sample_rate)
|
| 48 |
print("the speech is", speech)
|
| 49 |
-
input_values = processor(
|
| 50 |
print("input values", input_values)
|
| 51 |
# import pdb
|
| 52 |
# pdb.set_trace()
|
| 53 |
|
| 54 |
with torch.no_grad():
|
| 55 |
encoded_states = model(
|
| 56 |
-
|
| 57 |
# attention_mask=input_values["attention_mask"],
|
| 58 |
output_hidden_states=True
|
| 59 |
)
|
| 60 |
-
|
| 61 |
last_hidden_state = encoded_states.hidden_states[-1] # The last hidden-state is the first element of the output tuple
|
| 62 |
print("getting wav2vec2 embeddings")
|
| 63 |
print(last_hidden_state)
|
|
|
|
| 40 |
value = librosa.resample(y, sample_rate, new_sample_rate)
|
| 41 |
return value
|
| 42 |
|
| 43 |
+
def stereo_to_mono(audio_input):
    """Downmix a multi-channel signal to mono by averaging the channels.

    Args:
        audio_input: 2-D array of samples, one column per channel.

    Returns:
        1-D array (or 0-d for a single frame) of per-frame channel means.
    """
    channel_mean = audio_input.mean(axis=1, keepdims=True)
    return np.squeeze(channel_mean)
|
| 47 |
+
|
| 48 |
def get_wav2vecembeddings_from_audiofile(wav_file):
    """Extract wav2vec2 embeddings from an audio file.

    Reads the file, downmixes stereo to mono, resamples to the module-level
    target rate, runs the wav2vec2 model, and returns the last hidden state.

    Args:
        wav_file: path to an audio file readable by soundfile.

    Returns:
        torch.Tensor: the model's last hidden state — presumably shaped
        (batch, frames, hidden_dim); confirm against the loaded model config.
    """
    print("the file is", wav_file)
    speech, sample_rate = sf.read(wav_file)

    # The model expects a 1-D signal; collapse multi-channel audio first.
    if len(speech.shape) > 1:
        speech = stereo_to_mono(speech)

    # change sample rate to 16 000 hertz (new_sample_rate is module-level)
    resampled = change_sample_rate(speech, sample_rate, new_sample_rate)
    print("the speech is", speech)

    # BUG FIX: feed the resampled audio array to the processor, not the
    # file-path string — previously `resampled` was computed and unused.
    input_values = processor(
        resampled,
        return_tensors="pt",
        padding=True,
        sampling_rate=new_sample_rate,
    )  # there is no truncation param anymore
    print("input values", input_values)
    # import pdb
    # pdb.set_trace()

    with torch.no_grad():
        encoded_states = model(
            # BUG FIX: Wav2Vec2 processors return the key "input_values",
            # not "input_ids" — the old lookup raised KeyError.
            input_values=input_values["input_values"],
            # attention_mask=input_values["attention_mask"],
            output_hidden_states=True,
        )

    # The last hidden-state is the final element of the hidden_states tuple.
    last_hidden_state = encoded_states.hidden_states[-1]
    print("getting wav2vec2 embeddings")
    print(last_hidden_state)
    # BUG FIX: return the embeddings (previously computed and dropped).
    # Backward compatible: callers that ignored the old None still work.
    return last_hidden_state
|