Commit
·
b8eb4fa
1
Parent(s):
177ebd7
Update README.md
Browse files
README.md
CHANGED
|
@@ -101,7 +101,10 @@ processor = Wav2Vec2ProcessorWithLM.from_pretrained("SLPL/Sharif-wav2vec2")
|
|
| 101 |
def speech_file_to_array_fn(batch):
|
| 102 |
speech_array, sampling_rate = torchaudio.load(batch["path"])
|
| 103 |
speech_array = speech_array.squeeze().numpy()
|
| 104 |
-
speech_array = librosa.resample(
|
|
|
|
|
|
|
|
|
|
| 105 |
batch["speech"] = speech_array
|
| 106 |
return batch
|
| 107 |
|
|
@@ -112,24 +115,30 @@ def predict(batch):
|
|
| 112 |
return_tensors="pt",
|
| 113 |
padding=True
|
| 114 |
)
|
| 115 |
-
|
| 116 |
-
input_values = features.input_values
|
| 117 |
-
attention_mask = features.attention_mask
|
| 118 |
|
| 119 |
with torch.no_grad():
|
| 120 |
-
logits = model(
|
|
|
|
|
|
|
| 121 |
batch["prediction"] = processor.batch_decode(logits.numpy()).text
|
| 122 |
return batch
|
| 123 |
|
| 124 |
-
dataset = load_dataset(
|
|
|
|
|
|
|
|
|
|
| 125 |
dataset = dataset.map(speech_file_to_array_fn)
|
| 126 |
|
| 127 |
result = dataset.map(predict, batched=True, batch_size=4)
|
| 128 |
wer = load_metric("wer")
|
| 129 |
cer = load_metric("cer")
|
| 130 |
|
| 131 |
-
print("WER: {:.2f}".format(
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
```
|
| 134 |
|
| 135 |
*Result (WER) on common-voice 6.1*:
|
|
|
|
| 101 |
def speech_file_to_array_fn(batch):
|
| 102 |
speech_array, sampling_rate = torchaudio.load(batch["path"])
|
| 103 |
speech_array = speech_array.squeeze().numpy()
|
| 104 |
+
speech_array = librosa.resample(
|
| 105 |
+
np.asarray(speech_array),
|
| 106 |
+
sampling_rate,
|
| 107 |
+
processor.feature_extractor.sampling_rate)
|
| 108 |
batch["speech"] = speech_array
|
| 109 |
return batch
|
| 110 |
|
|
|
|
| 115 |
return_tensors="pt",
|
| 116 |
padding=True
|
| 117 |
)
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
with torch.no_grad():
|
| 120 |
+
logits = model(
|
| 121 |
+
features.input_values,
|
| 122 |
+
attention_mask=features.attention_mask).logits
|
| 123 |
batch["prediction"] = processor.batch_decode(logits.numpy()).text
|
| 124 |
return batch
|
| 125 |
|
| 126 |
+
dataset = load_dataset(
|
| 127 |
+
"csv",
|
| 128 |
+
ata_files={"test":"dataset.eval.csv"},
|
| 129 |
+
delimiter=",")["test"]
|
| 130 |
dataset = dataset.map(speech_file_to_array_fn)
|
| 131 |
|
| 132 |
result = dataset.map(predict, batched=True, batch_size=4)
|
| 133 |
wer = load_metric("wer")
|
| 134 |
cer = load_metric("cer")
|
| 135 |
|
| 136 |
+
print("WER: {:.2f}".format(wer.compute(
|
| 137 |
+
predictions=result["prediction"],
|
| 138 |
+
references=result["reference"])))
|
| 139 |
+
print("CER: {:.2f}".format(cer.compute(
|
| 140 |
+
predictions=result["prediction"],
|
| 141 |
+
references=result["reference"])))
|
| 142 |
```
|
| 143 |
|
| 144 |
*Result (WER) on common-voice 6.1*:
|