Spaces:
Running
Running
html + js
Browse files- helper.js +205 -0
- index.html +271 -17
helper.js
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Flag emoji shown next to each source language, keyed by ISO 639-1 code.
const langFlags = {
  fr: '🇫🇷',
  es: '🇪🇸',
  pt: '🇵🇹',
  de: '🇩🇪',
  it: '🇮🇹',
};
|
| 2 |
+
// English display name of each source language, keyed by ISO 639-1 code.
const langNames = {
  fr: 'French',
  es: 'Spanish',
  pt: 'Portuguese',
  de: 'German',
  it: 'Italian',
};
|
| 3 |
+
|
| 4 |
+
// Short-form sample filenames per source language.
// The filenames listed in trailing comments are additional samples that are
// currently disabled.
const shortformFilenamesPerLang = {
  fr: [
    "30ef344ae8687926.mp3",
    "4539f03d07ce7fbf.mp3",
  ], // "6d6261093edc78c2.mp3", "6d6261093edc78c2.mp3"
  es: [
    "5dc1d533e21f43b2.mp3",
    "963de6cbb0eaee36.mp3",
  ], // "a22a3eff8576211c.mp3", "ff65061e3b636834.mp3"
  pt: [
    "1263b98457966b2a.mp3",
    "3a2a8fd3a3bd2feb.mp3",
  ], // "6cf8e09e87612d2f.mp3", "70a4955ff0149f5f.mp3"
  de: [
    "2d05ea9d4a065778.mp3",
    "3f5d622c2955df4c.mp3",
  ], // "64fbd8fd8ecd4d63.mp3", "93cce2bd8093062f.mp3"
  it: [
    "61fada964460ad67.mp3",
    "9c6657d3fe647ecb.mp3",
    "84fbf6f8271c43b4.mp3",
    "83fcc138b2a8df7f.mp3",
  ], // "a1fa8e69d4019e03.mp3", "f30cef780a80ca78.mp3"
};
|
| 11 |
+
|
| 12 |
+
// Long-form sample filenames per source language.
// Each filename is appended verbatim to a URL path, so it must not contain
// stray whitespace (a trailing space would request a different, missing file).
const longformFilenamesPerLang = {
  'fr': ["ee67adf3f3768b1d_11labs.mp3", "f9fcfb48c566cfad_11labs.mp3"],
  'es': ["02fc8ce1843e4638_11labs.mp3", "bb3e91e3f0488a24_11labs.mp3"],
  'pt': ["73725fb3cf2cf669_cartesia.mp3", "7b42a118f93b1867_cartesia.mp3"],
  'de': ["02df47e0d27a8b80_cartesia.mp3", "b0e7b4b91e9d91db_gradium.mp3"],
};
|
| 18 |
+
|
| 19 |
+
/**
 * Build the HTML string for an `<audio>` player pointing at `path`.
 *
 * The `src` attribute is quoted and HTML-escaped so that paths containing
 * spaces or special characters do not break the markup. The `type` attribute
 * is derived from the file extension (mp3/wav/ogg); for any other extension
 * it is omitted so the browser can detect the format itself.
 *
 * @param {string} path - URL or relative path of the audio file.
 * @returns {string} HTML markup for an audio element with controls.
 */
function createAudioHTML(path) {
  const mimeByExtension = {
    mp3: 'audio/mpeg',
    wav: 'audio/wav',
    ogg: 'audio/ogg',
  };
  const src = String(path);

  // Extension is read before any query string or fragment.
  const extMatch = /\.([a-z0-9]+)(?:[?#].*)?$/i.exec(src);
  const mime = extMatch ? mimeByExtension[extMatch[1].toLowerCase()] : undefined;

  const escapedSrc = src
    .replace(/&/g, '&amp;')
    .replace(/"/g, '&quot;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  const typeAttr = mime ? ` type="${mime}"` : '';

  return '<audio controls controlslist="nodownload" class="px-1"> ' +
    `<source src="${escapedSrc}"${typeAttr}>` +
    'Your browser does not support the audio element.</audio>';
}
|
| 24 |
+
|
| 25 |
+
/**
 * Fill one `<tr>` of a samples table with the language label and one media
 * cell per entry of `dirs`.
 *
 * The first row of each language group receives the language name and flag in
 * its first cell, which is given `rowspan = n_files_per_lang`. The first cell
 * of every other row covered by that rowspan is removed so that the columns
 * stay aligned.
 *
 * @param {HTMLTableRowElement} table_row - Row to fill.
 * @param {string} base_dir - Root directory of the samples.
 * @param {string} lang - Language code, used as a sub-directory and as the
 *   key into `langNames` / `langFlags`.
 * @param {string[]} dirs - One sub-directory per media column.
 * @param {string} filename - File name of the sample for this row.
 * @param {number} row_idx - Index of `table_row` within its parent's `rows`.
 * @param {number} n_files_per_lang - Number of rows in each language group.
 */
function generateExampleRow(table_row, base_dir, lang, dirs, filename, row_idx, n_files_per_lang) {
  if (!table_row) {
    // The markup has fewer rows than there are samples: skip this sample
    // instead of throwing and aborting the rest of the script.
    console.warn(`generateExampleRow: no table row at index ${row_idx} (lang "${lang}")`);
    return;
  }

  const isFirstRowOfLang = row_idx % n_files_per_lang === 0;

  // Put the flag and full language name in the first column
  if (isFirstRowOfLang) {
    const langCell = table_row.cells[0];
    langCell.innerHTML = `
      <div style="display: flex; align-items: center; justify-content: center; gap: 0.5em">
        <span style="font-size: 1em;">${langNames[lang] || lang}</span>
        <span style="font-size: 2em;">${langFlags[lang] || ''}</span>
      </div>
    `;
    langCell.setAttribute('rowspan', n_files_per_lang);
    langCell.style.verticalAlign = "middle";

    // Remove the first cell from EVERY following row covered by the rowspan
    // (not only the next one), otherwise groups of more than two rows end up
    // with an extra cell.
    const siblingRows = table_row.parentElement.rows;
    for (let k = 1; k < n_files_per_lang; k++) {
      const spannedRow = siblingRows[row_idx + k];
      if (spannedRow) {
        spannedRow.deleteCell(0);
      }
    }
  }

  // Because of the rowspan, only the first row of a language group still has
  // the language cell in front of the media cells.
  const col_offset = isFirstRowOfLang ? 1 : 0;

  for (let col_idx = 0; col_idx < dirs.length; col_idx++) {
    const cell = table_row.cells[col_idx + col_offset];
    if (!cell) {
      console.warn(`generateExampleRow: row ${row_idx} has no cell for column "${dirs[col_idx]}"`);
      continue;
    }

    const p = base_dir + '/' + lang + '/' + dirs[col_idx] + '/' + filename;
    const container = cell.querySelector('div') || cell;

    if (p.endsWith('txt')) {
      // Text sample: fetch it and append its content to the cell.
      const req = new XMLHttpRequest();
      req.onreadystatechange = function() {
        if (this.readyState === this.DONE) {
          // `this` is the request that fired the event, so each callback
          // reads its own response.
          // NOTE(review): the response is inserted as HTML, as before;
          // this assumes the .txt files are trusted content.
          container.innerHTML += '<font size="-1">' + this.responseText + '</font>';
        }
      };
      req.open('GET', p);
      req.send(null);
    } else {
      container.innerHTML += `
        <div style="display: flex; justify-content: center; align-items: center;">
          ${createAudioHTML(p)}
        </div>
      `;
    }
  }
}
|
| 72 |
+
|
| 73 |
+
/**
 * Populate the `<tbody>` of the table with id `tableId`, one row per sample
 * of each language in `langs`, by calling `generateExampleRow` on the
 * pre-existing rows.
 *
 * @param {string} tableId - id of the `<table>` element to fill.
 * @param {string} base_dir - Root directory of the samples.
 * @param {Object<string, string[]>} fnames_per_lang - Filenames per language.
 * @param {string[]} langs - Language codes, in display order.
 */
function generateSamplesTable(tableId, base_dir, fnames_per_lang, langs) {
  const body = document.getElementById(tableId).querySelector('tbody');
  // Row indexes are computed from the first language's sample count:
  // all langs must have the same number of samples.
  const groupSize = fnames_per_lang[langs[0]].length;
  const mediaDirs = ['source', 'hibiki-zero', 'seamless'];

  langs.forEach((lang, langPos) => {
    fnames_per_lang[lang].forEach((fname, samplePos) => {
      const rowPos = groupSize * langPos + samplePos;
      generateExampleRow(body.rows[rowPos], base_dir, lang, mediaDirs, fname, rowPos, groupSize);
    });
  });
}
|
| 86 |
+
|
| 87 |
+
// Fill the three static sample tables, in page order:
// [table id, samples root directory, filenames per language, languages].
const sampleTables = [
  ['shortform-table', 'data/europarl_st', shortformFilenamesPerLang, ['fr', 'es', 'pt', 'de']],
  ['longform-table', 'data/audio_ntrex_4L', longformFilenamesPerLang, ['fr', 'es', 'pt', 'de']],
  ['shortform-table-it', 'data/europarl_st', shortformFilenamesPerLang, ['it']],
];
for (const [tableId, baseDir, filenamesPerLang, tableLangs] of sampleTables) {
  generateSamplesTable(tableId, baseDir, filenamesPerLang, tableLangs);
}
|
| 90 |
+
|
| 91 |
+
// Borrowed from https://nu-dialogue.github.io/j-moshi/
|
| 92 |
+
// Builds the "#multistream-table" once the DOM is ready: a header row, then one
// body row per source language with one WaveSurfer waveform and one play/pause
// button per compared system.
$(document).ready(function () {

  // Header label of each waveform column, in the same order as the files
  // inside each entry of `rows`.
  const columns = ['Hibiki-Zero', 'Seamless'];

  // Language code of each entry of `rows` (parallel arrays).
  const rowLangIds = ['fr', 'es', 'pt', 'de'];

  const rows = [
    [
      'data-stereo/hibiki-zero_fr_3963c038b9f8d311_gradium.mp3',
      'data-stereo/seamless_fr_3963c038b9f8d311_gradium.mp3'
    ],
    [
      'data-stereo/hibiki-zero_es_949ebe18ff5f86ec_cartesia.mp3',
      'data-stereo/seamless_es_949ebe18ff5f86ec_cartesia.mp3'
    ],
    [
      'data-stereo/hibiki-zero_pt_4bb12dfdfd3877d8_11labs.mp3',
      'data-stereo/seamless_pt_4bb12dfdfd3877d8_11labs.mp3'
    ],
    [
      'data-stereo/hibiki-zero_de_3bf4c877f039e01a_11labs.mp3',
      'data-stereo/seamless_de_3bf4c877f039e01a_11labs.mp3'
    ],
  ];

  const table = $('#multistream-table');

  /* ---------- Header ---------- */
  const thead = $('<thead>');
  const headerRow = $('<tr>');

  headerRow.append($('<th>').text('Source language').css({'white-space': 'nowrap', 'text-align': 'center'}));

  columns.forEach((header) => {
    headerRow.append($('<th style="text-align: center">').text(header));
  });

  thead.append(headerRow);
  table.append(thead);

  /* ---------- Body ---------- */
  const tbody = $('<tbody>');

  rows.forEach((files, i) => {
    const row = $('<tr>');

    // Language label cell with big flag. Fall back to the raw language code
    // when no display name is known, as generateExampleRow does, instead of
    // rendering "undefined".
    const langId = rowLangIds[i];
    const langName = langNames[langId] || langId;
    const flag = langFlags[langId] || '';
    row.append(
      $('<td>')
        .css({
          'font-weight': 'bold',
          'white-space': 'nowrap',
          'vertical-align': 'middle'
        })
        .html(
          `<div style="display: flex; align-items: center; justify-content: center; gap: 0.5em">
            <span style="font-size: 1em;">${langName}</span>
            <span style="font-size: 2em;">${flag}</span>
          </div>`
        )
    );

    files.forEach((file, j) => {
      const waveCell = $('<td style="text-align: center; vertical-align: middle;">');
      // Empty placeholder; the waveform is attached to it by id further down.
      const waveform = $('<div>').attr('id', `waveform-${i}-${j}`);
      waveCell.append(waveform);

      // type="button" keeps the button from acting as a submit button if the
      // table is ever placed inside a form.
      const playPauseButton = `
        <button type="button" class="btn btn-secondary mt-1" id="play-pause-${i}-${j}">
          <i class="bi bi-play-fill"></i> Play /
          <i class="bi bi-pause-fill"></i> Pause
        </button>
      `;
      waveCell.append(playPauseButton);
      row.append(waveCell);
    });

    tbody.append(row);
  });

  table.append(tbody);

  /* ---------- WaveSurfer ---------- */
  // Runs after the table has been appended, because the waveform containers
  // and the buttons are looked up by selector.
  rows.forEach((files, i) => {
    files.forEach((file, j) => {
      const wavesurfer = WaveSurfer.create({
        container: `#waveform-${i}-${j}`,
        url: file,
        // One colour pair per audio channel.
        splitChannels: [
          {
            waveColor: '#39f2aeff',
            progressColor: '#808080',
          },
          {
            waveColor: '#ffab40ff',
            progressColor: '#000000',
          }
        ],
        barWidth: 2,
        height: 55,
        width: 650,
        normalize: true,
      });

      $(`#play-pause-${i}-${j}`).click(() => {
        wavesurfer.playPause();
      });
    });
  });

});
|
index.html
CHANGED
|
@@ -1,19 +1,273 @@
|
|
| 1 |
-
<!
|
| 2 |
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
</html>
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<title>Hibiki</title>
|
| 5 |
+
<link
|
| 6 |
+
href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
|
| 7 |
+
rel="stylesheet"
|
| 8 |
+
/>
|
| 9 |
+
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.min.css">
|
| 10 |
+
<meta charset="utf-8" />
|
| 11 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 12 |
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
|
| 13 |
+
<script src="https://unpkg.com/wavesurfer.js@7"></script>
|
| 14 |
+
<script src="helper.js" defer></script>
|
| 15 |
+
<!-- <script>
|
| 16 |
+
window.addEventListener('DOMContentLoaded', () => {
|
| 17 |
+
const disclaimer = document.getElementById('browser-disclaimer');
|
| 18 |
+
if (!disclaimer) return;
|
| 19 |
+
const isChrome = /Chrome/.test(navigator.userAgent) && /Google Inc/.test(navigator.vendor);
|
| 20 |
+
if (isChrome) {
|
| 21 |
+
disclaimer.style.display = 'none';
|
| 22 |
+
}
|
| 23 |
+
});
|
| 24 |
+
</script> -->
|
| 25 |
+
<style>
|
| 26 |
+
h1, h2, h3, h4, h5, h6, body, b, strong {color: #595959}
|
| 27 |
+
.container {max-width: 1620px}
|
| 28 |
+
.no-hover:hover td {box-shadow: none !important}
|
| 29 |
+
.centered-video {
|
| 30 |
+
display: block;
|
| 31 |
+
margin: 0 auto;
|
| 32 |
+
max-width: 90%;
|
| 33 |
+
min-width: 400px;
|
| 34 |
+
}
|
| 35 |
+
.video-legend {
|
| 36 |
+
margin-top: 6px;
|
| 37 |
+
font-size: 1em;
|
| 38 |
+
color: #555;
|
| 39 |
+
text-align: center;
|
| 40 |
+
}
|
| 41 |
+
</style>
|
| 42 |
+
</head>
|
| 43 |
+
|
| 44 |
+
<body>
|
| 45 |
+
|
| 46 |
+
<!-- DISCLAIMER -->
|
| 47 |
+
<div id="browser-disclaimer" style="
|
| 48 |
+
background-color: rgb(221, 255, 243);
|
| 49 |
+
color: #333;
|
| 50 |
+
padding: 10px 20px;
|
| 51 |
+
text-align: center;
|
| 52 |
+
font-family: sans-serif;
|
| 53 |
+
font-size: 14px;
|
| 54 |
+
">
|
| 55 |
+
For faster loading of audio samples, we recommend using <strong>Google Chrome</strong>.
|
| 56 |
+
</div>
|
| 57 |
+
|
| 58 |
+
<!-- HEADER -->
|
| 59 |
+
<div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
|
| 60 |
+
<div class="text-center">
|
| 61 |
+
<h1>Simultaneous Speech-to-Speech Translation Without Aligned Data</h1>
|
| 62 |
+
<p class="lead">
|
| 63 |
+
[...]
|
| 64 |
+
</p>
|
| 65 |
+
<p class="fst-italic mb-0">
|
| 66 |
+
Anonymous Authors
|
| 67 |
+
</p>
|
| 68 |
+
<p><b>Anonymous Institution</b></p>
|
| 69 |
+
</div>
|
| 70 |
+
|
| 71 |
+
<p>
|
| 72 |
+
<b>Abstract.</b>
|
| 73 |
+
Simultaneous speech translation requires translating source speech into a target language in real-time while handling non-monotonic word dependencies.
|
| 74 |
+
Traditional approaches rely on supervised training with word-level aligned data, which is difficult to collect at scale and thus depends on synthetic alignments using language-specific heuristics that are suboptimal.
|
| 75 |
+
We propose <i>Hibiki-Zero</i>, which eliminates the need for word-level alignments entirely.
|
| 76 |
+
This fundamentally simplifies the training pipeline and enables seamless scaling to diverse languages with varying grammatical structures, removing the bottleneck of designing language-specific alignment heuristics.
|
| 77 |
+
We first train on sentence-level aligned data to learn speech translation at high latency, then apply a novel reinforcement learning strategy using GRPO to optimize latency while preserving translation quality.
|
| 78 |
+
Hibiki-Zero achieves state-of-the-art performance in translation accuracy, latency, voice transfer, and naturalness across five X-to-English tasks.
|
| 79 |
+
Moreover, we demonstrate that our model can be adapted to support a new input language with less than 1000h of speech data.
|
| 80 |
+
We provide examples as well as models and we release a benchmark containing 15h of multilingual data for speech translation evaluation.
|
| 81 |
+
</p>
|
| 82 |
+
|
| 83 |
+
</div>
|
| 84 |
+
|
| 85 |
+
<!-- IN THE WILD -->
|
| 86 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 87 |
+
<h3>In the Wild Examples 🇫🇷🇪🇸🇵🇹🇩🇪</h3>
|
| 88 |
+
<p class="mb-0">
|
| 89 |
+
</p>
|
| 90 |
+
<div class="container pt-3 table-responsive">
|
| 91 |
+
<table class="table table-hover" width="100%">
|
| 92 |
+
<tr class="no-hover">
|
| 93 |
+
<!-- FR video -->
|
| 94 |
+
<td>
|
| 95 |
+
<video class="embed-responsive-item centered-video" controls>
|
| 96 |
+
<source src="videos/clip_fr_translated.mp4" type="video/mp4">
|
| 97 |
+
Your browser does not support HTML video.
|
| 98 |
+
</video>
|
| 99 |
+
<div class="video-legend">
|
| 100 |
+
Source:
|
| 101 |
+
<a href="https://www.youtube.com/watch?v=3nox96KbhV0" target="_blank">
|
| 102 |
+
The legendary Paris 2024 Olympic Games of Léon Marchand.
|
| 103 |
+
</a>
|
| 104 |
+
- <i>Eurosport France</i>
|
| 105 |
+
</div>
|
| 106 |
+
</td>
|
| 107 |
+
<!-- DE video -->
|
| 108 |
+
<td>
|
| 109 |
+
<video class="embed-responsive-item centered-video" controls>
|
| 110 |
+
<source src="videos/clip_de_translated.mp4" type="video/mp4">
|
| 111 |
+
Your browser does not support HTML video.
|
| 112 |
+
</video>
|
| 113 |
+
<div class="video-legend">
|
| 114 |
+
Source:
|
| 115 |
+
<a href="https://www.youtube.com/watch?v=4kFw5gi9JKI" target="_blank">
|
| 116 |
+
Biathlon 2025: Franziska Preuß wins her first World Championship.
|
| 117 |
+
</a>
|
| 118 |
+
- <i>Eurosport Germany</i>
|
| 119 |
+
</div>
|
| 120 |
+
</td>
|
| 121 |
+
</tr>
|
| 122 |
+
<tr class="no-hover">
|
| 123 |
+
<!-- ES video -->
|
| 124 |
+
<td>
|
| 125 |
+
<video class="embed-responsive-item centered-video" controls>
|
| 126 |
+
<source src="videos/clip_es_translated.mp4" type="video/mp4">
|
| 127 |
+
Your browser does not support HTML video.
|
| 128 |
+
</video>
|
| 129 |
+
<div class="video-legend">
|
| 130 |
+
Source:
|
| 131 |
+
<a href="https://www.youtube.com/watch?v=O0M-o7CnmUE" target="_blank">
|
| 132 |
+
Australian Open 2026 Final: Carlos Alcaraz vs. Novak Djokovic.
|
| 133 |
+
</a>
|
| 134 |
+
- <i>Eurosport España</i>
|
| 135 |
+
</div>
|
| 136 |
+
</td>
|
| 137 |
+
<!-- PT video -->
|
| 138 |
+
<td>
|
| 139 |
+
<video class="embed-responsive-item centered-video" controls>
|
| 140 |
+
<source src="videos/clip_pt_translated.mp4" type="video/mp4">
|
| 141 |
+
Your browser does not support HTML video.
|
| 142 |
+
</video>
|
| 143 |
+
<div class="video-legend">
|
| 144 |
+
Source:
|
| 145 |
+
<a href="https://www.facebook.com/share/v/1HAYhUFVm3/" target="_blank">
|
| 146 |
+
Iuri Leitao and Rui Oliveira win gold for Portugal at the Paris 2024 Olympics.
|
| 147 |
+
</a>
|
| 148 |
+
- <i>Facebook</i>
|
| 149 |
+
</div>
|
| 150 |
+
</td>
|
| 151 |
+
</tr>
|
| 152 |
+
</table>
|
| 153 |
+
</div>
|
| 154 |
+
</div>
|
| 155 |
+
|
| 156 |
+
<!-- MULTISTREAM -->
|
| 157 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 158 |
+
<h3>Multistream Visualization</h3>
|
| 159 |
+
<p class="mb-0">
|
| 160 |
+
The source audios (from our long-form evaluation dataset Audio-NTREX-4L) and translated versions are on different channels.
|
| 161 |
+
The volume of the sources is reduced so that it's easier to hear the translations.
|
| 162 |
+
</p>
|
| 163 |
+
<div class="container pt-3 table-responsive">
|
| 164 |
+
<table class="table" id="multistream-table"></table>
|
| 165 |
+
</div>
|
| 166 |
+
</div>
|
| 167 |
+
|
| 168 |
+
<!-- SHORTFORM -->
|
| 169 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 170 |
+
<h3>Short-form Simultaneous Translations</h3>
|
| 171 |
+
<p class="mb-0"> The source audios come from our Europarl-ST evaluation data. </p>
|
| 172 |
+
<div class="container pt-3 table-responsive">
|
| 173 |
+
<table class="table" id="shortform-table">
|
| 174 |
+
<thead>
|
| 175 |
+
<tr>
|
| 176 |
+
<th style="text-align: center; min-width: 150px;">Source language</th>
|
| 177 |
+
<th style="text-align: center;min-width: 200px;">Source</th>
|
| 178 |
+
<th style="text-align: center;">Hibiki-Zero</th>
|
| 179 |
+
<th style="text-align: center">Seamless</th>
|
| 180 |
+
</tr>
|
| 181 |
+
</thead>
|
| 182 |
+
<tbody>
|
| 183 |
+
<!-- fr -->
|
| 184 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 185 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 186 |
+
<!-- <tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 187 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr> -->
|
| 188 |
+
<!-- es -->
|
| 189 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 190 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 191 |
+
<!-- <tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 192 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr> -->
|
| 193 |
+
<!-- pt -->
|
| 194 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 195 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 196 |
+
<!-- <tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 197 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr> -->
|
| 198 |
+
<!-- de -->
|
| 199 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 200 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 201 |
+
<!-- <tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 202 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr> -->
|
| 203 |
+
</tbody>
|
| 204 |
+
</table>
|
| 205 |
+
</div>
|
| 206 |
+
</div>
|
| 207 |
+
|
| 208 |
+
<!-- LONGFORM -->
|
| 209 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 210 |
+
<h3>Long-form Simultaneous Translations</h3>
|
| 211 |
+
<p class="mb-0"> The source audios are taken from our Audio-NTREX-4L evaluation dataset.</p>
|
| 212 |
+
<div class="container pt-3 table-responsive">
|
| 213 |
+
<table class="table" id="longform-table">
|
| 214 |
+
<thead>
|
| 215 |
+
<tr>
|
| 216 |
+
<th style="text-align: center; min-width: 150px;">Source language</th>
|
| 217 |
+
<th style="text-align: center;min-width: 200px;">Source</th>
|
| 218 |
+
<th style="text-align: center;">Hibiki-Zero</th>
|
| 219 |
+
<th style="text-align: center;">Seamless</th>
|
| 220 |
+
</tr>
|
| 221 |
+
</thead>
|
| 222 |
+
<tbody>
|
| 223 |
+
<!-- fr -->
|
| 224 |
+
<tr> <td></td> <td></td> <td></td> <td></td> </tr>
|
| 225 |
+
<tr> <td></td> <td></td> <td></td> <td></td> </tr>
|
| 226 |
+
<!-- es -->
|
| 227 |
+
<tr> <td></td> <td></td> <td></td> <td></td> </tr>
|
| 228 |
+
<tr> <td></td> <td></td> <td></td> <td></td> </tr>
|
| 229 |
+
<!-- pt -->
|
| 230 |
+
<tr> <td></td> <td></td> <td></td> <td></td> </tr>
|
| 231 |
+
<tr> <td></td> <td></td> <td></td> <td></td> </tr>
|
| 232 |
+
<!-- de -->
|
| 233 |
+
<tr> <td></td> <td></td> <td></td> <td></td> </tr>
|
| 234 |
+
<tr> <td></td> <td></td> <td></td> <td></td> </tr>
|
| 235 |
+
</tbody>
|
| 236 |
+
</table>
|
| 237 |
+
</div>
|
| 238 |
+
</div>
|
| 239 |
+
|
| 240 |
+
<!-- SHORTFORM ITALIAN -->
|
| 241 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 242 |
+
<h3>Short-form Simultaneous Translations from Italian</h3>
|
| 243 |
+
<p class="mb-0"> The source audios come from our Europarl-ST evaluation data. Hibiki-Zero-IT denotes our model adapted for translation from Italian with less than 1000 hours of Italian-to-English data. </p>
|
| 244 |
+
<div class="container pt-3 table-responsive">
|
| 245 |
+
<table class="table" id="shortform-table-it">
|
| 246 |
+
<thead>
|
| 247 |
+
<tr>
|
| 248 |
+
<th style="text-align: center; min-width: 150px;">Source language</th>
|
| 249 |
+
<th style="text-align: center;min-width: 200px;">Source</th>
|
| 250 |
+
<th style="text-align: center;">Hibiki-Zero-IT</th>
|
| 251 |
+
<th style="text-align: center">Seamless</th>
|
| 252 |
+
</tr>
|
| 253 |
+
</thead>
|
| 254 |
+
<tbody>
|
| 255 |
+
<!-- it -->
|
| 256 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 257 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 258 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 259 |
+
<tr> <td></td> <td></td> <td></td> <td></td></tr>
|
| 260 |
+
</tbody>
|
| 261 |
+
</table>
|
| 262 |
+
</div>
|
| 263 |
+
</div>
|
| 264 |
+
|
| 265 |
+
<!-- TAIL -->
|
| 266 |
+
<div class="container p-5 mb-5 bg-white rounded">
|
| 267 |
+
<p class="mb-0">
|
| 268 |
+
This page was adapted from the <a href="https://google-research.github.io/seanet/soundstorm/examples">SoundStorm project page</a>.
|
| 269 |
+
</p>
|
| 270 |
+
</div>
|
| 271 |
+
|
| 272 |
+
</body>
|
| 273 |
</html>
|