| <!DOCTYPE html> |
| <html lang="en"> |
|
|
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <title>Speech-to-Speech Model Comparison</title> |
| <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet"> |
| <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script> |
| <style> |
| /* ---- Page shell ---- */ |
| body { |
|   font-family: 'Arial', sans-serif; |
|   background-color: #f4f6f9; |
| } |
| |
| /* White rounded card that wraps the page content. */ |
| .container { |
|   padding: 30px; |
|   border-radius: 10px; |
|   background-color: white; |
|   box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1); |
| } |
| |
| h3 { |
|   color: #333; |
|   font-size: 1.2rem; |
|   font-weight: bold; |
| } |
| |
| /* ---- Pill-shaped inputs and buttons ---- */ |
| .form-control { |
|   padding: 15px; |
|   border-radius: 25px; |
| } |
| |
| .btn { |
|   padding: 8px 16px; |
|   border-radius: 25px; |
|   font-size: 0.9rem; |
|   transition: background-color 0.3s ease; |
| } |
| |
| .btn-primary, |
| .btn-success { |
|   border: none; |
| } |
| |
| .btn-primary { |
|   background-color: #007bff; |
| } |
| |
| .btn-primary:hover { |
|   background-color: #0056b3; |
| } |
| |
| .btn-success { |
|   background-color: #28a745; |
| } |
| |
| .btn-success:hover { |
|   background-color: #218838; |
| } |
| |
| /* Swapped in for .btn-success by selectModel() on the option the user picked. */ |
| .btn-selected { |
|   color: white !important; |
|   background-color: #155724 !important; |
| } |
| |
| /* The two "Model A" / "Model B" answer buttons. */ |
| .btn-option { |
|   margin: 0 10px; |
|   padding: 8px 20px; |
|   font-size: 0.9rem; |
| } |
| |
| /* ---- Category / task dropdowns ---- */ |
| #category-select, |
| #task-select-dropdown { |
|   margin: 0 auto; |
|   width: 120% !important; |
| } |
| |
| /* ---- Elements that start hidden and are revealed by the page script ---- */ |
| #test-content, |
| #task-description { |
|   display: none; |
| } |
| |
| #confirm-choice, |
| #next-test { |
|   display: none; |
|   transition: opacity 0.3s ease; |
| } |
| |
| /* Model-name panel: the .show class raises its opacity once a choice is confirmed. */ |
| #model-comparison { |
|   display: none; |
|   opacity: 0; |
|   transition: opacity 0.3s ease; |
| } |
| |
| #model-comparison.show { |
|   opacity: 1; |
| } |
| |
| /* Small button pinned near the top-right corner. */ |
| #switch-task { |
|   display: none; |
|   position: absolute; |
|   top: 10px; |
|   right: 20px; |
|   padding: 5px 10px; |
|   font-size: 0.8rem; |
| } |
| </style> |
| </head> |
|
|
| <body> |
| <div class="container py-5"> |
| <h3 class="text-center mb-4">Speech-to-Speech Model Comparison</h3> |
|
|
| <!-- Intro screen: explains the evaluation and plays one worked example. Hidden by startTest(). --> |
| <div id="evaluation-info" class="mb-5"> |
|   <!-- |
|     This wrapper is a <div>, not a <p>: it holds nested <p> elements, and a <p> may only |
|     contain phrasing content. With a <p> here the parser closed it at the first nested |
|     paragraph and the final end tag produced an extra empty paragraph. |
|   --> |
|   <div class="text-start"> |
|     <strong>Welcome to the Speech-to-Speech (S2S) Model Evaluation!</strong> |
|     <br><br> |
|     In this evaluation, you will assess the performance of 4 S2S models: |
|     <strong>ChatGPT-4o</strong>, <strong>FunAudioLLM</strong>, <strong>SpeechGPT</strong>, and |
|     <strong>Mini-Omni</strong>. |
|     The goal is to evaluate how well these models handle various speech tasks across different domains. |
|     <br><br> |
|     Once you select a specific domain and task (e.g., <em>Educational Tutoring</em> and <em>Rhythm Control</em>), |
|     you will proceed to the evaluation stage. In each round, you will be presented with an audio input. |
|     For example: |
|     <br><br> |
| |
|     <!-- Sample input prompt --> |
|     <span style="vertical-align: middle; line-height: 1.2; display: inline-block;"><strong>Audio Sample:</strong></span> |
|     <audio controls style="vertical-align: middle;"> |
|       <source src="/static/audio/sample/input_audio.wav" type="audio/wav"> |
|     </audio> |
| |
|     <br><br> |
|     The corresponding text is: |
|     <em>"Say the following sentence at my speed first, then say it again very slowly: |
|     'Artificial intelligence is changing the world in many ways.'" </em> |
|     <small>(Note: the audio plays at 1.5x the normal speed.)</small> |
|     <br><br> |
|     The responses of different S2S models will be provided, and your task is to choose which response best follows |
|     the instructions. For example<small>(Note: During the evaluation process, you will be provided with responses from only the two models that have the most comparative significance.)</small>: |
|     <br><br> |
| |
|     <!-- Sample responses, one per model, each followed by a short assessment --> |
|     <span><strong>ChatGPT-4o:</strong></span> |
|     <audio controls style="vertical-align: middle;"> |
|       <source src="/static/audio/sample/4o_audio.wav" type="audio/wav"> |
|     </audio> |
|     <p class="text-start" style="margin-left: 20px;"> |
|       <strong>Performance:</strong> Speech: Partially followed the instruction on speed. Semantics: Accurately followed the instruction, with no semantic deviation or missing information. |
|     </p> |
| |
|     <span><strong>FunAudioLLM:</strong></span> |
|     <audio controls style="vertical-align: middle;"> |
|       <source src="/static/audio/sample/FunAudio_audio.wav" type="audio/wav"> |
|     </audio> |
|     <p class="text-start" style="margin-left: 20px;"> |
|       <strong>Performance:</strong> Speech: Partially followed the instruction on speed. Semantics: Accurately followed the instruction, with no semantic deviation or missing information. |
|     </p> |
| |
|     <span><strong>SpeechGPT:</strong></span> |
|     <audio controls style="vertical-align: middle;"> |
|       <source src="/static/audio/sample/SpeechGPT.wav" type="audio/wav"> |
|     </audio> |
|     <p class="text-start" style="margin-left: 20px;"> |
|       <strong>Performance:</strong> Speech: Did not follow the instruction on speed. Semantics: Partially followed the instruction, with minor semantic deviation and missing information. |
|     </p> |
| |
|     <span><strong>Mini-Omni:</strong></span> |
|     <audio controls style="vertical-align: middle;"> |
|       <source src="/static/audio/sample/mini-omni.wav" type="audio/wav"> |
|     </audio> |
|     <p class="text-start" style="margin-left: 20px;"> |
|       <strong>Performance:</strong> Speech: Did not follow the instruction on speed. Semantics: Did not follow the instruction, with significant semantic deviation and missing information. |
|     </p> |
| |
|     <p class="text-start"> |
|       After making your choice, you'll proceed to the next round. |
|     </p> |
|     <strong>Please enter your username and start the evaluation!</strong> |
|   </div> |
| </div> |
|
|
| <!-- Username entry. startTest() reads #username and hides this block once a name is given. --> |
| <div id="user-input" class="text-center"> |
|   <div class="mb-3"> |
|     <!-- Visually hidden label: a placeholder disappears on input and is not a reliable accessible name. --> |
|     <label for="username" class="visually-hidden">Your username</label> |
|     <input type="text" id="username" class="form-control w-50 mx-auto" placeholder="Your username"> |
|   </div> |
|   <button type="button" class="btn btn-primary" onclick="startTest()">Start Test</button> |
| </div> |
|
|
|
|
| <!-- Task picker: shown by startTest() and switchTask(); hidden again when a task starts. --> |
| <div id="task-select" class="text-center" style="display: none;"> |
|   <!-- Each heading doubles as the accessible name of the dropdown below it (aria-labelledby). --> |
|   <h3 class="my-4" id="category-title">Select Test Category:</h3> |
|   <div class="d-grid gap-2 col-6 mx-auto"> |
|     <!-- Changing the category makes populateTasks() rebuild the task dropdown. --> |
|     <select id="category-select" class="form-select mx-auto" aria-labelledby="category-title" onchange="populateTasks()"> |
|       <option value="" disabled selected>Select Category</option> |
|       <option value="educational">Educational Tutoring</option> |
|       <option value="social">Social Companionship</option> |
|       <option value="entertainment">Entertainment Dubbing</option> |
|       <option value="medical">Medical Consultation</option> |
|     </select> |
|   </div> |
| |
|   <!-- The heading, dropdown and button below stay hidden until populateTasks() sees a category. --> |
|   <h3 class="my-4" id="specific-task-title" style="display: none;">Select Specific Task:</h3> |
|   <div class="d-grid gap-2 col-6 mx-auto"> |
|     <!-- Options are filled in by populateTasks(). --> |
|     <select id="task-select-dropdown" class="form-select mx-auto" aria-labelledby="specific-task-title" style="display: none;"> |
|       <option value="" disabled selected>Select Specific Task</option> |
|     </select> |
|   </div> |
| |
|   <button type="button" class="btn btn-primary mt-4" id="start-task-btn" onclick="selectTaskFromDropdown()" |
|     style="display: none;">Start Task</button> |
| </div> |
|
|
| <!-- Shown while a task is running (see selectTaskFromDropdown); switchTask() returns to the task picker. --> |
| <button type="button" id="switch-task" class="btn btn-warning" onclick="switchTask()">Switch Category and Tasks</button> |
| |
| <!-- One evaluation round. Hidden by the #test-content style rule until a task has started. --> |
| <div id="test-content"> |
|   <div class="text-center"> |
| |
|     <!-- Filled from the server response (task_description). --> |
|     <div class="row justify-content-center"> |
|       <div class="col-md-6 text-start double-text" style="margin-bottom: 10px;"> |
|         <strong>Task description:</strong> <span id="task-description"></span> |
|       </div> |
|     </div> |
| |
|     <!-- Input prompt audio; loadNextTest() sets its src. --> |
|     <div class="row justify-content-center"> |
|       <div class="col-md-6 d-flex justify-content-center align-items-center mb-4"> |
|         <strong class="me-2">Audio:</strong> |
|         <audio id="input-audio" controls></audio> |
|       </div> |
|     </div> |
| |
|     <!-- Transcript of the input prompt. --> |
|     <div class="row justify-content-center"> |
|       <div class="col-md-6 text-start double-text" style="margin-bottom: 10px;"> |
|         <strong>Audio text:</strong> <span id="test-text"></span> |
|       </div> |
|     </div> |
| |
|     <div class="row justify-content-center"> |
|       <div class="col-md-6 text-start"> |
|         <p><strong>Question:</strong> Which of the following two models answers the result better?</p> |
|       </div> |
|     </div> |
| |
|     <!-- |
|       The two anonymised responses. The captions are plain bold text like the "Audio:" |
|       caption above; they are labels, not section headings. |
|     --> |
|     <div class="mb-4 text-center"> |
|       <div class="model-section d-flex align-items-center justify-content-center mb-3"> |
|         <strong class="me-2">Model A:</strong> |
|         <audio id="audio-a" controls></audio> |
|       </div> |
|       <div class="model-section d-flex align-items-center justify-content-center"> |
|         <strong class="me-2">Model B:</strong> |
|         <audio id="audio-b" controls></audio> |
|       </div> |
|     </div> |
| |
|     <!-- Answer buttons. selectModel() finds them by the .btn-option class and their visible text. --> |
|     <div class="d-flex justify-content-center mt-4"> |
|       <button type="button" class="btn btn-success btn-option mx-2" onclick="selectModel('A')">Model A</button> |
|       <button type="button" class="btn btn-success btn-option mx-2" onclick="selectModel('B')">Model B</button> |
|     </div> |
| |
|     <!-- Real model names; revealed by confirmChoice(). --> |
|     <div id="model-comparison" class="text-center mt-4"> |
|       <p>Model A: <span id="model-a"></span></p> |
|       <p>Model B: <span id="model-b"></span></p> |
|       <p>Your choice: <span id="chosen-model"></span></p> |
|     </div> |
| |
|     <!-- Both buttons start hidden via CSS; the script shows one or the other. --> |
|     <button type="button" id="confirm-choice" class="btn btn-primary mt-4" onclick="confirmChoice()">Confirm Selection</button> |
|     <button type="button" id="next-test" class="btn btn-primary mt-4" onclick="loadNextTest()">Next Test</button> |
|   </div> |
| </div> |
|
|
| <!-- Completion screen: revealed by loadNextTest() when the server answers 'Test completed'. --> |
| <div id="test-completed" class="text-center" style="display: none;"> |
|   <h3>Thank you for completing the <span id="completed-task"></span> test!</h3> |
|   <p>Would you like to test another category or task?</p> |
|   <button type="button" class="btn btn-primary" onclick="switchTask()">Yes</button> |
|   <button type="button" class="btn btn-secondary" onclick="endTest()">No</button> |
| </div> |
| <!-- End of the .container opened at the top of <body>; it previously had no closing tag. --> |
| </div> |
|
|
| |
| <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.bundle.min.js"></script> |
| <script> |
| let username; |
| let task; |
| let chosenModel; |
| let modelA, modelB; |
| |
| |
| const modelNames = { |
| "output_path_speechgpt": "SpeechGPT", |
| "output_path_miniomni": "Mini-Omni", |
| "output_path_4o": "ChatGPT-4o", |
| "output_path_funaudio": "FunAudioLLM", |
| "output_path_4o_cascade": "Cascade", |
| "output_path_4o_llama_omni": "LLaMA-Omni" |
| }; |
| |
| function startTest() { |
| username = $("#username").val(); |
| if (!username) { |
| alert("Please enter a username"); |
| return; |
| } |
| $("#evaluation-info").hide(); |
| $("#user-input").hide(); |
| $("#task-select").show(); |
| } |
| |
| function switchTask() { |
| |
| $("#task-description").text(''); |
| $("#test-content").hide(); |
| $("#test-text").text(''); |
| $("#input-audio").attr("src", ''); |
| $("#audio-a").attr("src", ''); |
| $("#audio-b").attr("src", ''); |
| $("#chosen-model").text(''); |
| $("#model-a").text(''); |
| $("#model-b").text(''); |
| $("#confirm-choice").hide(); |
| $("#next-test").hide(); |
| $("#model-comparison").removeClass('show').hide(); |
| |
| |
| $("#test-completed").hide(); |
| $("#task-select").show(); |
| $("#switch-task").hide(); |
| } |
| |
| function selectTask(selectedTask) { |
| task = selectedTask; |
| |
| |
| $("#task-description").text(''); |
| $("#test-text").text(''); |
| $("#input-audio").attr("src", ''); |
| $("#audio-a").attr("src", ''); |
| $("#audio-b").attr("src", ''); |
| $("#chosen-model").text(''); |
| $("#model-a").text(''); |
| $("#model-b").text(''); |
| $("#confirm-choice").hide(); |
| $("#next-test").hide(); |
| $("#model-comparison").removeClass('show').hide(); |
| |
| |
| $("#task-select").hide(); |
| $("#switch-task").show(); |
| |
| |
| $.ajax({ |
| url: '/start_test', |
| type: 'POST', |
| contentType: 'application/json', |
| data: JSON.stringify({ username: username, task: task }), |
| success: function (data) { |
| $("#test-content").show(); |
| loadNextTest(); |
| }, |
| error: function (xhr, status, error) { |
| console.error("Error occurred: ", status, error); |
| } |
| }); |
| } |
| |
| function populateTasks() { |
| const category = $("#category-select").val(); |
| const taskDropdown = $("#task-select-dropdown"); |
| |
| |
| taskDropdown.empty(); |
| |
| taskDropdown.append('<option value="" disabled selected>Select Specific Task</option>'); |
| |
| |
| if (category === 'educational') { |
| taskDropdown.append('<option value="pronunciation">Correcting pronunciation ability</option>'); |
| taskDropdown.append('<option value="rhythm">Rhythm control capabilities</option>'); |
| taskDropdown.append('<option value="translation">Cross-language translation with emotion</option>'); |
| taskDropdown.append('<option value="language">Language consistency</option>'); |
| taskDropdown.append('<option value="pause">Pause and segmentation</option>'); |
| taskDropdown.append('<option value="polyphone">Polyphonic word comprehension</option>'); |
| taskDropdown.append('<option value="stress">Emphasis control</option>'); |
| } else if (category === 'social') { |
| taskDropdown.append('<option value="emotion">Emotion recognition and expression</option>'); |
| taskDropdown.append('<option value="identity">Identity coping ability</option>'); |
| taskDropdown.append('<option value="humor">Implications ability</option>'); |
| taskDropdown.append('<option value="irony">Sarcasm detection</option>'); |
| } else if (category === 'entertainment') { |
| taskDropdown.append('<option value="natural">Ability to simulate natural sound</option>'); |
| taskDropdown.append('<option value="singing">Singing ability</option>'); |
| taskDropdown.append('<option value="tongue">Tongue twisters capabilities</option>'); |
| taskDropdown.append('<option value="crosstalk">Crosstalk ability</option>'); |
| taskDropdown.append('<option value="poetry">Poetry recitation</option>'); |
| taskDropdown.append('<option value="role">Role-playing</option>'); |
| taskDropdown.append('<option value="story">Storytelling</option>'); |
| } else if (category === 'medical') { |
| taskDropdown.append('<option value="healthcare">Health consultation</option>'); |
| taskDropdown.append('<option value="illness">Querying symptoms</option>'); |
| taskDropdown.append('<option value="psychological">Psychological comfort</option>'); |
| } |
| |
| |
| if (category) { |
| $("#specific-task-title").show(); |
| $("#task-select-dropdown").show(); |
| $("#start-task-btn").show(); |
| } else { |
| $("#specific-task-title").hide(); |
| $("#task-select-dropdown").hide(); |
| $("#start-task-btn").hide(); |
| } |
| } |
| |
| |
| function selectTaskFromDropdown() { |
| const selectedTask = $("#task-select-dropdown").val(); |
| if (selectedTask) { |
| task = selectedTask; |
| $.ajax({ |
| url: '/start_test', |
| type: 'POST', |
| contentType: 'application/json', |
| data: JSON.stringify({ username: username, task: task }), |
| success: function (data) { |
| |
| $("#task-description").text(data.task_description); |
| $("#task-description").show(); |
| $("#task-select").hide(); |
| $("#test-content").show(); |
| $("#switch-task").show(); |
| loadNextTest(); |
| }, |
| error: function (xhr, status, error) { |
| console.error("Error occurred: ", status, error); |
| } |
| }); |
| } else { |
| alert("Please select a specific task."); |
| } |
| } |
| |
| |
| |
| |
| function loadNextTest() { |
| $.get('/next_test', function (data) { |
| if (data.message === 'Test completed') { |
| $("#test-content").hide(); |
| $("#test-completed").show(); |
| |
| |
| $("#completed-task").text(task); |
| |
| |
| sessionStorage.removeItem('current_index'); |
| } else { |
| |
| console.log(data); |
| $("#task-description").text(data.task_description); |
| $("#test-text").text(data.text); |
| $("#input-audio").attr("src", data.input_path); |
| $("#audio-a").attr("src", data.audio_a); |
| $("#audio-b").attr("src", data.audio_b); |
| |
| |
| modelA = modelNames[data.model_a]; |
| modelB = modelNames[data.model_b]; |
| $("#model-a").text(modelA); |
| $("#model-b").text(modelB); |
| |
| $("#next-test").hide(); |
| $("#model-comparison").hide(); |
| $("#confirm-choice").show(); |
| chosenModel = null; |
| $(".btn-option").prop('disabled', false); |
| $(".btn-option").removeClass("btn-selected").addClass("btn-success"); |
| } |
| }, 'json').fail(function (xhr, status, error) { |
| console.error("Failed to load test data:", status, error); |
| }); |
| } |
| |
| function endTest() { |
| |
| alert("Thank you for participating in the test!"); |
| |
| window.location.href = "/thank_you"; |
| } |
| |
| function selectModel(model) { |
| |
| chosenModel = model; |
| |
| |
| $(".btn-option").prop('disabled', false); |
| |
| |
| $(".btn-option").removeClass("btn-selected").addClass("btn-success"); |
| |
| |
| if (model === 'A') { |
| $("button:contains('Model A')").removeClass("btn-success").addClass("btn-selected"); |
| } else if (model === 'B') { |
| $("button:contains('Model B')").removeClass("btn-success").addClass("btn-selected"); |
| } |
| } |
| |
| function confirmChoice() { |
| |
| if (!chosenModel) { |
| alert("Please select a model before confirming."); |
| return; |
| } |
| |
| |
| $(".btn-option").prop('disabled', true); |
| |
| |
| if (chosenModel === 'A') { |
| $("#chosen-model").text(modelA); |
| } else { |
| $("#chosen-model").text(modelB); |
| } |
| |
| |
| $("#model-a").text(modelA); |
| $("#model-b").text(modelB); |
| |
| |
| $("#model-comparison").addClass('show'); |
| $("#model-comparison").show(); |
| |
| |
| $("#confirm-choice").hide(); |
| $("#next-test").show(); |
| |
| |
| $.ajax({ |
| url: '/submit_result', |
| type: 'POST', |
| contentType: 'application/json', |
| data: JSON.stringify({ chosen_model: chosenModel }), |
| success: function (data) { |
| |
| }, |
| error: function (xhr, status, error) { |
| console.error("Error occurred: ", status, error); |
| } |
| }); |
| } |
| </script> |
| </body> |
|
|
| </html> |
|
|