File size: 13,275 Bytes
2099543
 
 
7ff6285
2099543
 
 
 
 
fa5fd4a
2099543
6917ff7
2099543
 
 
 
7ff6285
a44ab22
2099543
9879209
 
2099543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9879209
 
2099543
6917ff7
2099543
 
 
b29a982
2099543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9879209
2099543
6917ff7
 
2099543
 
 
 
 
 
 
6917ff7
 
 
9879209
2099543
 
 
a44ab22
c63fb3b
a44ab22
 
 
2099543
9879209
2099543
a44ab22
2099543
 
 
 
 
 
 
 
 
 
 
 
9879209
2099543
a44ab22
2099543
 
 
 
 
 
a44ab22
2099543
 
 
 
 
 
 
 
 
 
 
 
9879209
2099543
 
 
 
fa5fd4a
 
 
 
 
 
7ff6285
 
 
 
 
 
 
 
 
a44ab22
 
2099543
 
a44ab22
9879209
b29a982
9879209
 
2099543
 
 
6917ff7
2099543
 
 
d767787
 
 
068c34f
d767787
 
058b2cb
474b479
2597a00
93164b3
 
 
 
058b2cb
2597a00
068c34f
058b2cb
80411e6
2597a00
068c34f
 
d767787
78f8403
 
 
421a6ac
068c34f
 
78f8403
2597a00
7ff6285
2099543
 
 
1a1a757
 
 
 
2099543
068c34f
e376ba1
 
 
 
068c34f
 
e376ba1
 
 
 
80411e6
 
068c34f
 
e376ba1
 
 
 
2099543
78f8403
2099543
 
 
a44ab22
 
 
 
 
068c34f
a44ab22
9f6b386
a44ab22
 
 
 
 
 
058b2cb
a44ab22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421a6ac
a44ab22
b29a982
a44ab22
 
6917ff7
a44ab22
 
 
 
 
421a6ac
a44ab22
 
9879209
a44ab22
 
6917ff7
 
 
 
 
a44ab22
6917ff7
a44ab22
6917ff7
a44ab22
 
9249695
a44ab22
058b2cb
2099543
 
 
a44ab22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2099543
9879209
 
 
6917ff7
cbc99b1
9879209
6917ff7
2099543
 
 
6917ff7
2099543
 
cbc99b1
 
 
 
 
2099543
 
 
6917ff7
a44ab22
2099543
 
9f6b386
 
 
 
 
2099543
9f6b386
2099543
 
 
 
 
be3ee63
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
import json
import os
import random
import threading
import time
import uuid
from pathlib import Path

import gradio as gr
from huggingface_hub import HfApi

from config import CLIP_KEYS, MODEL_NAMES, RATING_CATEGORIES, SAMPLES

# Directory that receives one JSON file per submitted session; it is created
# at import time if it does not exist yet.
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)

# Upload target for result files. It is only set when the SPACE_ID environment
# variable is present (presumably meaning the app runs as a Hugging Face Space
# -- confirm); otherwise it is None, which disables the upload in submit_results.
HF_BUCKET = os.environ.get("HF_BUCKET", "Cactooz/listening-data") if os.environ.get("SPACE_ID") else None

# Number of samples (one page each) and the rating category keys/labels, both
# in the iteration order of RATING_CATEGORIES.
NUM_SAMPLES = len(SAMPLES)
CAT_KEYS = list(RATING_CATEGORIES.keys())
CAT_LABELS = list(RATING_CATEGORIES.values())


def create_session():
    """Build the initial state dict for one participant session.

    The first 8 characters of a fresh UUID4 are used as the session id and
    also seed a private ``random.Random`` instance, which drives every
    shuffle below.

    Returns:
        A dict with the session id, the shuffled page-to-sample index order,
        a shuffled clip order per sample (keyed by sample id), the current
        page index (0) and an empty ratings mapping.
    """
    sid = str(uuid.uuid4())[:8]
    shuffler = random.Random(sid)

    def _shuffled(items):
        # Return a shuffled copy so the shared config sequence is never mutated.
        copy = list(items)
        shuffler.shuffle(copy)
        return copy

    # The sample order is drawn first, then one clip order per sample in
    # SAMPLES order, so the generator is consumed in a fixed sequence.
    page_order = _shuffled(range(NUM_SAMPLES))
    per_sample_clips = {entry["id"]: _shuffled(CLIP_KEYS) for entry in SAMPLES}

    return {
        "session_id": sid,
        "sample_order": page_order,
        "clip_orders": per_sample_clips,
        "current_page": 0,
        "ratings": {},
    }


def get_audio_path(sample, clip_key):
    """Return the audio path stored on *sample* for the given clip key.

    The value is read from the ``"<clip_key>_audio"`` entry of the sample
    (``"target_audio"`` for the ``"target"`` clip). A missing entry raises
    ``KeyError``.
    """
    field = "target_audio" if clip_key == "target" else f"{clip_key}_audio"
    return sample[field]


def build_page(state):
    """Compute the component values for the page at ``state["current_page"]``.

    Returns:
        A tuple, in this order: the progress markdown, the instruction HTML,
        the original-audio path (None if the file is missing), one path (or
        None) per clip in this session's clip order, one saved score per
        clip and category (0 when nothing is stored), and two visibility
        updates: the first is visible on every page but the last, the second
        only on the last page.
    """
    page = state["current_page"]
    sample = SAMPLES[state["sample_order"][page]]
    order = state["clip_orders"][sample["id"]]

    def _existing_or_none(path):
        # Only hand a path to the audio component if the file is really there.
        return path if os.path.exists(path) else None

    clip_paths = [_existing_or_none(get_audio_path(sample, key)) for key in order]

    # Flatten stored ratings clip by clip, category by category; unrated
    # entries fall back to 0.
    previous = state["ratings"].get(sample["id"], {})
    scores = []
    for key in order:
        clip_scores = previous.get(key, {})
        for cat in CAT_KEYS:
            stored = clip_scores.get(cat)
            scores.append(0 if stored is None else stored)

    last_page = NUM_SAMPLES - 1
    return (
        f"### Sample {page + 1} of {NUM_SAMPLES}",
        f"<h1 style='text-align: center; padding-block: 15px;'>Instruction: {sample['instruction']}</h1>",
        _existing_or_none(sample["input_audio"]),
        *clip_paths,
        *scores,
        gr.update(visible=(page < last_page)),
        gr.update(visible=(page == last_page)),
    )


def save_ratings_for_page(state, *radio_values):
    """Store the current page's widget values in ``state["ratings"]``.

    ``radio_values`` is read as a flat sequence: clip by clip in this
    session's clip order, and within each clip one value per category in
    ``CAT_KEYS`` order. Only values that are not None and greater than 0
    are kept, converted to ``int``.

    Returns:
        The same ``state`` object, mutated in place.
    """
    page = state["current_page"]
    sample = SAMPLES[state["sample_order"][page]]
    order = state["clip_orders"][sample["id"]]

    per_clip = len(CAT_KEYS)
    collected = {}
    for clip_pos, clip_key in enumerate(order):
        base = clip_pos * per_clip
        kept = {}
        for offset, cat in enumerate(CAT_KEYS):
            value = radio_values[base + offset]
            if value is not None and value > 0:
                kept[cat] = int(value)
        collected[clip_key] = kept

    state["ratings"][sample["id"]] = collected
    return state


def check_page_complete(slider_values):
    """Return True when every value is set and not below 1.

    A value of None or anything less than 1 (such as a slider left at 0)
    marks the page as incomplete. An empty sequence counts as complete.
    """
    return not any(value is None or value < 1 for value in slider_values)


def no_change(state):
    """Return ``state`` followed by one no-op update per page output.

    The count is 3 leading components, then one audio plus one slider per
    category for every clip, then 2 trailing components.
    """
    outputs_per_clip = 1 + len(CAT_KEYS)
    total = 3 + len(CLIP_KEYS) * outputs_per_clip + 2
    noop = gr.update()
    return (state,) + (noop,) * total


def go_next(state, *radio_values):
    """Save the current page and move one page forward.

    If any value is missing or below 1, a warning is shown and nothing
    changes. Otherwise the ratings are stored, the page index is advanced
    (never past the last sample) and the new page's values are returned
    after the state.
    """
    if check_page_complete(radio_values):
        state = save_ratings_for_page(state, *radio_values)
        final_page = NUM_SAMPLES - 1
        state["current_page"] = min(state["current_page"] + 1, final_page)
        return (state, *build_page(state))

    gr.Warning("Please rate all clips before continuing.")
    return no_change(state)


def go_prev(state, *radio_values):
    """Save the current page (without validation) and move one page back.

    The page index never drops below 0. Returns the state followed by the
    values for the page now shown.
    """
    state = save_ratings_for_page(state, *radio_values)
    target = state["current_page"] - 1
    state["current_page"] = target if target > 0 else 0
    return (state, *build_page(state))


def submit_results(state, *radio_values):
    """Validate the current page, persist all ratings and show the thank-you text.

    If the page is incomplete, a warning is shown and three values are
    returned that change nothing. Otherwise the page is saved, a result
    document covering every sample and every clip key is written as JSON to
    ``RESULTS_DIR``, and, when ``HF_BUCKET`` is set, the file is uploaded from
    a daemon thread (upload failures are only printed).

    Returns:
        A 3-tuple: the state, an update with ``visible=False``, and an update
        carrying the thank-you markdown with ``visible=True``.
    """
    if not check_page_complete(radio_values):
        gr.Warning("Please rate all clips on this page before submitting.")
        return (state, gr.update(), gr.update())

    state = save_ratings_for_page(state, *radio_values)
    collected = state["ratings"]

    def _sample_entry(sample):
        # One entry per clip key, even if unrated: the model name plus
        # whatever category scores were stored for that clip.
        rated = collected.get(sample["id"], {})
        return {
            "instruction": sample["instruction"],
            "ratings": {
                model_key: {
                    "model_name": MODEL_NAMES[model_key],
                    **rated.get(model_key, {}),
                }
                for model_key in CLIP_KEYS
            },
        }

    result = {
        "session_id": state["session_id"],
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "profile": state.get("profile", {}),
        "samples": {sample["id"]: _sample_entry(sample) for sample in SAMPLES},
    }

    # The epoch suffix keeps repeated submissions of one session apart.
    filename = f"{state['session_id']}_{int(time.time())}.json"
    out_path = RESULTS_DIR / filename
    with open(out_path, "w") as handle:
        handle.write(json.dumps(result, indent=2))

    if HF_BUCKET:
        def _upload():
            # Best effort: the local file already exists, so a failed upload
            # is reported but does not affect the participant.
            try:
                HfApi().batch_bucket_files(
                    bucket_id=HF_BUCKET,
                    add=[(str(out_path), filename)],
                )
            except Exception as e:
                print(f"Failed to upload to bucket: {e}")

        uploader = threading.Thread(target=_upload, daemon=True)
        uploader.start()

    session_id = state["session_id"]
    thanks = gr.update(
        value=f"# Thank you for your contribution!\n\nYour responses have been saved successfully.\n\n<p style='color: gray; font-size: 0.8em;'>Session ID: {session_id} (Save this ID if you want your response removed)</p>",
        visible=True,
    )
    return (state, gr.update(visible=False), thanks)


# Top-level UI definition: static instructions first, then the interactive
# parts that follow inside this same block.
with gr.Blocks(title="Music Editing Listening Test", theme=gr.themes.Default()) as demo:
    # Welcome text and study description.
    # NOTE(review): "all 10 samples" is hard-coded here while the per-page
    # progress text is built from NUM_SAMPLES -- confirm they stay in sync.
    # NOTE(review): "clips that tries" reads as a grammar slip ("try").
    gr.Markdown(
        """
        # Music Editing Listening Test
        
        Welcome, and thank you for participating in this listening study conducted as part of a Master's thesis at KTH Royal Institute of Technology, in collaboration with Epidemic Sound.
        
        **Expected time needed**: 15-20 minutes
        
        ## What you will do
        In each trial, you will hear a 20-second original audio clip, with an editing instruction to add or remove instruments displayed right below it.  
        Below these, you will hear 4 different randomly ordered edited audio clips that tries to follow that instruction. Ideally, an edit should only change what is requested in the instruction, while preserving everything else from the original audio clip.
        
        The editing instructions can only be for adding or removing instruments and may sometimes include a specific genre. These instructions will use various terms, such as:
        - ADD: add, include, insert, plus, layer, etc.
        - REMOVE: remove, delete, mute, minus, omit, etc.
        
        Important information:
        - There are no right or wrong answers, trust your own perception.
        - Please complete all 10 samples in one go. You cannot save progress or submit halfway through.
        - Ensure you are satisfied with all your ratings before moving to the next sample. You cannot return to previous pages.
        - Sometimes the music might take a little while to load for each page, please be patient.
        - All responses are anonymous and used solely for this Master's thesis research.
        """
    )

    # Illustration in the first of two equally scaled columns; the second
    # column is left empty (presumably to keep the image at half width --
    # confirm).
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image("audio-instruction-edits.png", show_label=False)
        gr.Column(scale=1)
    
    # Explanation of the 1-5 scale and the three rating criteria.
    # NOTE(review): "it has sounds corrupted" reads as a grammar slip.
    gr.Markdown(
        """
        ## Rating
        Rate each edited clip on three criteria using a 1-5 scale:
        | Score | Meaning |
        |-------|---------|
        | 5 | Excellent |
        | 4 | Good |
        | 3 | Fair |
        | 2 | Poor |
        | 1 | Bad |

        ### Audio Quality
        Perceptual quality of the edited audio.  
        Ask yourself: Does the audio sound clean, or are there digital artifacts, robotic glitches, distortion, hiss, sudden dropouts, or abrupt cuts?
        - A high score means it sounds clean, like a professional track.
        - A low score means it has sounds corrupted, glitchy, noisy, or low-quality.
        
        ### Relevance
        How well the edited audio matches the given instruction.  
        Ask yourself: Did the edit successfully perform the exact action described, regardless of what happened to the rest of the track? (e.g. If the instruction was "add saxophone" is there now a saxophone?)
        - A high score means:
          - For removal that only the requested instruments were completely removed.
          - For addition that the requested instruments were successfully added, matching the mood, tempo, and rhythm of the original track.
        - A low score means that the instruction was ignored entirely (nothing was changed) or the wrong action was taken (e.g. a guitar was removed instead of a piano, or a synth was added instead of drums).
        
        ### Faithfulness
        How well unedited parts of the original audio are preserved.  
        Ask yourself: Aside from the requested change, does the rest of the music remain identical to the original?
        - A high score means the background tracks, mixing, and rhythm are perfectly preserved.
        - A low score means unrelated instruments were altered, the overall musical structure changed, or the track was completely remixed.
        """
    )

    # Session state. The function itself is passed, not its result
    # (presumably so Gradio builds a fresh session per page load -- confirm
    # against the gr.State documentation).
    state = gr.State(create_session)

    # Step 1: questionnaire, visible first. The three answers are stored
    # under state["profile"] by start_test.
    intro_group = gr.Group(visible=True)
    with intro_group:
        gr.Markdown("### Before we begin, tell us a bit about yourself")
        expertise_input = gr.Radio(
            choices=[
                "Professional",
                "Musician",
                "Audio/music researcher",
                "Casual listener",
            ],
            label="Listening expertise",
        )
        setup_input = gr.Radio(
            choices=[
                "Studio monitors (speakers)",
                "Over-ear headphones",
                "In-ear headphones",
                "Laptop/phone speakers",
            ],
            label="Listening setup",
        )
        environment_input = gr.Radio(
            choices=[
                "Quiet room",
                "Moderate background noise",
                "Noisy environment",
            ],
            label="Listening environment",
        )
        start_btn = gr.Button("Start Listening Test", variant="primary")

    # Step 2: the rating page, hidden until the questionnaire is answered.
    test_group = gr.Group(visible=False)
    with test_group:
        # Placeholder text; demo.load overwrites it with build_page output.
        progress_label = gr.Markdown("### Sample 1 of 10")

        input_audio = gr.Audio(label="Original Audio", type="filepath", interactive=False)

        # Placeholder as well; replaced by the sample's instruction on load.
        instruction_label = gr.Markdown("<h1 style='text-align: center; padding-block: 15px;'>...</h1>")

        # Both lists are filled in creation order: clip by clip, and for the
        # sliders category by category within each clip. build_page and
        # save_ratings_for_page rely on exactly this flat ordering.
        clip_audios = []
        sliders = []

        with gr.Row(equal_height=False):
            for i in range(len(CLIP_KEYS)):
                with gr.Column():
                    # Labels are positional ("Edited Audio 1..N"); which clip
                    # key lands in which slot comes from the session's
                    # shuffled clip order.
                    clip_audio = gr.Audio(
                        label=f"Edited Audio {i + 1}",
                        type="filepath",
                        interactive=False,
                    )
                    clip_audios.append(clip_audio)
                    for cat_label in CAT_LABELS:
                        # 0 is the "not rated yet" position: check_page_complete
                        # rejects any value below 1.
                        slider = gr.Slider(
                            minimum=0,
                            maximum=5,
                            step=1,
                            value=0,
                            label=cat_label,
                            info="1=Bad  2=Poor  3=Fair  4=Good  5=Excellent",
                        )
                        sliders.append(slider)

        with gr.Row():
            # The Previous button is disabled on purpose; the intro text
            # tells participants they cannot return to earlier pages.
            #prev_btn = gr.Button("Previous", interactive=False)
            next_btn = gr.Button("Next", variant="primary")
            # Starts hidden; build_page makes it visible on the last page.
            submit_btn = gr.Button("Submit", variant="primary", visible=False)

    # Step 3: thank-you text, filled in and shown by submit_results.
    thanks_msg = gr.Markdown(visible=False)

    def start_test(state, expertise, setup, environment):
        """Record the questionnaire answers and reveal the rating page.

        If any answer is empty, a warning is shown and three values are
        returned that change nothing. Otherwise the answers are stored under
        ``state["profile"]`` and the returned updates hide the second output
        and show the third.
        """
        answers = {
            "expertise": expertise,
            "setup": setup,
            "environment": environment,
        }
        if not all(answers.values()):
            gr.Warning("Please answer all questions before starting.")
            return (state, gr.update(), gr.update())

        state["profile"] = answers
        return (state, gr.update(visible=False), gr.update(visible=True))

    # Questionnaire -> rating page: start_test returns the state plus the
    # visibility updates for the two groups.
    start_btn.click(
        fn=start_test,
        inputs=[state, expertise_input, setup_input, environment_input],
        outputs=[state, intro_group, test_group],
    )

    # Output list shared by page navigation and the initial load. The order
    # must match the state followed by build_page's return tuple, and the
    # length must match what no_change produces.
    all_outputs = (
        [state, progress_label, instruction_label, input_audio]
        + clip_audios
        + sliders
        + [next_btn, submit_btn]
    )
    # Alias: the sliders are the only rating inputs passed to the handlers.
    slider_inputs = sliders

    next_btn.click(
        fn=go_next,
        inputs=[state] + slider_inputs,
        outputs=all_outputs,
    )
    # Backward navigation is disabled together with the Previous button.
    # prev_btn.click(
    #     fn=go_prev,
    #     inputs=[state] + slider_inputs,
    #     outputs=all_outputs,
    # )

    # Submitting swaps the rating page for the thank-you message.
    submit_btn.click(
        fn=submit_results,
        inputs=[state] + slider_inputs,
        outputs=[state, test_group, thanks_msg],
    )

    def init_page(s):
        """Return the session state followed by the first page's values.

        If *s* arrives as a callable instead of a state dict, a new session
        is created and used in its place.
        """
        session = create_session() if callable(s) else s
        return (session, *build_page(session))

    # Fill the page components from the session state when the app loads,
    # replacing the placeholder texts set at construction time.
    demo.load(
        fn=init_page,
        inputs=[state],
        outputs=all_outputs,
    )

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()