import json import os import random import threading import time import uuid from pathlib import Path import gradio as gr from huggingface_hub import HfApi from config import CLIP_KEYS, MODEL_NAMES, RATING_CATEGORIES, SAMPLES RESULTS_DIR = Path("results") RESULTS_DIR.mkdir(exist_ok=True) HF_BUCKET = os.environ.get("HF_BUCKET", "Cactooz/listening-data") if os.environ.get("SPACE_ID") else None NUM_SAMPLES = len(SAMPLES) CAT_KEYS = list(RATING_CATEGORIES.keys()) CAT_LABELS = list(RATING_CATEGORIES.values()) def create_session(): session_id = str(uuid.uuid4())[:8] rng = random.Random(session_id) sample_order = list(range(NUM_SAMPLES)) rng.shuffle(sample_order) clip_orders = {} for sample in SAMPLES: order = list(CLIP_KEYS) rng.shuffle(order) clip_orders[sample["id"]] = order return { "session_id": session_id, "sample_order": sample_order, "clip_orders": clip_orders, "current_page": 0, "ratings": {}, } def get_audio_path(sample, clip_key): if clip_key == "target": return sample["target_audio"] return sample[f"{clip_key}_audio"] def build_page(state): page_idx = state["current_page"] sample_idx = state["sample_order"][page_idx] sample = SAMPLES[sample_idx] clip_order = state["clip_orders"][sample["id"]] clip_audios = [] for key in clip_order: path = get_audio_path(sample, key) clip_audios.append(path if os.path.exists(path) else None) existing = state["ratings"].get(sample["id"], {}) saved_scores = [] for key in clip_order: for cat in CAT_KEYS: val = existing.get(key, {}).get(cat) saved_scores.append(val if val is not None else 0) return ( f"### Sample {page_idx + 1} of {NUM_SAMPLES}", f"
Session ID: {session_id} (Save this ID if you want your response removed)
", visible=True, ), ) with gr.Blocks(title="Music Editing Listening Test", theme=gr.themes.Default()) as demo: gr.Markdown( """ # Music Editing Listening Test Welcome, and thank you for participating in this listening study conducted as part of a Master's thesis at KTH Royal Institute of Technology, in collaboration with Epidemic Sound. **Expected time needed**: 15-20 minutes ## What you will do In each trial, you will hear a 20-second original audio clip, with an editing instruction to add or remove instruments displayed right below it. Below these, you will hear 4 different randomly ordered edited audio clips that tries to follow that instruction. Ideally, an edit should only change what is requested in the instruction, while preserving everything else from the original audio clip. The editing instructions can only be for adding or removing instruments and may sometimes include a specific genre. These instructions will use various terms, such as: - ADD: add, include, insert, plus, layer, etc. - REMOVE: remove, delete, mute, minus, omit, etc. Important information: - There are no right or wrong answers, trust your own perception. - Please complete all 10 samples in one go. You cannot save progress or submit halfway through. - Ensure you are satisfied with all your ratings before moving to the next sample. You cannot return to previous pages. - Sometimes the music might take a little while to load for each page, please be patient. - All responses are anonymous and used solely for this Master's thesis research. """ ) with gr.Row(): with gr.Column(scale=1): gr.Image("audio-instruction-edits.png", show_label=False) gr.Column(scale=1) gr.Markdown( """ ## Rating Rate each edited clip on three criteria using a 1-5 scale: | Score | Meaning | |-------|---------| | 5 | Excellent | | 4 | Good | | 3 | Fair | | 2 | Poor | | 1 | Bad | ### Audio Quality Perceptual quality of the edited audio. 
Ask yourself: Does the audio sound clean, or are there digital artifacts, robotic glitches, distortion, hiss, sudden dropouts, or abrupt cuts? - A high score means it sounds clean, like a professional track. - A low score means it sounds corrupted, glitchy, noisy, or low-quality. ### Relevance How well the edited audio matches the given instruction. Ask yourself: Did the edit successfully perform the exact action described, regardless of what happened to the rest of the track? (e.g. If the instruction was "add saxophone", is there now a saxophone?) - A high score means: - For removal that only the requested instruments were completely removed. - For addition that the requested instruments were successfully added, matching the mood, tempo, and rhythm of the original track. - A low score means that the instruction was ignored entirely (nothing was changed) or the wrong action was taken (e.g. a guitar was removed instead of a piano, or a synth was added instead of drums). ### Faithfulness How well unedited parts of the original audio are preserved. Ask yourself: Aside from the requested change, does the rest of the music remain identical to the original? - A high score means the background tracks, mixing, and rhythm are perfectly preserved. - A low score means unrelated instruments were altered, the overall musical structure changed, or the track was completely remixed. 
""" ) state = gr.State(create_session) intro_group = gr.Group(visible=True) with intro_group: gr.Markdown("### Before we begin, tell us a bit about yourself") expertise_input = gr.Radio( choices=[ "Professional", "Musician", "Audio/music researcher", "Casual listener", ], label="Listening expertise", ) setup_input = gr.Radio( choices=[ "Studio monitors (speakers)", "Over-ear headphones", "In-ear headphones", "Laptop/phone speakers", ], label="Listening setup", ) environment_input = gr.Radio( choices=[ "Quiet room", "Moderate background noise", "Noisy environment", ], label="Listening environment", ) start_btn = gr.Button("Start Listening Test", variant="primary") test_group = gr.Group(visible=False) with test_group: progress_label = gr.Markdown("### Sample 1 of 10") input_audio = gr.Audio(label="Original Audio", type="filepath", interactive=False) instruction_label = gr.Markdown("