#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os

# Make the bundled model code importable and run from its directory so that
# relative paths to the model weights resolve correctly.
base_dir = os.path.dirname(os.path.abspath(__file__))
model_dir = os.path.join(base_dir, "ChordRecognitionMIDITrainedExtractor")
sys.path.insert(0, model_dir)
os.chdir(model_dir)

import numpy as np
import networks as N
from librosa.core import cqt, load, note_to_hz
import const as C
import utils as U
from pyharp import ModelCard, build_endpoint, LabelList, AudioLabel
import gradio as gr

# ── Load models once at startup ──────────────────────────────────────────────
cnn_feat_extractor = N.FullCNNFeatExtractor()
cnn_feat_extractor.load(C.DEFAULT_CONVNETFILE)
decoder = N.NBLSTMCRF()
decoder.load("nblstm_crf.model")

# ── Model card ───────────────────────────────────────────────────────────────
model_card = ModelCard(
    name="Automatic Chord Recognition",
    description=(
        "Estimates chord progressions from audio using a CNN feature extractor "
        "trained on MIDI data and a BLSTM-CRF sequence decoder. Outputs "
        "time-stamped chord labels."
    ),
    author="Wu & Li (2019), wrapped by PyHARP",
    tags=["chord recognition", "harmony", "MIR"],
)


def _append_segment(output_labels: LabelList, label_id, feat, st: int, ed: int) -> None:
    """Convert one constant-label run of frames [st, ed) into an AudioLabel."""
    feat_seg = feat[st:ed, :]
    chord_sign = U.voc.ChordSignature7thbass(label_id, feat_seg, sevenths=True, inv=True)
    if chord_sign == "N":  # "N" marks no-chord regions; emit nothing for them
        return
    # Frame index -> seconds via the CQT hop length.
    start_sec = float(st * C.H) / C.SR
    end_sec = float(ed * C.H) / C.SR
    output_labels.labels.append(
        AudioLabel(
            t=start_sec,
            label=chord_sign,
            duration=end_sec - start_sec,
            description=f"Chord: {chord_sign} ({start_sec:.2f}s - {end_sec:.2f}s)",
        )
    )


# ── Processing function ──────────────────────────────────────────────────────
def process_fn(input_audio_path: str) -> LabelList:
    # Load audio, resampled to the model's expected rate.
    y, _ = load(input_audio_path, sr=C.SR)

    # Extract a Harmonic CQT: one CQT per harmonic, the h-th slice starting at
    # (h + 1) * fmin, stacked into shape (harmonics, frames, bins).
    fmin = note_to_hz("C1")
    hcqt = np.stack([
        np.abs(cqt(
            y, sr=C.SR, hop_length=C.H, n_bins=C.BIN_CNT,
            bins_per_octave=C.OCT_BIN, fmin=fmin * (h + 1),
            filter_scale=2, tuning=None,
        )).T.astype(np.float32)
        for h in range(C.CQT_H)
    ])

    # Extract deep features with the CNN, then decode a frame-wise chord label
    # sequence with the BLSTM-CRF.
    feat = cnn_feat_extractor.GetFeature(U.PreprocessSpec(hcqt)).data
    labels = decoder.argmax(feat)

    # Run-length encode the frame labels into segments and build the HARP
    # LabelList: a new segment starts whenever the label changes.
    output_labels = LabelList()
    cur_label = labels[0]
    st = 0
    for i in range(labels.size):
        if labels[i] != cur_label:
            _append_segment(output_labels, cur_label, feat, st, i)
            cur_label = labels[i]
            st = i
    # The final segment runs to the end of the label sequence.
    _append_segment(output_labels, cur_label, feat, st, labels.size)

    return output_labels


# ── Gradio endpoint ──────────────────────────────────────────────────────────
with gr.Blocks() as demo:
    input_components = [
        gr.Audio(type="filepath", label="Input Audio").harp_required(True),
    ]
    output_components = [
        gr.JSON(label="Output Labels"),
    ]
    app = build_endpoint(
        model_card=model_card,
        input_components=input_components,
        output_components=output_components,
        process_fn=process_fn,
    )

demo.queue().launch(share=True, show_error=False, pwa=True)
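
# ── Offline smoke test (sketch) ──────────────────────────────────────────────
# process_fn can be exercised without the Gradio/HARP round trip, which is
# handy when debugging the model wiring. A minimal sketch, kept commented out
# because launch() above blocks; "example.wav" is a hypothetical input path.
#
#   result = process_fn("example.wav")
#   for lab in result.labels:
#       print(f"{lab.t:7.2f}s  {lab.t + lab.duration:7.2f}s  {lab.label}")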