| |
|
|
| import os |
| from pathlib import Path |
| from typing import Final |
|
|
| |
| from huggingface_hub import hf_hub_download |
|
|
| |
| import numpy as np |
| import librosa |
| import audioread |
| from piano_transcription_inference import utilities |
|
|
| |
| |
| |
# File name of the model checkpoint; passed as `filename` to hf_hub_download
# and used as the file name under the local "models" directory.
MODEL_NAME: Final[str] = "CRNN_note_F1=0.9677_pedal_F1=0.9186.pth"
# Hugging Face Hub repository id the checkpoint is downloaded from.
REPO_ID: Final[str] = "Genius-Society/piano_trans"
|
|
|
|
| |
|
|
def download_model_from_hf_if_needed():
    """
    Make sure the model checkpoint is available in the local ``models`` folder.

    The target directory is ``<parent of this file's directory>/models``.
    The download itself is delegated to ``hf_hub_download``, which handles
    caching and existence checks automatically, so calling this function
    repeatedly is fine.

    This is a best-effort step: failures are reported on stdout and the
    function returns normally instead of raising.

    Returns:
        None
    """
    # <this file's dir>/../models/<MODEL_NAME>
    model_dir = Path(__file__).parent.parent / "models"
    model_path = model_dir / MODEL_NAME

    print(f"Checking for model '{MODEL_NAME}' from Hugging Face Hub repo '{REPO_ID}'...")

    try:
        hf_hub_download(
            repo_id=REPO_ID,
            filename=MODEL_NAME,
            local_dir=model_dir,
        )
    except OSError as e:
        # Connection, HTTP and filesystem failures surface as OSError
        # subclasses (the HTTP client's and the hub's request errors are
        # expected to derive from it -- NOTE(review): confirm for the
        # installed huggingface_hub version). The previous handler caught
        # AttributeError, which never matches a download failure.
        print("Error downloading from Hugging Face Hub. Please check your network connection and the repo/filename.")
        print(f"Details: {e}")
    except Exception as e:
        # Anything else is a genuine surprise; report it without crashing
        # application start-up.
        print(f"An unexpected error occurred: {e}")
    else:
        # Only announce success when the download call did not raise.
        print(f"Model is available at '{model_path}'")
| |
|
|
|
|
| |
|
|
def _fixed_load_audio(path, sr=22050, mono=True, offset=0.0, duration=None,
                      dtype=np.float32, res_type='kaiser_best',
                      backends=[audioread.ffdec.FFmpegAudioFile]):
    """
    Decode an audio file through audioread and return ``(samples, sample_rate)``.

    Patched replacement for ``load_audio`` in `piano_transcription_inference`
    that calls librosa through the function paths available in newer librosa
    versions (``librosa.util.buf_to_float``, ``librosa.to_mono`` and
    ``librosa.resample`` with keyword sample rates).

    Args:
        path: Location of the audio file; resolved with ``os.path.realpath``.
        sr: Target sample rate. ``None`` keeps the file's native rate.
        mono: Down-mix multi-channel audio to a single channel.
        offset: Start position, multiplied by the native sample rate
            (i.e. expressed in seconds).
        duration: Length to read, in the same unit as ``offset``; ``None``
            reads to the end of the file.
        dtype: dtype of the returned array.
        res_type: Resampling method forwarded to ``librosa.resample``.
        backends: audioread backends forwarded to ``audioread.audio_open``.

    Returns:
        Tuple ``(y, sr)``: the decoded samples as a contiguous array and the
        sample rate (``sr`` is returned unchanged when nothing was decoded).
    """
    chunks = []
    resolved = os.path.realpath(path)

    with audioread.audio_open(resolved, backends=backends) as reader:
        sr_native = reader.samplerate
        n_channels = reader.channels

        # Window boundaries, counted in interleaved samples (all channels).
        first = int(np.round(sr_native * offset)) * n_channels
        if duration is None:
            last = np.inf
        else:
            last = first + (int(np.round(sr_native * duration)) * n_channels)

        consumed = 0
        for raw in reader:
            block = librosa.util.buf_to_float(raw, dtype=dtype)
            block_begin = consumed
            consumed = consumed + len(block)

            if consumed < first:
                # Requested window has not started yet; keep reading.
                continue
            if last < block_begin:
                # Already past the requested window; stop reading.
                break
            if last < consumed:
                # Window ends inside this block: drop the tail.
                block = block[:last - block_begin]
            if block_begin <= first <= consumed:
                # Window starts inside this block: drop the head.
                block = block[(first - block_begin):]

            chunks.append(block)

    if chunks:
        y = np.concatenate(chunks)
        if n_channels > 1:
            # De-interleave into shape (channels, samples).
            y = y.reshape((-1, n_channels)).T
            if mono:
                y = librosa.to_mono(y)
        if sr is None:
            sr = sr_native
        else:
            y = librosa.resample(y, orig_sr=sr_native, target_sr=sr, res_type=res_type)
    else:
        # Nothing decoded: fall through with an empty sequence.
        y = chunks

    # Final cleanup for dtype and memory layout.
    return (np.ascontiguousarray(y, dtype=dtype), sr)
|
|
|
|
def apply_monkey_patch():
    """
    Swap the `piano_transcription_inference` audio loader for the local one.

    Rebinds ``utilities.load_audio`` to ``_fixed_load_audio`` so later lookups
    of that attribute on the ``utilities`` module get the patched version.
    """
    print("Applying librosa compatibility patch...")
    setattr(utilities, "load_audio", _fixed_load_audio)
|
|
|
|
| |
|
|
def initialize_app():
    """
    Run all start-up steps; call this once at the start of the app.

    Steps, in order: fetch the model from Hugging Face, then install the
    compatibility patch. Progress banners are printed before and after.
    """
    startup_steps = (
        download_model_from_hf_if_needed,
        apply_monkey_patch,
    )

    print("--- Initializing Application ---")
    for run_step in startup_steps:
        run_step()
    print("--- Initialization Complete ---")
|
|