CREPE (models_onnx)

Browse files

Files changed (12) hide show

.gitattributes +1 -0
models/onnx/ailia-models/code/README.md +71 -0
models/onnx/ailia-models/code/crepe.py +240 -0
models/onnx/ailia-models/code/mod_crepe.py +471 -0
models/onnx/ailia-models/code/output_full.png +0 -0
models/onnx/ailia-models/code/output_tiny.png +0 -0
models/onnx/ailia-models/code/test.wav +3 -0
models/onnx/ailia-models/crepe.onnx +3 -0
models/onnx/ailia-models/crepe.onnx.prototxt +2108 -0
models/onnx/ailia-models/crepe_tiny.onnx +3 -0
models/onnx/ailia-models/crepe_tiny.onnx.prototxt +0 -0
models/onnx/ailia-models/source.txt +7 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+models/onnx/ailia-models/code/test.wav filter=lfs diff=lfs merge=lfs -text

models/onnx/ailia-models/code/README.md ADDED Viewed

	@@ -0,0 +1,71 @@

+# CREPE Pitch Tracker
+## Input
+Audio file
+(Audio from https://github.com/maxrmorrison/torchcrepe/blob/master/tests/assets/test.wav)
+## Output
+Pitch (F0) per 10ms
+full model
+![output_full](output_full.png)
+tiny model
+![output_tiny](output_tiny.png)
+## Requirements
+This model requires additional modules.
+```
+pip3 install librosa
+pip3 install soundfile
+```
+## Usage
+Automatically downloads the onnx and prototxt files on the first run.
+It is necessary to be connected to the Internet while downloading.
+For the sample wav,
+```bash
+$ python3 crepe.py
+```
+If you want to specify the audio, put the file path after the `--input` option.
+```bash
+$ python3 crepe.py --input AUDIO_FILE
+```
+Use the `--f0_method` option to select the pitch-estimation model. You can choose `crepe` or `crepe_tiny`.
+```bash
+$ python3 crepe.py --f0_method crepe_tiny
+```
+With the `--evaluate` option, the estimated f0 is plotted together with the f0 computed by pyworld (harvest) for comparison.
+```bash
+$ python3 crepe.py --f0_method crepe_tiny --evaluate
+```
+## Reference
+- [crepe](https://github.com/marl/crepe/)
+- [torchcrepe](https://github.com/maxrmorrison/torchcrepe)
+## Framework
+Pytorch
+## Model Format
+ONNX opset=11
+## Netron
+- [crepe.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/rvc/crepe.onnx.prototxt)
+- [crepe_tiny.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/rvc/crepe_tiny.onnx.prototxt)

models/onnx/ailia-models/code/crepe.py ADDED Viewed

	@@ -0,0 +1,240 @@

+import sys
+import time
+from logging import getLogger
+import numpy as np
+import scipy.signal as signal
+from PIL import Image
+import librosa
+import soundfile as sf
+import matplotlib.pyplot as plt
+import ailia
+# import original modules
+sys.path.append('../../util')
+from microphone_utils import start_microphone_input  # noqa
+from model_utils import check_and_download_models  # noqa
+from arg_utils import get_base_parser, get_savepath, update_parser  # noqa
+# crepe util
+import mod_crepe
+from mod_crepe import WEIGHT_CREPE_PATH, MODEL_CREPE_PATH, WEIGHT_CREPE_TINY_PATH, MODEL_CREPE_TINY_PATH
+# Set to True to decode audio through ffmpeg (see load_audio) instead of librosa.
+flg_ffmpeg = False
+if flg_ffmpeg:
+    import ffmpeg
+logger = getLogger(__name__)
+# ======================
+# Parameters
+# ======================
+# Base URL handed to check_and_download_models() for fetching the model files.
+REMOTE_PATH = 'https://storage.googleapis.com/ailia-models/rvc/'
+# Sample rate (Hz) that input audio is converted to before pitch estimation.
+SAMPLE_RATE = 16000
+# Default input audio file and default output figure path given to the base parser.
+WAV_PATH = 'test.wav'
+FIG_PATH = "output.png"
+# ======================
+# Argument Parser Config
+# ======================
+parser = get_base_parser(
+    'Crepe', WAV_PATH, FIG_PATH, input_ftype='audio'
+)
+# Which CREPE model is downloaded and used for inference.
+parser.add_argument(
+    '--f0_method', default="crepe_tiny", choices=("crepe", "crepe_tiny"),
+    help='Select the pitch extraction algorithm',
+)
+# Run the model through onnxruntime instead of ailia.
+parser.add_argument(
+    '--onnx',
+    action='store_true',
+    help='execute onnxruntime version.'
+)
+# Additionally compute a harvest (pyworld) f0 track and plot it for comparison.
+parser.add_argument(
+    '--evaluate',
+    action='store_true',
+    help='evaluate with harvest.'
+)
+args = update_parser(parser)
+# ======================
+# Secondary Functions
+# ======================
+def load_audio(file: str, sr: int = SAMPLE_RATE):
+    """Load an audio file and return its samples at sample rate ``sr``.
+    When ``flg_ffmpeg`` is set, decoding, down-mixing to one channel and
+    resampling are delegated to an ffmpeg subprocess that emits 32-bit float
+    PCM. Otherwise the file is read with librosa at its native rate and
+    resampled only if that rate differs from ``sr``.
+    Arguments
+        file (str)
+            Path of the audio file to read
+        sr (int)
+            Target sample rate in Hz
+    Returns
+        audio (np.ndarray)
+            The decoded samples
+    """
+    if flg_ffmpeg:
+        # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        out, _ = ffmpeg.input(file, threads=0) \
+            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) \
+            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+        # flatten() copies, so the returned array does not alias the raw byte buffer
+        audio = np.frombuffer(out, np.float32).flatten()
+    else:
+        # prepare input data
+        # sr=None keeps the file's native sample rate; it is returned as source_sr
+        audio, source_sr = librosa.load(file, sr=None)
+        # Resample the wav if needed
+        if source_sr is not None and source_sr != sr:
+            audio = librosa.resample(audio, orig_sr=source_sr, target_sr=sr)
+    return audio
+# ======================
+# Main functions
+# ======================
+def get_f0(
+        f0_method,
+        window,
+        x,
+        p_len,
+):
+    """Estimate the f0 track of ``x`` with the requested method.
+    Arguments
+        f0_method (str)
+            "harvest" (pyworld) or "crepe" / "crepe_tiny" (mod_crepe)
+        window (int)
+            Passed to mod_crepe.predict as the hop length in samples;
+            not used by the harvest branch
+        x (np.ndarray)
+            The audio samples, treated as sampled at SAMPLE_RATE
+        p_len (int)
+            Not used by this function
+    Returns
+        f0 (np.ndarray)
+            The estimated pitch track
+    Raises
+        ValueError
+            If ``f0_method`` is none of the supported names
+    """
+    sr = SAMPLE_RATE
+    # Pitch search range in Hz, shared by both methods
+    f0_min = 50
+    f0_max = 1100
+    if f0_method == "harvest":
+        import pyworld
+        audio = x.astype(np.double)
+        fs = sr
+        frame_period = 10
+        f0, t = pyworld.harvest(
+            audio,
+            fs=fs,
+            f0_ceil=f0_max,
+            f0_floor=f0_min,
+            frame_period=frame_period,
+        )
+        f0 = pyworld.stonemask(audio, f0, t, fs)
+        # filter_radius is a constant, so the 3-point median filter is always applied
+        filter_radius = 3
+        if filter_radius > 2:
+            f0 = signal.medfilt(f0, 3)
+    elif f0_method == "crepe" or f0_method == "crepe_tiny":
+        import mod_crepe
+        # Pick a batch size that doesn't cause memory errors on your gpu
+        batch_size = 512
+        # Copy and add a leading batch axis: mod_crepe.predict expects shape (1, time)
+        audio = np.copy(x)[None]
+        f0, pd = mod_crepe.predict(
+            audio,
+            sr,
+            window,
+            f0_min,
+            f0_max,
+            batch_size=batch_size,
+            return_periodicity=True,
+        )
+        # Smooth both tracks over 3 frames, then zero f0 where periodicity is below 0.1
+        pd = mod_crepe.median(pd, 3)
+        f0 = mod_crepe.mean(f0, 3)
+        f0[pd < 0.1] = 0
+        # Drop the batch axis
+        f0 = f0[0]
+    else:
+        raise ValueError("f0_method: %s" % f0_method)
+    return f0
+def predict(audio, model, f0_method):
+    audio_max = np.abs(audio).max() / 0.95
+    if audio_max > 1:
+        audio /= audio_max
+    window = 160
+    p_len = audio.shape[0] // window
+    pitch = get_f0(
+        f0_method,
+        window,
+        audio,
+        p_len,
+    )
+    return pitch
+def recognize_from_audio(models):
+    # input audio loop
+    for audio_path in args.input:
+        logger.info(audio_path)
+        # prepare input data
+        audio = load_audio(audio_path, SAMPLE_RATE)
+        # inference
+        logger.info('Start inference...')
+        if args.benchmark:
+            logger.info('BENCHMARK mode')
+            start = int(round(time.time() * 1000))
+            output = predict(audio, models, args.f0_method)
+            end = int(round(time.time() * 1000))
+            estimation_time = (end - start)
+            logger.info(f'\ttotal processing time {estimation_time} ms')
+        else:
+            output = predict(audio, models, args.f0_method)
+        # reference data
+        if args.evaluate:
+            harvest = predict(audio, models, "harvest")
+        # plot
+        x = np.linspace(0, audio.shape[0] / SAMPLE_RATE, output.shape[0])
+        y = output
+        fig = plt.figure()
+        ax = fig.add_subplot()
+        y = output
+        ax.plot(x, y, label=args.f0_method)
+        if args.evaluate:
+            y = harvest
+            ax.plot(x, y, label="harvest", linestyle = "dashed")
+        ax.set_xlabel("sec")
+        ax.set_ylabel("f0 (hz)")
+        plt.legend()
+        # save result
+        savepath = get_savepath(args.savepath, audio_path, ext='.png')
+        logger.info(f'saved at : {savepath}')
+        plt.savefig(savepath)
+    logger.info('Script finished successfully.')
+def main():
+    if args.f0_method == "crepe_tiny":
+        check_and_download_models(WEIGHT_CREPE_TINY_PATH, MODEL_CREPE_TINY_PATH, REMOTE_PATH)
+    else:
+        check_and_download_models(WEIGHT_CREPE_PATH, MODEL_CREPE_PATH, REMOTE_PATH)
+    env_id = args.env_id
+    f0_model = mod_crepe.load_model(env_id, args.onnx, args.f0_method == "crepe_tiny")
+    if args.profile:
+        f0_model.set_profile_mode(True)
+    else:
+        f0_model = None
+    recognize_from_audio(f0_model)
+    if args.profile and not args.onnx:
+        print("--- profile f0_model")
+        print(f0_model.get_summary())
+        print("")
+if __name__ == '__main__':
+    main()

models/onnx/ailia-models/code/mod_crepe.py ADDED Viewed

	@@ -0,0 +1,471 @@

+import functools
+import numpy as np
+import scipy
+import librosa
+import ailia
+from functional import im2col
+from math_utils import softmax
+# File names of the full and tiny models: ONNX weights and their prototxt descriptions
+WEIGHT_CREPE_PATH = "crepe.onnx"
+MODEL_CREPE_PATH = "crepe.onnx.prototxt"
+WEIGHT_CREPE_TINY_PATH = "crepe_tiny.onnx"
+MODEL_CREPE_TINY_PATH = "crepe_tiny.onnx.prototxt"
+# Width of one pitch bin; used by the bin <-> cents conversions and by dither()
+CENTS_PER_BIN = 20  # cents
+# Default upper frequency limit of predict() and postprocess()
+MAX_FMAX = 2006.  # hz
+# Number of pitch bins per frame in the network output
+PITCH_BINS = 360
+# Sample rate the audio is resampled to in preprocess()
+SAMPLE_RATE = 16000  # hz
+# Length of one analysis frame fed to the network
+WINDOW_SIZE = 1024  # samples
+# NOTE(review): not referenced by the code visible in this module
+UNVOICED = np.nan
+def load_model(env_id=0, flg_onnx=False, tiny=False):
+    # initialize
+    if tiny:
+        model_path = MODEL_CREPE_TINY_PATH
+        weight_path = WEIGHT_CREPE_TINY_PATH
+    else:
+        model_path = MODEL_CREPE_PATH
+        weight_path = WEIGHT_CREPE_PATH
+    if not flg_onnx:
+        model = ailia.Net(model_path, weight_path, env_id=env_id)
+    else:
+        import onnxruntime
+        providers = ["CPUExecutionProvider", "CUDAExecutionProvider"]
+        model = onnxruntime.InferenceSession(weight_path, providers=providers)
+    infer.flg_onnx = flg_onnx
+    infer.model = model
+    return model
+###############################################################################
+# Probability sequence decoding methods
+###############################################################################
+def viterbi(logits):
+    """Sample observations using viterbi decoding"""
+    # Create viterbi transition matrix
+    if not hasattr(viterbi, 'transition'):
+        xx, yy = np.meshgrid(range(360), range(360))
+        transition = np.maximum(12 - abs(xx - yy), 0)
+        transition = transition / transition.sum(axis=1, keepdims=True)
+        viterbi.transition = transition
+    # Normalize logits
+    sequences = softmax(logits, axis=1)
+    # Perform viterbi decoding
+    bins = np.array([
+        librosa.sequence.viterbi(sequence, viterbi.transition).astype(np.int64)
+        for sequence in sequences])
+    # Convert to frequency in Hz
+    return bins, bins_to_frequency(bins)
+###############################################################################
+# Crepe pitch prediction
+###############################################################################
+def predict(
+        audio,
+        sample_rate,
+        hop_length=None,
+        fmin=50.,
+        fmax=MAX_FMAX,
+        decoder=viterbi,
+        return_periodicity=False,
+        batch_size=None,
+        pad=True):
+    """Performs pitch estimation
+    Arguments
+        audio (np.ndarray [shape=(1, time)])
+            The audio signal
+        sample_rate (int)
+            The sampling rate in Hz
+        hop_length (int)
+            The hop_length in samples
+        fmin (float)
+            The minimum allowable frequency in Hz
+        fmax (float)
+            The maximum allowable frequency in Hz
+        decoder (function)
+            The decoder to use. It is called with the masked network output
+            and must return (bins, frequencies), as viterbi() does.
+        return_periodicity (bool)
+            Whether to also return the network confidence
+        batch_size (int)
+            The number of frames per batch
+        pad (bool)
+            Whether to zero-pad the audio
+    Returns
+        pitch (np.ndarray [shape=(1, 1 + int(time // hop_length))])
+        (Optional) periodicity (np.ndarray
+                                [shape=(1, 1 + int(time // hop_length))])
+    """
+    results = []
+    # Preprocess audio
+    # preprocess() is a generator that yields one batch of frames at a time
+    generator = preprocess(
+        audio, sample_rate, hop_length, batch_size, pad)
+    for frames in generator:
+        # Infer independent probabilities for each pitch bin
+        probabilities = infer(frames)
+        # shape=(batch, 360, time / hop_length)
+        probabilities = probabilities.reshape(
+            audio.shape[0], -1, PITCH_BINS).transpose(0, 2, 1)
+        # Convert probabilities to F0 and periodicity
+        result = postprocess(
+            probabilities, fmin, fmax,
+            decoder, return_periodicity)
+        results.append(result)
+    # Split pitch and periodicity
+    if return_periodicity:
+        # Each result is a (pitch, periodicity) pair; join the batches along time
+        pitch, periodicity = zip(*results)
+        return np.concatenate(pitch, axis=1), np.concatenate(periodicity, axis=1)
+    # Concatenate
+    return np.concatenate(results, axis=1)
+###############################################################################
+# Components for step-by-step prediction
+###############################################################################
+def infer(frame):
+    """Run the loaded network on one batch of frames.
+    The model and the backend flag are stored as attributes of this function
+    by load_model(); if no model has been loaded yet, load_model() is called
+    with its defaults.
+    Arguments
+        frame (np.ndarray)
+            The batch of frames produced by preprocess()
+    Returns
+        The first output of the network
+    """
+    if not hasattr(infer, 'model'):
+        load_model()
+    flg_onnx = infer.flg_onnx
+    model = infer.model
+    # feedforward
+    if not flg_onnx:
+        output = model.predict([frame])
+    else:
+        # 'input' is the name of the ONNX graph's input
+        output = model.run(None, {'input': frame})
+    return output[0]
+def postprocess(
+        probabilities,
+        fmin=0.,
+        fmax=MAX_FMAX,
+        decoder=viterbi,
+        return_periodicity=False):
+    """Convert model output to F0 and periodicity
+    Note that ``probabilities`` is modified in place: bins outside the
+    allowed frequency range are overwritten with -inf.
+    Arguments
+        probabilities (np.ndarray [shape=(1, 360, time / hop_length)])
+            The probabilities for each pitch bin inferred by the network
+        fmin (float)
+            The minimum allowable frequency in Hz
+        fmax (float)
+            The maximum allowable frequency in Hz
+        decoder (function)
+            Called with the masked probabilities; must return
+            (bins, frequencies), as viterbi() does
+        return_periodicity (bool)
+            Whether to also return the network confidence
+    Returns
+        pitch (np.ndarray [shape=(1, 1 + int(time // hop_length))])
+        (Optional) periodicity (np.ndarray
+                                [shape=(1, 1 + int(time // hop_length))])
+    """
+    # Convert frequency range to pitch bin range
+    # The lower limit is floored and the upper limit is ceiled
+    minidx = frequency_to_bins(np.array(fmin))
+    maxidx = frequency_to_bins(np.array(fmax), np.ceil)
+    # Remove frequencies outside of allowable range
+    probabilities[:, :minidx] = -float('inf')
+    probabilities[:, maxidx:] = -float('inf')
+    # Decode the bin sequence with the supplied decoder
+    bins, pitch = decoder(probabilities)
+    if not return_periodicity:
+        return pitch
+    # Compute periodicity from probabilities and decoded pitch bins
+    return pitch, periodicity(probabilities, bins)
+def preprocess(
+        audio,
+        sample_rate,
+        hop_length=None,
+        batch_size=None,
+        pad=True):
+    """Convert audio to model input
+    This is a generator: it yields one normalized batch of frames at a time.
+    Arguments
+        audio (np.ndarray [shape=(1, time)])
+            The audio signals
+        sample_rate (int)
+            The sampling rate in Hz
+        hop_length (int)
+            The hop_length in samples
+        batch_size (int)
+            The number of frames per batch
+        pad (bool)
+            Whether to zero-pad the audio
+    Returns
+        frames (np.ndarray [shape=(1 + int(time // hop_length), 1024)])
+    """
+    # Default hop length of 10 ms
+    hop_length = sample_rate // 100 if hop_length is None else hop_length
+    # Resample
+    if sample_rate != SAMPLE_RATE:
+        # We have to use resampy if we want numbers to match Crepe
+        import resampy
+        audio = audio[0]
+        audio = resampy.resample(audio, sample_rate, SAMPLE_RATE)
+        audio = audio[None]
+        # Rescale the hop so it covers the same duration at the new rate
+        hop_length = int(hop_length * SAMPLE_RATE / sample_rate)
+    # Get total number of frames
+    # Maybe pad
+    if pad:
+        total_frames = 1 + int(audio.shape[1] // hop_length)
+        # Half a window of zeros on each side of the time axis
+        audio = np.pad(
+            audio,
+            ((0, 0), (WINDOW_SIZE // 2, WINDOW_SIZE // 2)))
+    else:
+        total_frames = 1 + int((audio.shape[1] - WINDOW_SIZE) // hop_length)
+    # Default to running all frames in a single batch
+    batch_size = total_frames if batch_size is None else batch_size
+    # Generate batches
+    for i in range(0, total_frames, batch_size):
+        # Batch indices
+        # Sample range covering frames i .. i + batch_size - 1, clipped to the audio
+        start = max(0, i * hop_length)
+        end = min(
+            audio.shape[1],
+            (i + batch_size - 1) * hop_length + WINDOW_SIZE)
+        kernel_size = (1, WINDOW_SIZE)
+        stride = (1, hop_length)
+        # NOTE(review): im2col comes from the project's `functional` module;
+        # presumably it extracts sliding windows like torch's unfold - confirm
+        unfold = functools.partial(im2col, filters=kernel_size, stride=stride)
+        # Chunk
+        frames, *_ = unfold(audio[:, None, None, start:end])
+        frames = frames.astype(np.float32)
+        # shape=(1 + int(time / hop_length), 1024)
+        frames = frames[None].transpose(0, 2, 1).reshape(-1, WINDOW_SIZE)
+        # Mean-center
+        frames -= np.mean(frames, axis=1, keepdims=True)
+        # Scale
+        # Note: during silent frames, this produces very large values. But
+        # this seems to be what the network expects.
+        std = np.std(frames, axis=1, keepdims=True)
+        # Floor the divisor at 1e-10 to avoid dividing by zero
+        frames /= np.where(std > 1e-10, std, 1e-10)
+        yield frames
+###############################################################################
+# Pitch unit conversions
+###############################################################################
+def bins_to_cents(bins):
+    """Converts pitch bins to cents
+    The result is not deterministic: random dither noise is added to it.
+    """
+    # Bins are CENTS_PER_BIN apart; the constant is the cents value of bin 0
+    cents = CENTS_PER_BIN * bins + 1997.3794084376191
+    # Trade quantization error for noise
+    return dither(cents)
+def bins_to_frequency(bins):
+    """Converts pitch bins to frequency in Hz
+    Goes through bins_to_cents(), so the result includes its random dither.
+    """
+    return cents_to_frequency(bins_to_cents(bins))
+def cents_to_bins(cents, quantize_fn=np.floor):
+    """Converts cents to pitch bins
+    ``quantize_fn`` rounds the fractional bin position to a whole bin
+    (np.floor by default); the result is cast to int.
+    """
+    # Inverse of the linear mapping used in bins_to_cents()
+    bins = (cents - 1997.3794084376191) / CENTS_PER_BIN
+    return quantize_fn(bins).astype(int)
+def cents_to_frequency(cents):
+    """Converts cents to frequency in Hz
+    Cents are measured relative to 10 Hz, with 1200 cents per octave.
+    """
+    return 10 * 2 ** (cents / 1200)
+def frequency_to_bins(frequency, quantize_fn=np.floor):
+    """Convert frequency in Hz to pitch bins
+    ``quantize_fn`` is forwarded to cents_to_bins() to choose the rounding.
+    """
+    return cents_to_bins(frequency_to_cents(frequency), quantize_fn)
+def frequency_to_cents(frequency):
+    """Convert frequency in Hz to cents
+    Inverse of cents_to_frequency(): cents relative to 10 Hz.
+    """
+    return 1200 * np.log2(frequency / 10.)
+###############################################################################
+# Utilities
+###############################################################################
+def periodicity(probabilities, bins):
+    """Computes the periodicity from the network output and pitch bins"""
+    # shape=(batch * time / hop_length, 360)
+    probs_stacked = probabilities.transpose(0, 2, 1).reshape(-1, PITCH_BINS)
+    # shape=(batch * time / hop_length, 1)
+    bins_stacked = bins.reshape(-1, 1).astype(np.int64)
+    # Use maximum logit over pitch bins as periodicity
+    # periodicity = probs_stacked.gather(1, bins_stacked)
+    periodicity = np.zeros(bins_stacked.shape)
+    for i in range(bins_stacked.shape[0]):
+        periodicity[i] = probs_stacked[i, bins_stacked[i]]
+    # shape=(batch, time / hop_length)
+    return periodicity.reshape(probabilities.shape[0], probabilities.shape[2])
+def dither(cents):
+    """Dither the predicted pitch in cents to remove quantization error"""
+    noise = scipy.stats.triang.rvs(
+        c=0.5,
+        loc=-CENTS_PER_BIN,
+        scale=2 * CENTS_PER_BIN,
+        size=cents.shape)
+    return cents + noise
+###############################################################################
+# Sequence filters
+###############################################################################
+def mean(signals, win_length=9):
+    """Average filtering for signals containing nan values
+    NaN entries are excluded from both the window sum and the window count.
+    Any output value that is exactly zero is replaced by NaN.
+    Arguments
+        signals (np.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+    Returns
+        filtered (np.ndarray (shape=(batch, time)))
+    """
+    assert signals.ndim == 2, "Input tensor must have 2 dimensions (batch_size, width)"
+    def apply_convolution(array, kernel):
+        # Zero-pad the time axis by half a window, then convolve each row
+        pad_width = win_length // 2
+        padded_array = np.pad(array, ((0, 0), (pad_width, pad_width)), mode='constant', constant_values=0)
+        convolved = np.array([
+            np.convolve(padded_array[i, :], kernel, mode='valid')
+            for i in range(padded_array.shape[0])
+        ])
+        return convolved
+    # Apply the mask by setting masked elements to zero, or make NaNs zero
+    mask = ~np.isnan(signals)
+    masked_x = np.where(mask, signals, np.zeros(signals.shape))
+    # Create a ones kernel of the window length
+    ones_kernel = np.ones(win_length)
+    # Perform sum pooling
+    sum_pooled = apply_convolution(masked_x, ones_kernel)
+    # Count the non-masked (valid) elements in each pooling window
+    valid_count = apply_convolution(mask.astype(float), ones_kernel)
+    valid_count = np.clip(valid_count, 1, None)  # Avoid division by zero
+    # Perform masked average pooling
+    avg_pooled = sum_pooled / valid_count
+    # Fill zero values with NaNs
+    # (this covers windows without any valid sample, whose sum is zero)
+    avg_pooled[avg_pooled == 0] = float("nan")
+    return avg_pooled
+def median(signals, win_length):
+    """Median filtering for signals containing nan values
+    Arguments
+        signals (np.ndarray (shape=(batch, time)))
+            The signals to filter
+        win_length
+            The size of the analysis window
+    Returns
+        filtered (np.ndarray (shape=(batch, time)))
+    """
+    assert signals.ndim == 2, "Input tensor must have 2 dimensions (batch_size, width)"
+    signals = np.expand_dims(signals, axis=1)
+    mask = ~np.isnan(signals)
+    masked_x = np.where(mask, signals, np.zeros(signals.shape))
+    padding = win_length // 2
+    shape = masked_x.shape
+    x = np.pad(masked_x, ((0, 0), (0, 0), (padding, padding)), mode="reflect")
+    mask = np.pad(
+        mask.astype(np.float32), ((0, 0), (0, 0), (padding, padding)),
+        mode="constant", constant_values=0)
+    _x = np.zeros(shape + (win_length,))
+    _msk = np.zeros(shape + (win_length,))
+    for i in range(shape[-1]):
+        _x[:, :, i] = x[:, :, i:i + win_length]
+        _msk[:, :, i] = mask[:, :, i:i + win_length]
+    x = _x
+    mask = _msk
+    x = x.reshape(x.shape[:3] + (-1,))
+    mask = mask.reshape(mask.shape[:3] + (-1,))
+    # Combine the mask with the input tensor
+    x_masked = np.where(mask.astype(bool), x.astype(np.float32), float("inf"))
+    # Sort the masked tensor along the last dimension
+    x_sorted = np.sort(x_masked, axis=-1)
+    # Compute the count of non-masked (valid) values
+    valid_count = np.sum(mask, axis=-1)
+    # Calculate the index of the median value for each pooling window
+    median_idx = np.clip((valid_count - 1) // 2, 0, None)
+    # Gather the median values using the calculated indices
+    # median_pooled = x_sorted.gather(-1, median_idx.unsqueeze(-1).long()).squeeze(-1)
+    median_idx = median_idx.astype(int)
+    median_pooled = [
+        x_sorted[:, :, [i], median_idx[0, 0, i]] for i in range(median_idx.shape[-1])
+    ]
+    median_pooled = np.concatenate(median_pooled, axis=-1)
+    # Fill infinite values with NaNs
+    median_pooled[np.isinf(median_pooled)] = float("nan")
+    return np.squeeze(median_pooled, axis=1)

models/onnx/ailia-models/code/output_full.png ADDED Viewed

models/onnx/ailia-models/code/output_tiny.png ADDED Viewed

models/onnx/ailia-models/code/test.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d20bdf667c6b606917159f62c2b728f5836de53079830216735b459bc6aad2e8
+size 882044

models/onnx/ailia-models/crepe.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:124679dba8e591eb2e8b97e338184c824300f4cc100b8e4036f4ef7afccaa9aa
+size 88991558

models/onnx/ailia-models/crepe.onnx.prototxt ADDED Viewed

	@@ -0,0 +1,2108 @@

+ir_version: 6
+producer_name: "pytorch"
+producer_version: "2.1.0"
+model_version: 0
+graph {
+  name: "torch_jit"
+  node {
+    input: "input"
+    output: "/Unsqueeze_output_0"
+    name: "/Unsqueeze"
+    op_type: "Unsqueeze"
+    attribute {
+      name: "axes"
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    input: "/Unsqueeze_output_0"
+    output: "/Unsqueeze_1_output_0"
+    name: "/Unsqueeze_1"
+    op_type: "Unsqueeze"
+    attribute {
+      name: "axes"
+      ints: 3
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_output_0"
+    name: "/Constant"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_1_output_0"
+    name: "/Constant_1"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 4
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_output_0"
+    output: "/ConstantOfShape_output_0"
+    name: "/ConstantOfShape"
+    op_type: "ConstantOfShape"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        raw_data: "\000\000\000\000\000\000\000\000"
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_1_output_0"
+    input: "/ConstantOfShape_output_0"
+    output: "/Concat_output_0"
+    name: "/Concat"
+    op_type: "Concat"
+    attribute {
+      name: "axis"
+      i: 0
+      type: INT
+    }
+  }
+  node {
+    output: "/Constant_2_output_0"
+    name: "/Constant_2"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 2
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Concat_output_0"
+    input: "/Constant_2_output_0"
+    output: "/Reshape_output_0"
+    name: "/Reshape"
+    op_type: "Reshape"
+  }
+  node {
+    output: "/Constant_3_output_0"
+    name: "/Constant_3"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_4_output_0"
+    name: "/Constant_4"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_5_output_0"
+    name: "/Constant_5"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_6_output_0"
+    name: "/Constant_6"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Reshape_output_0"
+    input: "/Constant_4_output_0"
+    input: "/Constant_5_output_0"
+    input: "/Constant_3_output_0"
+    input: "/Constant_6_output_0"
+    output: "/Slice_output_0"
+    name: "/Slice"
+    op_type: "Slice"
+  }
+  node {
+    input: "/Slice_output_0"
+    output: "/Transpose_output_0"
+    name: "/Transpose"
+    op_type: "Transpose"
+    attribute {
+      name: "perm"
+      ints: 1
+      ints: 0
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_7_output_0"
+    name: "/Constant_7"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Transpose_output_0"
+    input: "/Constant_7_output_0"
+    output: "/Reshape_1_output_0"
+    name: "/Reshape_1"
+    op_type: "Reshape"
+  }
+  node {
+    input: "/Reshape_1_output_0"
+    output: "/Cast_output_0"
+    name: "/Cast"
+    op_type: "Cast"
+    attribute {
+      name: "to"
+      i: 7
+      type: INT
+    }
+  }
+  node {
+    input: "/Unsqueeze_1_output_0"
+    input: "/Cast_output_0"
+    input: ""
+    output: "/Pad_output_0"
+    name: "/Pad"
+    op_type: "Pad"
+    attribute {
+      name: "mode"
+      s: "constant"
+      type: STRING
+    }
+  }
+  node {
+    input: "/Pad_output_0"
+    input: "conv1.weight"
+    input: "conv1.bias"
+    output: "/conv1/Conv_output_0"
+    name: "/conv1/Conv"
+    op_type: "Conv"
+    attribute {
+      name: "dilations"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "group"
+      i: 1
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 512
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 4
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    input: "/conv1/Conv_output_0"
+    output: "/Relu_output_0"
+    name: "/Relu"
+    op_type: "Relu"
+  }
+  node {
+    input: "/Relu_output_0"
+    input: "conv1_BN.weight"
+    input: "conv1_BN.bias"
+    input: "conv1_BN.running_mean"
+    input: "conv1_BN.running_var"
+    output: "/conv1_BN/BatchNormalization_output_0"
+    name: "/conv1_BN/BatchNormalization"
+    op_type: "BatchNormalization"
+    attribute {
+      name: "epsilon"
+      f: 0.0010000000474974513
+      type: FLOAT
+    }
+    attribute {
+      name: "momentum"
+      f: 1.0
+      type: FLOAT
+    }
+  }
+  node {
+    input: "/conv1_BN/BatchNormalization_output_0"
+    output: "/MaxPool_output_0"
+    name: "/MaxPool"
+    op_type: "MaxPool"
+    attribute {
+      name: "ceil_mode"
+      i: 0
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_8_output_0"
+    name: "/Constant_8"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_9_output_0"
+    name: "/Constant_9"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 4
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_8_output_0"
+    output: "/ConstantOfShape_1_output_0"
+    name: "/ConstantOfShape_1"
+    op_type: "ConstantOfShape"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        raw_data: "\000\000\000\000\000\000\000\000"
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_9_output_0"
+    input: "/ConstantOfShape_1_output_0"
+    output: "/Concat_1_output_0"
+    name: "/Concat_1"
+    op_type: "Concat"
+    attribute {
+      name: "axis"
+      i: 0
+      type: INT
+    }
+  }
+  node {
+    output: "/Constant_10_output_0"
+    name: "/Constant_10"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 2
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Concat_1_output_0"
+    input: "/Constant_10_output_0"
+    output: "/Reshape_2_output_0"
+    name: "/Reshape_2"
+    op_type: "Reshape"
+  }
+  node {
+    output: "/Constant_11_output_0"
+    name: "/Constant_11"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_12_output_0"
+    name: "/Constant_12"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_13_output_0"
+    name: "/Constant_13"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_14_output_0"
+    name: "/Constant_14"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Reshape_2_output_0"
+    input: "/Constant_12_output_0"
+    input: "/Constant_13_output_0"
+    input: "/Constant_11_output_0"
+    input: "/Constant_14_output_0"
+    output: "/Slice_1_output_0"
+    name: "/Slice_1"
+    op_type: "Slice"
+  }
+  node {
+    input: "/Slice_1_output_0"
+    output: "/Transpose_1_output_0"
+    name: "/Transpose_1"
+    op_type: "Transpose"
+    attribute {
+      name: "perm"
+      ints: 1
+      ints: 0
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_15_output_0"
+    name: "/Constant_15"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Transpose_1_output_0"
+    input: "/Constant_15_output_0"
+    output: "/Reshape_3_output_0"
+    name: "/Reshape_3"
+    op_type: "Reshape"
+  }
+  node {
+    input: "/Reshape_3_output_0"
+    output: "/Cast_1_output_0"
+    name: "/Cast_1"
+    op_type: "Cast"
+    attribute {
+      name: "to"
+      i: 7
+      type: INT
+    }
+  }
+  node {
+    input: "/MaxPool_output_0"
+    input: "/Cast_1_output_0"
+    input: ""
+    output: "/Pad_1_output_0"
+    name: "/Pad_1"
+    op_type: "Pad"
+    attribute {
+      name: "mode"
+      s: "constant"
+      type: STRING
+    }
+  }
+  node {
+    input: "/Pad_1_output_0"
+    input: "conv2.weight"
+    input: "conv2.bias"
+    output: "/conv2/Conv_output_0"
+    name: "/conv2/Conv"
+    op_type: "Conv"
+    attribute {
+      name: "dilations"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "group"
+      i: 1
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 64
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    input: "/conv2/Conv_output_0"
+    output: "/Relu_1_output_0"
+    name: "/Relu_1"
+    op_type: "Relu"
+  }
+  node {
+    input: "/Relu_1_output_0"
+    input: "conv2_BN.weight"
+    input: "conv2_BN.bias"
+    input: "conv2_BN.running_mean"
+    input: "conv2_BN.running_var"
+    output: "/conv2_BN/BatchNormalization_output_0"
+    name: "/conv2_BN/BatchNormalization"
+    op_type: "BatchNormalization"
+    attribute {
+      name: "epsilon"
+      f: 0.0010000000474974513
+      type: FLOAT
+    }
+    attribute {
+      name: "momentum"
+      f: 1.0
+      type: FLOAT
+    }
+  }
+  node {
+    input: "/conv2_BN/BatchNormalization_output_0"
+    output: "/MaxPool_1_output_0"
+    name: "/MaxPool_1"
+    op_type: "MaxPool"
+    attribute {
+      name: "ceil_mode"
+      i: 0
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_16_output_0"
+    name: "/Constant_16"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_17_output_0"
+    name: "/Constant_17"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 4
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_16_output_0"
+    output: "/ConstantOfShape_2_output_0"
+    name: "/ConstantOfShape_2"
+    op_type: "ConstantOfShape"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        raw_data: "\000\000\000\000\000\000\000\000"
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_17_output_0"
+    input: "/ConstantOfShape_2_output_0"
+    output: "/Concat_2_output_0"
+    name: "/Concat_2"
+    op_type: "Concat"
+    attribute {
+      name: "axis"
+      i: 0
+      type: INT
+    }
+  }
+  node {
+    output: "/Constant_18_output_0"
+    name: "/Constant_18"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 2
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Concat_2_output_0"
+    input: "/Constant_18_output_0"
+    output: "/Reshape_4_output_0"
+    name: "/Reshape_4"
+    op_type: "Reshape"
+  }
+  node {
+    output: "/Constant_19_output_0"
+    name: "/Constant_19"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_20_output_0"
+    name: "/Constant_20"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_21_output_0"
+    name: "/Constant_21"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_22_output_0"
+    name: "/Constant_22"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Reshape_4_output_0"
+    input: "/Constant_20_output_0"
+    input: "/Constant_21_output_0"
+    input: "/Constant_19_output_0"
+    input: "/Constant_22_output_0"
+    output: "/Slice_2_output_0"
+    name: "/Slice_2"
+    op_type: "Slice"
+  }
+  node {
+    input: "/Slice_2_output_0"
+    output: "/Transpose_2_output_0"
+    name: "/Transpose_2"
+    op_type: "Transpose"
+    attribute {
+      name: "perm"
+      ints: 1
+      ints: 0
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_23_output_0"
+    name: "/Constant_23"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Transpose_2_output_0"
+    input: "/Constant_23_output_0"
+    output: "/Reshape_5_output_0"
+    name: "/Reshape_5"
+    op_type: "Reshape"
+  }
+  node {
+    input: "/Reshape_5_output_0"
+    output: "/Cast_2_output_0"
+    name: "/Cast_2"
+    op_type: "Cast"
+    attribute {
+      name: "to"
+      i: 7
+      type: INT
+    }
+  }
+  node {
+    input: "/MaxPool_1_output_0"
+    input: "/Cast_2_output_0"
+    input: ""
+    output: "/Pad_2_output_0"
+    name: "/Pad_2"
+    op_type: "Pad"
+    attribute {
+      name: "mode"
+      s: "constant"
+      type: STRING
+    }
+  }
+  node {
+    input: "/Pad_2_output_0"
+    input: "conv3.weight"
+    input: "conv3.bias"
+    output: "/conv3/Conv_output_0"
+    name: "/conv3/Conv"
+    op_type: "Conv"
+    attribute {
+      name: "dilations"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "group"
+      i: 1
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 64
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    input: "/conv3/Conv_output_0"
+    output: "/Relu_2_output_0"
+    name: "/Relu_2"
+    op_type: "Relu"
+  }
+  node {
+    input: "/Relu_2_output_0"
+    input: "conv3_BN.weight"
+    input: "conv3_BN.bias"
+    input: "conv3_BN.running_mean"
+    input: "conv3_BN.running_var"
+    output: "/conv3_BN/BatchNormalization_output_0"
+    name: "/conv3_BN/BatchNormalization"
+    op_type: "BatchNormalization"
+    attribute {
+      name: "epsilon"
+      f: 0.0010000000474974513
+      type: FLOAT
+    }
+    attribute {
+      name: "momentum"
+      f: 1.0
+      type: FLOAT
+    }
+  }
+  node {
+    input: "/conv3_BN/BatchNormalization_output_0"
+    output: "/MaxPool_2_output_0"
+    name: "/MaxPool_2"
+    op_type: "MaxPool"
+    attribute {
+      name: "ceil_mode"
+      i: 0
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_24_output_0"
+    name: "/Constant_24"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_25_output_0"
+    name: "/Constant_25"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 4
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_24_output_0"
+    output: "/ConstantOfShape_3_output_0"
+    name: "/ConstantOfShape_3"
+    op_type: "ConstantOfShape"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        raw_data: "\000\000\000\000\000\000\000\000"
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_25_output_0"
+    input: "/ConstantOfShape_3_output_0"
+    output: "/Concat_3_output_0"
+    name: "/Concat_3"
+    op_type: "Concat"
+    attribute {
+      name: "axis"
+      i: 0
+      type: INT
+    }
+  }
+  node {
+    output: "/Constant_26_output_0"
+    name: "/Constant_26"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 2
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Concat_3_output_0"
+    input: "/Constant_26_output_0"
+    output: "/Reshape_6_output_0"
+    name: "/Reshape_6"
+    op_type: "Reshape"
+  }
+  node {
+    output: "/Constant_27_output_0"
+    name: "/Constant_27"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_28_output_0"
+    name: "/Constant_28"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_29_output_0"
+    name: "/Constant_29"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_30_output_0"
+    name: "/Constant_30"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Reshape_6_output_0"
+    input: "/Constant_28_output_0"
+    input: "/Constant_29_output_0"
+    input: "/Constant_27_output_0"
+    input: "/Constant_30_output_0"
+    output: "/Slice_3_output_0"
+    name: "/Slice_3"
+    op_type: "Slice"
+  }
+  node {
+    input: "/Slice_3_output_0"
+    output: "/Transpose_3_output_0"
+    name: "/Transpose_3"
+    op_type: "Transpose"
+    attribute {
+      name: "perm"
+      ints: 1
+      ints: 0
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_31_output_0"
+    name: "/Constant_31"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Transpose_3_output_0"
+    input: "/Constant_31_output_0"
+    output: "/Reshape_7_output_0"
+    name: "/Reshape_7"
+    op_type: "Reshape"
+  }
+  node {
+    input: "/Reshape_7_output_0"
+    output: "/Cast_3_output_0"
+    name: "/Cast_3"
+    op_type: "Cast"
+    attribute {
+      name: "to"
+      i: 7
+      type: INT
+    }
+  }
+  node {
+    input: "/MaxPool_2_output_0"
+    input: "/Cast_3_output_0"
+    input: ""
+    output: "/Pad_3_output_0"
+    name: "/Pad_3"
+    op_type: "Pad"
+    attribute {
+      name: "mode"
+      s: "constant"
+      type: STRING
+    }
+  }
+  node {
+    input: "/Pad_3_output_0"
+    input: "conv4.weight"
+    input: "conv4.bias"
+    output: "/conv4/Conv_output_0"
+    name: "/conv4/Conv"
+    op_type: "Conv"
+    attribute {
+      name: "dilations"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "group"
+      i: 1
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 64
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    input: "/conv4/Conv_output_0"
+    output: "/Relu_3_output_0"
+    name: "/Relu_3"
+    op_type: "Relu"
+  }
+  node {
+    input: "/Relu_3_output_0"
+    input: "conv4_BN.weight"
+    input: "conv4_BN.bias"
+    input: "conv4_BN.running_mean"
+    input: "conv4_BN.running_var"
+    output: "/conv4_BN/BatchNormalization_output_0"
+    name: "/conv4_BN/BatchNormalization"
+    op_type: "BatchNormalization"
+    attribute {
+      name: "epsilon"
+      f: 0.0010000000474974513
+      type: FLOAT
+    }
+    attribute {
+      name: "momentum"
+      f: 1.0
+      type: FLOAT
+    }
+  }
+  node {
+    input: "/conv4_BN/BatchNormalization_output_0"
+    output: "/MaxPool_3_output_0"
+    name: "/MaxPool_3"
+    op_type: "MaxPool"
+    attribute {
+      name: "ceil_mode"
+      i: 0
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_32_output_0"
+    name: "/Constant_32"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_33_output_0"
+    name: "/Constant_33"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 4
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_32_output_0"
+    output: "/ConstantOfShape_4_output_0"
+    name: "/ConstantOfShape_4"
+    op_type: "ConstantOfShape"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        raw_data: "\000\000\000\000\000\000\000\000"
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_33_output_0"
+    input: "/ConstantOfShape_4_output_0"
+    output: "/Concat_4_output_0"
+    name: "/Concat_4"
+    op_type: "Concat"
+    attribute {
+      name: "axis"
+      i: 0
+      type: INT
+    }
+  }
+  node {
+    output: "/Constant_34_output_0"
+    name: "/Constant_34"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 2
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Concat_4_output_0"
+    input: "/Constant_34_output_0"
+    output: "/Reshape_8_output_0"
+    name: "/Reshape_8"
+    op_type: "Reshape"
+  }
+  node {
+    output: "/Constant_35_output_0"
+    name: "/Constant_35"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_36_output_0"
+    name: "/Constant_36"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_37_output_0"
+    name: "/Constant_37"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_38_output_0"
+    name: "/Constant_38"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Reshape_8_output_0"
+    input: "/Constant_36_output_0"
+    input: "/Constant_37_output_0"
+    input: "/Constant_35_output_0"
+    input: "/Constant_38_output_0"
+    output: "/Slice_4_output_0"
+    name: "/Slice_4"
+    op_type: "Slice"
+  }
+  node {
+    input: "/Slice_4_output_0"
+    output: "/Transpose_4_output_0"
+    name: "/Transpose_4"
+    op_type: "Transpose"
+    attribute {
+      name: "perm"
+      ints: 1
+      ints: 0
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_39_output_0"
+    name: "/Constant_39"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Transpose_4_output_0"
+    input: "/Constant_39_output_0"
+    output: "/Reshape_9_output_0"
+    name: "/Reshape_9"
+    op_type: "Reshape"
+  }
+  node {
+    input: "/Reshape_9_output_0"
+    output: "/Cast_4_output_0"
+    name: "/Cast_4"
+    op_type: "Cast"
+    attribute {
+      name: "to"
+      i: 7
+      type: INT
+    }
+  }
+  node {
+    input: "/MaxPool_3_output_0"
+    input: "/Cast_4_output_0"
+    input: ""
+    output: "/Pad_4_output_0"
+    name: "/Pad_4"
+    op_type: "Pad"
+    attribute {
+      name: "mode"
+      s: "constant"
+      type: STRING
+    }
+  }
+  node {
+    input: "/Pad_4_output_0"
+    input: "conv5.weight"
+    input: "conv5.bias"
+    output: "/conv5/Conv_output_0"
+    name: "/conv5/Conv"
+    op_type: "Conv"
+    attribute {
+      name: "dilations"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "group"
+      i: 1
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 64
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    input: "/conv5/Conv_output_0"
+    output: "/Relu_4_output_0"
+    name: "/Relu_4"
+    op_type: "Relu"
+  }
+  node {
+    input: "/Relu_4_output_0"
+    input: "conv5_BN.weight"
+    input: "conv5_BN.bias"
+    input: "conv5_BN.running_mean"
+    input: "conv5_BN.running_var"
+    output: "/conv5_BN/BatchNormalization_output_0"
+    name: "/conv5_BN/BatchNormalization"
+    op_type: "BatchNormalization"
+    attribute {
+      name: "epsilon"
+      f: 0.0010000000474974513
+      type: FLOAT
+    }
+    attribute {
+      name: "momentum"
+      f: 1.0
+      type: FLOAT
+    }
+  }
+  node {
+    input: "/conv5_BN/BatchNormalization_output_0"
+    output: "/MaxPool_4_output_0"
+    name: "/MaxPool_4"
+    op_type: "MaxPool"
+    attribute {
+      name: "ceil_mode"
+      i: 0
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_40_output_0"
+    name: "/Constant_40"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_41_output_0"
+    name: "/Constant_41"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 4
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_40_output_0"
+    output: "/ConstantOfShape_5_output_0"
+    name: "/ConstantOfShape_5"
+    op_type: "ConstantOfShape"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        raw_data: "\000\000\000\000\000\000\000\000"
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Constant_41_output_0"
+    input: "/ConstantOfShape_5_output_0"
+    output: "/Concat_5_output_0"
+    name: "/Concat_5"
+    op_type: "Concat"
+    attribute {
+      name: "axis"
+      i: 0
+      type: INT
+    }
+  }
+  node {
+    output: "/Constant_42_output_0"
+    name: "/Constant_42"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 2
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Concat_5_output_0"
+    input: "/Constant_42_output_0"
+    output: "/Reshape_10_output_0"
+    name: "/Reshape_10"
+    op_type: "Reshape"
+  }
+  node {
+    output: "/Constant_43_output_0"
+    name: "/Constant_43"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_44_output_0"
+    name: "/Constant_44"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_45_output_0"
+    name: "/Constant_45"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    output: "/Constant_46_output_0"
+    name: "/Constant_46"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Reshape_10_output_0"
+    input: "/Constant_44_output_0"
+    input: "/Constant_45_output_0"
+    input: "/Constant_43_output_0"
+    input: "/Constant_46_output_0"
+    output: "/Slice_5_output_0"
+    name: "/Slice_5"
+    op_type: "Slice"
+  }
+  node {
+    input: "/Slice_5_output_0"
+    output: "/Transpose_5_output_0"
+    name: "/Transpose_5"
+    op_type: "Transpose"
+    attribute {
+      name: "perm"
+      ints: 1
+      ints: 0
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_47_output_0"
+    name: "/Constant_47"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 1
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Transpose_5_output_0"
+    input: "/Constant_47_output_0"
+    output: "/Reshape_11_output_0"
+    name: "/Reshape_11"
+    op_type: "Reshape"
+  }
+  node {
+    input: "/Reshape_11_output_0"
+    output: "/Cast_5_output_0"
+    name: "/Cast_5"
+    op_type: "Cast"
+    attribute {
+      name: "to"
+      i: 7
+      type: INT
+    }
+  }
+  node {
+    input: "/MaxPool_4_output_0"
+    input: "/Cast_5_output_0"
+    input: ""
+    output: "/Pad_5_output_0"
+    name: "/Pad_5"
+    op_type: "Pad"
+    attribute {
+      name: "mode"
+      s: "constant"
+      type: STRING
+    }
+  }
+  node {
+    input: "/Pad_5_output_0"
+    input: "conv6.weight"
+    input: "conv6.bias"
+    output: "/conv6/Conv_output_0"
+    name: "/conv6/Conv"
+    op_type: "Conv"
+    attribute {
+      name: "dilations"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "group"
+      i: 1
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 64
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 1
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    input: "/conv6/Conv_output_0"
+    output: "/Relu_5_output_0"
+    name: "/Relu_5"
+    op_type: "Relu"
+  }
+  node {
+    input: "/Relu_5_output_0"
+    input: "conv6_BN.weight"
+    input: "conv6_BN.bias"
+    input: "conv6_BN.running_mean"
+    input: "conv6_BN.running_var"
+    output: "/conv6_BN/BatchNormalization_output_0"
+    name: "/conv6_BN/BatchNormalization"
+    op_type: "BatchNormalization"
+    attribute {
+      name: "epsilon"
+      f: 0.0010000000474974513
+      type: FLOAT
+    }
+    attribute {
+      name: "momentum"
+      f: 1.0
+      type: FLOAT
+    }
+  }
+  node {
+    input: "/conv6_BN/BatchNormalization_output_0"
+    output: "/MaxPool_5_output_0"
+    name: "/MaxPool_5"
+    op_type: "MaxPool"
+    attribute {
+      name: "ceil_mode"
+      i: 0
+      type: INT
+    }
+    attribute {
+      name: "kernel_shape"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+    attribute {
+      name: "pads"
+      ints: 0
+      ints: 0
+      ints: 0
+      ints: 0
+      type: INTS
+    }
+    attribute {
+      name: "strides"
+      ints: 2
+      ints: 1
+      type: INTS
+    }
+  }
+  node {
+    input: "/MaxPool_5_output_0"
+    output: "/Transpose_6_output_0"
+    name: "/Transpose_6"
+    op_type: "Transpose"
+    attribute {
+      name: "perm"
+      ints: 0
+      ints: 2
+      ints: 1
+      ints: 3
+      type: INTS
+    }
+  }
+  node {
+    output: "/Constant_48_output_0"
+    name: "/Constant_48"
+    op_type: "Constant"
+    attribute {
+      name: "value"
+      t {
+        dims: 2
+        data_type: 7
+        data_location: 0
+      }
+      type: TENSOR
+    }
+  }
+  node {
+    input: "/Transpose_6_output_0"
+    input: "/Constant_48_output_0"
+    output: "/Reshape_12_output_0"
+    name: "/Reshape_12"
+    op_type: "Reshape"
+  }
+  node {
+    input: "/Reshape_12_output_0"
+    input: "classifier.weight"
+    input: "classifier.bias"
+    output: "/classifier/Gemm_output_0"
+    name: "/classifier/Gemm"
+    op_type: "Gemm"
+    attribute {
+      name: "alpha"
+      f: 1.0
+      type: FLOAT
+    }
+    attribute {
+      name: "beta"
+      f: 1.0
+      type: FLOAT
+    }
+    attribute {
+      name: "transB"
+      i: 1
+      type: INT
+    }
+  }
+  node {
+    input: "/classifier/Gemm_output_0"
+    output: "output"
+    name: "/Sigmoid"
+    op_type: "Sigmoid"
+  }
+  initializer {
+      dims: 1024
+      dims: 1
+      dims: 512
+      dims: 1
+      data_type: 1
+      name: "conv1.weight"
+  }
+  initializer {
+      dims: 1024
+      data_type: 1
+      name: "conv1.bias"
+  }
+  initializer {
+      dims: 1024
+      data_type: 1
+      name: "conv1_BN.weight"
+  }
+  initializer {
+      dims: 1024
+      data_type: 1
+      name: "conv1_BN.bias"
+  }
+  initializer {
+      dims: 1024
+      data_type: 1
+      name: "conv1_BN.running_mean"
+  }
+  initializer {
+      dims: 1024
+      data_type: 1
+      name: "conv1_BN.running_var"
+  }
+  initializer {
+      dims: 128
+      dims: 1024
+      dims: 64
+      dims: 1
+      data_type: 1
+      name: "conv2.weight"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv2.bias"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv2_BN.weight"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv2_BN.bias"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv2_BN.running_mean"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv2_BN.running_var"
+  }
+  initializer {
+      dims: 128
+      dims: 128
+      dims: 64
+      dims: 1
+      data_type: 1
+      name: "conv3.weight"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv3.bias"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv3_BN.weight"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv3_BN.bias"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv3_BN.running_mean"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv3_BN.running_var"
+  }
+  initializer {
+      dims: 128
+      dims: 128
+      dims: 64
+      dims: 1
+      data_type: 1
+      name: "conv4.weight"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv4.bias"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv4_BN.weight"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv4_BN.bias"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv4_BN.running_mean"
+  }
+  initializer {
+      dims: 128
+      data_type: 1
+      name: "conv4_BN.running_var"
+  }
+  initializer {
+      dims: 256
+      dims: 128
+      dims: 64
+      dims: 1
+      data_type: 1
+      name: "conv5.weight"
+  }
+  initializer {
+      dims: 256
+      data_type: 1
+      name: "conv5.bias"
+  }
+  initializer {
+      dims: 256
+      data_type: 1
+      name: "conv5_BN.weight"
+  }
+  initializer {
+      dims: 256
+      data_type: 1
+      name: "conv5_BN.bias"
+  }
+  initializer {
+      dims: 256
+      data_type: 1
+      name: "conv5_BN.running_mean"
+  }
+  initializer {
+      dims: 256
+      data_type: 1
+      name: "conv5_BN.running_var"
+  }
+  initializer {
+      dims: 512
+      dims: 256
+      dims: 64
+      dims: 1
+      data_type: 1
+      name: "conv6.weight"
+  }
+  initializer {
+      dims: 512
+      data_type: 1
+      name: "conv6.bias"
+  }
+  initializer {
+      dims: 512
+      data_type: 1
+      name: "conv6_BN.weight"
+  }
+  initializer {
+      dims: 512
+      data_type: 1
+      name: "conv6_BN.bias"
+  }
+  initializer {
+      dims: 512
+      data_type: 1
+      name: "conv6_BN.running_mean"
+  }
+  initializer {
+      dims: 512
+      data_type: 1
+      name: "conv6_BN.running_var"
+  }
+  initializer {
+      dims: 360
+      dims: 2048
+      data_type: 1
+      name: "classifier.weight"
+  }
+  initializer {
+      dims: 360
+      data_type: 1
+      name: "classifier.bias"
+  }
+  input {
+    name: "input"
+    type {
+      tensor_type {
+        elem_type: 1
+        shape {
+          dim {
+            dim_param: "n"
+          }
+          dim {
+            dim_value: 1024
+          }
+        }
+      }
+    }
+  }
+  output {
+    name: "output"
+    type {
+      tensor_type {
+        elem_type: 1
+        shape {
+          dim {
+            dim_param: "Sigmoidoutput_dim_0"
+          }
+          dim {
+            dim_value: 360
+          }
+        }
+      }
+    }
+  }
+}
+opset_import {
+  domain: ""
+  version: 11
+}

models/onnx/ailia-models/crepe_tiny.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fd045b75d2b0d08fe7bb2711e466a4e76b625714f07ea3aabedb50598d7428e
+size 2193941

models/onnx/ailia-models/crepe_tiny.onnx.prototxt ADDED Viewed

The diff for this file is too large to render. See raw diff

models/onnx/ailia-models/source.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/rvc
+https://storage.googleapis.com/ailia-models/rvc/crepe.onnx
+https://storage.googleapis.com/ailia-models/rvc/crepe.onnx.prototxt
+https://storage.googleapis.com/ailia-models/rvc/crepe_tiny.onnx
+https://storage.googleapis.com/ailia-models/rvc/crepe_tiny.onnx.prototxt