Update extract.py
Browse files - extract.py +9 -55
extract.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
"""
|
| 2 |
Module which exposes functionality for extracting training features from
|
| 3 |
-
audio datasets.
|
| 4 |
"""
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
-
|
| 8 |
from multiprocessing import cpu_count
|
| 9 |
|
| 10 |
from ultimate_rvc.core.common import (
|
|
@@ -24,7 +23,6 @@ from ultimate_rvc.typing_extra import (
|
|
| 24 |
TrainingF0Method,
|
| 25 |
)
|
| 26 |
|
| 27 |
-
|
| 28 |
def extract_features(
|
| 29 |
model_name: str,
|
| 30 |
f0_method: TrainingF0Method = TrainingF0Method.RMVPE,
|
|
@@ -36,47 +34,6 @@ def extract_features(
|
|
| 36 |
hardware_acceleration: DeviceType = DeviceType.AUTOMATIC,
|
| 37 |
gpu_ids: set[int] | None = None,
|
| 38 |
) -> None:
|
| 39 |
-
"""
|
| 40 |
-
Extract features from the preprocessed dataset associated with a
|
| 41 |
-
voice model to be trained.
|
| 42 |
-
|
| 43 |
-
Parameters
|
| 44 |
-
----------
|
| 45 |
-
model_name : str
|
| 46 |
-
The name of the voice model to be trained.
|
| 47 |
-
f0_method : TrainingF0Method, default=TrainingF0Method.RMVPE
|
| 48 |
-
The method to use for extracting pitch features.
|
| 49 |
-
hop_length : int, default=128
|
| 50 |
-
The hop length to use for extracting pitch features. Only used
|
| 51 |
-
with the CREPE pitch extraction method.
|
| 52 |
-
embedder_model : EmbedderModel, default=EmbedderModel.CONTENTVEC
|
| 53 |
-
The model to use for extracting audio embeddings.
|
| 54 |
-
custom_embedder_model : StrPath, optional
|
| 55 |
-
The name of the custom embedder model to use for extracting
|
| 56 |
-
audio embeddings.
|
| 57 |
-
include_mutes : int, default=2
|
| 58 |
-
The number of mute audio files to include in the generated
|
| 59 |
-
training file list. Adding silent files enables the voice model
|
| 60 |
-
to handle pure silence in inferred audio files. If the
|
| 61 |
-
preprocessed audio dataset already contains segments of pure
|
| 62 |
-
silence, set this to 0.
|
| 63 |
-
cpu_cores : int, default=cpu_count()
|
| 64 |
-
The number of CPU cores to use for feature extraction.
|
| 65 |
-
hardware_acceleration : DeviceType, default=DeviceType.AUTOMATIC
|
| 66 |
-
The type of hardware acceleration to use for feature extraction.
|
| 67 |
-
`AUTOMATIC` will select the first available GPU and fall back to
|
| 68 |
-
CPU if no GPUs are available.
|
| 69 |
-
gpu_ids : set[int], optional
|
| 70 |
-
Set of ids of the GPUs to use for feature extraction when `GPU`
|
| 71 |
-
is selected for hardware acceleration.
|
| 72 |
-
|
| 73 |
-
Raises
|
| 74 |
-
------
|
| 75 |
-
ModelAsssociatedEntityNotFoundError
|
| 76 |
-
If no preprocessed dataset audio files are associated with the
|
| 77 |
-
voice model identified by the provided name.
|
| 78 |
-
|
| 79 |
-
"""
|
| 80 |
model_path = validate_model(model_name, Entity.TRAINING_MODEL)
|
| 81 |
sliced_audios16k_path = model_path / "sliced_audios_16k"
|
| 82 |
if not sliced_audios16k_path.is_dir() or not any(sliced_audios16k_path.iterdir()):
|
|
@@ -100,19 +57,20 @@ def extract_features(
|
|
| 100 |
chosen_embedder_model = str(custom_embedder_model_path)
|
| 101 |
embedder_model_id = f"custom_{combined_file_hash}"
|
| 102 |
|
|
|
|
| 103 |
f0_method_id = f0_method
|
| 104 |
if f0_method in {TrainingF0Method.CREPE, TrainingF0Method.CREPE_TINY}:
|
| 105 |
f0_method_id = f"{f0_method}_{hop_length}"
|
|
|
|
|
|
|
| 106 |
|
| 107 |
device_type, device_ids = validate_devices(hardware_acceleration, gpu_ids)
|
| 108 |
-
|
| 109 |
devices = (
|
| 110 |
[f"{device_type}:{device_id}" for device_id in device_ids]
|
| 111 |
if device_ids
|
| 112 |
else [device_type]
|
| 113 |
)
|
| 114 |
-
|
| 115 |
-
# so we import it here manually
|
| 116 |
from ultimate_rvc.rvc.train.extract import extract # noqa: PLC0415
|
| 117 |
|
| 118 |
file_infos = extract.initialize_extraction(
|
|
@@ -125,24 +83,20 @@ def extract_features(
|
|
| 125 |
chosen_embedder_model,
|
| 126 |
combined_file_hash,
|
| 127 |
)
|
|
|
|
| 128 |
display_progress("[~] Extracting pitch features...")
|
| 129 |
extract.run_pitch_extraction(file_infos, devices, f0_method, hop_length, cpu_cores)
|
|
|
|
| 130 |
display_progress("[~] Extracting audio embeddings...")
|
| 131 |
extract.run_embedding_extraction(
|
| 132 |
file_infos,
|
| 133 |
devices,
|
| 134 |
embedder_model,
|
| 135 |
-
(
|
| 136 |
-
str(custom_embedder_model_path)
|
| 137 |
-
if custom_embedder_model_path is not None
|
| 138 |
-
else None
|
| 139 |
-
),
|
| 140 |
cpu_cores,
|
| 141 |
)
|
| 142 |
-
# NOTE The lazy_import function does not work with the package below
|
| 143 |
-
# so we import it here manually
|
| 144 |
-
from ultimate_rvc.rvc.train.extract import preparing_files # noqa: PLC0415
|
| 145 |
|
|
|
|
| 146 |
preparing_files.generate_config(str(model_path))
|
| 147 |
preparing_files.generate_filelist(
|
| 148 |
str(model_path),
|
|
|
|
| 1 |
"""
|
| 2 |
Module which exposes functionality for extracting training features from
|
| 3 |
+
audio datasets, now with DJCM support.
|
| 4 |
"""
|
| 5 |
|
| 6 |
from __future__ import annotations
|
|
|
|
| 7 |
from multiprocessing import cpu_count
|
| 8 |
|
| 9 |
from ultimate_rvc.core.common import (
|
|
|
|
| 23 |
TrainingF0Method,
|
| 24 |
)
|
| 25 |
|
|
|
|
| 26 |
def extract_features(
|
| 27 |
model_name: str,
|
| 28 |
f0_method: TrainingF0Method = TrainingF0Method.RMVPE,
|
|
|
|
| 34 |
hardware_acceleration: DeviceType = DeviceType.AUTOMATIC,
|
| 35 |
gpu_ids: set[int] | None = None,
|
| 36 |
) -> None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
model_path = validate_model(model_name, Entity.TRAINING_MODEL)
|
| 38 |
sliced_audios16k_path = model_path / "sliced_audios_16k"
|
| 39 |
if not sliced_audios16k_path.is_dir() or not any(sliced_audios16k_path.iterdir()):
|
|
|
|
| 57 |
chosen_embedder_model = str(custom_embedder_model_path)
|
| 58 |
embedder_model_id = f"custom_{combined_file_hash}"
|
| 59 |
|
| 60 |
+
# Generate f0_method_id
|
| 61 |
f0_method_id = f0_method
|
| 62 |
if f0_method in {TrainingF0Method.CREPE, TrainingF0Method.CREPE_TINY}:
|
| 63 |
f0_method_id = f"{f0_method}_{hop_length}"
|
| 64 |
+
elif f0_method == TrainingF0Method.DJCM:
|
| 65 |
+
f0_method_id = "djcm" # DJCM does not need hop_length
|
| 66 |
|
| 67 |
device_type, device_ids = validate_devices(hardware_acceleration, gpu_ids)
|
|
|
|
| 68 |
devices = (
|
| 69 |
[f"{device_type}:{device_id}" for device_id in device_ids]
|
| 70 |
if device_ids
|
| 71 |
else [device_type]
|
| 72 |
)
|
| 73 |
+
|
|
|
|
| 74 |
from ultimate_rvc.rvc.train.extract import extract # noqa: PLC0415
|
| 75 |
|
| 76 |
file_infos = extract.initialize_extraction(
|
|
|
|
| 83 |
chosen_embedder_model,
|
| 84 |
combined_file_hash,
|
| 85 |
)
|
| 86 |
+
|
| 87 |
display_progress("[~] Extracting pitch features...")
|
| 88 |
extract.run_pitch_extraction(file_infos, devices, f0_method, hop_length, cpu_cores)
|
| 89 |
+
|
| 90 |
display_progress("[~] Extracting audio embeddings...")
|
| 91 |
extract.run_embedding_extraction(
|
| 92 |
file_infos,
|
| 93 |
devices,
|
| 94 |
embedder_model,
|
| 95 |
+
str(custom_embedder_model_path) if custom_embedder_model_path else None,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
cpu_cores,
|
| 97 |
)
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
from ultimate_rvc.rvc.train.extract import preparing_files # noqa: PLC0415
|
| 100 |
preparing_files.generate_config(str(model_path))
|
| 101 |
preparing_files.generate_filelist(
|
| 102 |
str(model_path),
|