voices / app.py
Neon-tech's picture
Create app.py
b334d2c verified
Raw
History Blame Contribute Delete
2.88 kB
import os
import subprocess
import numpy as np
import torch
import joblib
from pathlib import Path
from huggingface_hub import HfApi, create_repo
# Audio files read by train(): features are extracted from both, and the
# model is fitted on the paired source/target frames.
SOURCE_AUDIO = "source.wav"
TARGET_AUDIO = "target.wav"
# Required environment variables (names are case-sensitive as written).
# A missing variable raises KeyError as soon as this module is loaded.
# MODEL_NAME becomes the stem of the saved/uploaded "<name>.pkl" file.
MODEL_NAME = os.environ["model_name"]
# Token handed to HfApi and create_repo in push().
HF_TOKEN = os.environ["Hf_Token"]
# Hub repository (repo_type="model") that push() uploads the model file to.
HF_REPO = "Neon-AI/voice-models"
def train():
    """Train a GMM voice-conversion model and save it as ``{MODEL_NAME}.pkl``.

    Steps:
      1. Install the feature-extraction / modelling packages with pip.
      2. Extract WORLD features (mel-cepstrum and F0) from SOURCE_AUDIO and
         TARGET_AUDIO, both loaded as mono at SR Hz.
      3. Truncate both mel-cepstrum sequences to the shorter one and fit a
         full-covariance Gaussian mixture on the per-frame concatenation
         ``[source | target]``.
      4. Dump the mixture, the target/source mean-F0 ratio, per-dimension
         mean/std statistics and the analysis settings with joblib.

    Returns:
        str: path of the model file written to the working directory.

    Raises:
        subprocess.CalledProcessError: if the pip install step fails.
        ValueError: if there are fewer paired frames than mixture components,
            or if either recording contains no voiced (F0 > 0) frames.
    """
    import sys

    print(f"Training voice model: {MODEL_NAME}")

    # install rvc training deps at runtime
    # (keeps Dockerfile lean, only installs when Space runs)
    # Invoke pip through the running interpreter so the packages land in the
    # environment that imports them below, not whichever "pip" is on PATH.
    subprocess.run([
        sys.executable, "-m", "pip", "install", "-q",
        "praat-parselmouth",
        "pyworld",
        "librosa",
        "scikit-learn",
        "faiss-cpu",
        "joblib",
        "soundfile",
    ], check=True)

    # Imported only after the install step above has made them available.
    import pyworld as pw
    import librosa
    from sklearn.mixture import GaussianMixture

    SR = 16000     # sample rate every input is resampled to
    N_MCEP = 40    # mel-cepstral coefficients kept per frame
    N_FFT = 1024   # FFT size used for WORLD analysis and recorded in the model
    N_GMM = 64     # number of mixture components

    def extract(path):
        """Return ``(mcep, f0, sp, ap)`` WORLD features for the file at *path*."""
        audio, _ = librosa.load(path, sr=SR, mono=True)
        # Pass N_FFT explicitly so the analysis really uses the FFT size that
        # is stored alongside the model.
        f0, sp, ap = pw.wav2world(audio.astype(np.float64), SR, fft_size=N_FFT)
        mcep = pw.code_spectral_envelope(sp, SR, N_MCEP)
        return mcep, f0, sp, ap

    print("Extracting source features...")
    src_mcep, src_f0, _, _ = extract(SOURCE_AUDIO)
    print("Extracting target features...")
    tgt_mcep, tgt_f0, _, _ = extract(TARGET_AUDIO)

    # Pair frames by position and drop the tail of the longer recording.
    # NOTE(review): this is plain truncation, not time alignment — the two
    # recordings are presumably expected to be roughly parallel; confirm.
    min_len = min(len(src_mcep), len(tgt_mcep))
    if min_len < N_GMM:
        raise ValueError(
            f"Need at least {N_GMM} paired frames to fit the GMM, got {min_len}"
        )
    src_mcep = src_mcep[:min_len]
    tgt_mcep = tgt_mcep[:min_len]

    # Only frames with F0 > 0 are used for pitch statistics. Check before the
    # expensive fit so an unusable recording fails fast instead of producing
    # a NaN f0_ratio that gets saved and uploaded.
    src_f0_v = src_f0[src_f0 > 0]
    tgt_f0_v = tgt_f0[tgt_f0 > 0]
    if src_f0_v.size == 0 or tgt_f0_v.size == 0:
        raise ValueError(
            "No voiced frames found in source or target audio; "
            "cannot compute f0_ratio"
        )

    print(f"Training GMM-{N_GMM} on {min_len} frames...")
    gmm = GaussianMixture(
        n_components=N_GMM,
        covariance_type="full",
        max_iter=300,
        verbose=2,
        tol=1e-4,
    )
    # Joint model: each row is one source frame next to its paired target frame.
    gmm.fit(np.hstack([src_mcep, tgt_mcep]))

    model = {
        "gmm": gmm,
        "f0_ratio": float(np.mean(tgt_f0_v) / np.mean(src_f0_v)),
        "src_mean": src_mcep.mean(0),
        "src_std": src_mcep.std(0),
        "tgt_mean": tgt_mcep.mean(0),
        "tgt_std": tgt_mcep.std(0),
        "SR": SR,
        "N_MCEP": N_MCEP,
        "N_FFT": N_FFT,
    }

    out_path = f"{MODEL_NAME}.pkl"
    joblib.dump(model, out_path)
    print(f"Model saved locally: {out_path}")
    return out_path
def push(model_path):
    """Upload the model file at *model_path* to the HF_REPO model repository.

    The file is stored in the repo as ``{MODEL_NAME}.pkl``; the repository is
    requested as private and an already-existing repo is accepted.
    """
    hub = HfApi(token=HF_TOKEN)
    remote_file = f"{MODEL_NAME}.pkl"

    # create repo if it doesn't exist yet
    create_repo(
        HF_REPO,
        token=HF_TOKEN,
        repo_type="model",
        exist_ok=True,
        private=True,
    )

    hub.upload_file(
        path_or_fileobj=model_path,
        path_in_repo=remote_file,
        repo_id=HF_REPO,
        repo_type="model",
        commit_message=f"Add voice model: {MODEL_NAME}",
    )
    print(f"Pushed to {HF_REPO}/{MODEL_NAME}.pkl")
if __name__ == "__main__":
    # Script entry point: train, then upload whatever file train() produced.
    push(train())
    print("Done.")