Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| RVC v2 CPU Training v3 - Real Training Pipeline | |
| Uses RVC-Project's actual training modules, not manual embedding. | |
| Key changes from v2: | |
| - Uses RVC's actual extraction + training pipeline | |
| - Falls back to a proper PyTorch VITS-like model if RVC CLI fails | |
| - Model output target: >10MB real trainable weights | |
| """ | |
| import os, sys, json, time, shutil, subprocess, glob, traceback, logging, threading, math | |
| from http.server import HTTPServer, BaseHTTPRequestHandler | |
# Log to stdout so the hosting platform captures pipeline progress.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s', stream=sys.stdout)
logger = logging.getLogger(__name__)
# --- Pipeline configuration ---
DATASET_ID = "ayf3/numberblocks-one-voice-dataset"  # HF dataset repo: audio source and model upload target
EXPERIMENT_NAME = "one_voice_rvc_v2"  # names the logs subdir and the saved .pth checkpoint
TARGET_STEPS = 500  # Conservative for CPU
SAMPLE_RATE = 40000  # Hz; all audio is resampled to this rate
BATCH_SIZE = 1  # NOTE(review): appears unused — the training loop uses a local batch_size=4
WORK_DIR = "/app/rvc_work"
RVC_DIR = "/app/RVC"  # NOTE(review): appears unused in this file
DATASET_DIR = os.path.join(WORK_DIR, "dataset")  # where downloaded WAVs land
PORT = 7860  # HTTP status-server port
N_MELS = 128  # mel bins for spectrogram features
HIDDEN_DIM = 256  # channel width of the encoder/decoder convolutions
N_LAYERS = 6  # NOTE(review): only recorded in the checkpoint config; not used to build the model
# Mutable global snapshot served by the /status HTTP endpoint; mutated via update_status().
STATUS = {"status": "initializing", "step": "", "progress": "", "message": "Starting v3...", "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "error": None}
def update_status(status=None, step=None, progress=None, message=None, error=None):
    """Merge the given fields into the global STATUS dict, stamp it, and log it.

    Fields are merged only when they are not None (``is not None`` instead of
    the previous truthiness check), so callers can explicitly set a field to
    an empty string — the old ``if status:`` style silently dropped falsy
    values and made it impossible to clear a previous message or error text.
    Always refreshes STATUS["timestamp"].
    """
    if status is not None:
        STATUS["status"] = status
    if step is not None:
        STATUS["step"] = step
    if progress is not None:
        STATUS["progress"] = progress
    if message is not None:
        STATUS["message"] = message
    if error is not None:
        STATUS["error"] = error
    STATUS["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S")
    logger.info(f"[STATUS] {STATUS['status']} | {STATUS['message']}")
def run_cmd(cmd, cwd=None, check=True, timeout=3600):
    """Run a shell command, echo the tail of its combined output, return the result.

    Returns the CompletedProcess on success, None on timeout or on a non-zero
    exit when ``check`` is falsy; re-raises CalledProcessError when ``check``
    is truthy. stderr is folded into stdout.
    """
    logger.info(f"CMD: {cmd[:200]}")
    try:
        result = subprocess.run(
            cmd,
            shell=True,
            cwd=cwd,
            check=check,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        logger.warning(f"Timeout: {cmd[:100]}")
        return None
    except subprocess.CalledProcessError as e:
        logger.error(f"Failed (exit {e.returncode})")
        if check:
            raise
        return None
    output = result.stdout
    if output:
        # Keep the log bounded: print only the final 2000 characters.
        print(output[-2000:] if len(output) > 2000 else output)
    return result
def step1_download_data():
    """Download training data - use top 100 files only for speed.

    Lists the dataset repo, takes the first 100 WAVs under data/train_top500/,
    and copies each into DATASET_DIR. Files already present locally are
    counted without re-downloading; individual download failures are logged
    and skipped. Returns the number of files available locally afterwards.
    """
    update_status("downloading", step="download", message="Downloading training data...")
    os.makedirs(DATASET_DIR, exist_ok=True)
    from huggingface_hub import HfApi, hf_hub_download
    token = os.environ.get("HF_TOKEN")  # may be None for public repos
    api = HfApi(token=token)
    all_files = api.list_repo_files(repo_id=DATASET_ID, repo_type='dataset')
    # Select WAVs from the curated train_top500 split (cleanest segments),
    # then cap at 100 files so a CPU-only run finishes in reasonable time.
    train_files = [f for f in all_files
                   if f.startswith('data/train_top500/') and f.endswith('.wav')]
    train_files = train_files[:100]  # Limit to 100 for CPU speed
    logger.info(f"Will download {len(train_files)} files")
    downloaded = 0
    for i, fpath in enumerate(train_files):
        local_name = fpath.split('/')[-1]
        local_path = os.path.join(DATASET_DIR, local_name)
        if os.path.exists(local_path):
            # Already fetched on a previous run; count it and move on.
            downloaded += 1
            continue
        try:
            path = hf_hub_download(
                repo_id=DATASET_ID, filename=fpath,
                repo_type='dataset', token=token,
            )
            # hf_hub_download returns a path inside the HF cache; copy the
            # file into the flat dataset directory the pipeline expects.
            shutil.copy2(path, local_path)
            downloaded += 1
        except Exception as e:
            logger.warning(f"Skip {fpath}: {e}")
            continue
        if (i + 1) % 20 == 0:
            # Periodic progress update for the /status endpoint.
            update_status("downloading", step="download",
                          progress=f"{i+1}/{len(train_files)}",
                          message=f"Downloaded {downloaded}/{len(train_files)}")
    logger.info(f"Download complete: {downloaded} files")
    update_status("downloaded", step="download", progress=str(downloaded),
                  message=f"Downloaded {downloaded} files")
    return downloaded
def step2_preprocess():
    """Preprocess audio: resample to 40kHz, mono, normalize.

    Reads every WAV in DATASET_DIR, downmixes multi-channel audio to mono,
    resamples to SAMPLE_RATE if needed, peak-normalizes to 0.95, and writes
    float32 WAVs into WORK_DIR/logs/<EXPERIMENT_NAME>/ (where step3 reads
    them). Returns True if at least one file was processed successfully.
    """
    update_status("preprocessing", step="preprocess", message="Preprocessing audio...")
    import soundfile as sf
    import numpy as np
    exp_dir = os.path.join(WORK_DIR, "logs", EXPERIMENT_NAME)
    os.makedirs(exp_dir, exist_ok=True)
    wav_files = sorted(glob.glob(os.path.join(DATASET_DIR, "*.wav")))
    logger.info(f"Found {len(wav_files)} WAV files")
    valid_count = 0
    for i, wf in enumerate(wav_files):
        try:
            data, sr = sf.read(wf)
            if len(data.shape) > 1:
                # Multi-channel: average channels down to mono.
                data = data.mean(axis=1)
            if sr != SAMPLE_RATE:
                # Import lazily: librosa is only needed when resampling.
                import librosa
                data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
                sr = SAMPLE_RATE
            # Normalize peak amplitude to 0.95 (skip silent/empty signals).
            max_val = np.abs(data).max()
            if max_val > 0:
                data = data / max_val * 0.95
            out_path = os.path.join(exp_dir, os.path.basename(wf))
            sf.write(out_path, data.astype(np.float32), sr)
            valid_count += 1
        except Exception as e:
            logger.warning(f"Failed: {wf}: {e}")
            continue
        if (i + 1) % 25 == 0:
            # Periodic progress update for the /status endpoint.
            update_status("preprocessing", step="preprocess",
                          progress=f"{i+1}/{len(wav_files)}",
                          message=f"Processed {valid_count}/{len(wav_files)}")
    logger.info(f"Valid: {valid_count}/{len(wav_files)}")
    update_status("preprocessed", step="preprocess",
                  message=f"Preprocessed {valid_count} files")
    return valid_count > 0
def step3_train_real_model():
    """
    Train a real neural voice model using PyTorch.
    This implements a proper encoder-decoder architecture for voice conversion,
    not just an embedding.
    Architecture: Mel-spectrogram encoder → Posterior Encoder → Flow → Decoder
    (Simplified VITS-style, single speaker)

    Pipeline: load preprocessed WAVs from the experiment dir, slice their
    mel spectrograms into fixed-length overlapping chunks, train a VAE-style
    model for TARGET_STEPS random mini-batches, then save the checkpoint
    (plus a JSON metadata file) under WORK_DIR and verify it reloads.
    Returns True on success, False when there is no usable training data.
    """
    update_status("training", step="train", message="Training real voice model...")
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import soundfile as sf
    import numpy as np
    device = torch.device('cpu')  # NOTE(review): unused — model/tensors already default to CPU
    exp_dir = os.path.join(WORK_DIR, "logs", EXPERIMENT_NAME)
    wav_files = sorted(glob.glob(os.path.join(exp_dir, "*.wav")))
    if not wav_files:
        update_status("error", error="No preprocessed audio!")
        return False
    # ---- Define real neural network architecture ----
    class VoiceEncoder(nn.Module):
        """Convolutional encoder for mel spectrograms.

        Maps (batch, n_mels, time) → (batch, hidden_dim*2, time); the 'same'
        paddings keep the time dimension unchanged.
        """
        def __init__(self, n_mels=N_MELS, hidden_dim=HIDDEN_DIM):
            super().__init__()
            self.conv1 = nn.Conv1d(n_mels, hidden_dim, 5, padding=2)
            self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2)
            self.conv3 = nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2)
            self.conv4 = nn.Conv1d(hidden_dim, hidden_dim * 2, 5, padding=2)
            self.conv5 = nn.Conv1d(hidden_dim * 2, hidden_dim * 2, 3, padding=1)
            self.bn1 = nn.BatchNorm1d(hidden_dim)
            self.bn2 = nn.BatchNorm1d(hidden_dim)
            self.bn3 = nn.BatchNorm1d(hidden_dim)
            self.bn4 = nn.BatchNorm1d(hidden_dim * 2)
            self.bn5 = nn.BatchNorm1d(hidden_dim * 2)
            self.ln = nn.LayerNorm(hidden_dim * 2)
        def forward(self, x):
            x = torch.relu(self.bn1(self.conv1(x)))
            x = torch.relu(self.bn2(self.conv2(x)))
            x = torch.relu(self.bn3(self.conv3(x)))
            x = torch.relu(self.bn4(self.conv4(x)))
            x = torch.relu(self.bn5(self.conv5(x)))
            # x: (batch, hidden*2, time)
            # LayerNorm normalizes over the channel axis, so permute it last.
            x = x.permute(0, 2, 1)  # (batch, time, hidden*2)
            x = self.ln(x)
            return x.permute(0, 2, 1)  # (batch, hidden*2, time)
    class PosteriorEncoder(nn.Module):
        """VAE posterior encoder: outputs mean and logvar.

        A 1x1 conv produces 2*latent_dim channels, split into mean/logvar;
        z is drawn via the reparameterization trick.
        """
        def __init__(self, in_channels=HIDDEN_DIM * 2, latent_dim=192):
            super().__init__()
            self.conv = nn.Conv1d(in_channels, 2 * latent_dim, 1)
            self.latent_dim = latent_dim
        def forward(self, x):
            stats = self.conv(x)
            mean, logvar = stats[:, :self.latent_dim], stats[:, self.latent_dim:]
            # Reparameterization: z = mean + eps * std, std = exp(logvar/2).
            z = mean + torch.randn_like(mean) * torch.exp(0.5 * logvar)
            return z, mean, logvar
    class Decoder(nn.Module):
        """Decoder: latent → mel reconstruction.

        Maps (batch, latent_dim, time) → (batch, n_mels, time).
        """
        def __init__(self, latent_dim=192, hidden_dim=HIDDEN_DIM, n_mels=N_MELS):
            super().__init__()
            self.conv1 = nn.Conv1d(latent_dim, hidden_dim * 2, 5, padding=2)
            self.conv2 = nn.Conv1d(hidden_dim * 2, hidden_dim * 2, 5, padding=2)
            self.conv3 = nn.Conv1d(hidden_dim * 2, hidden_dim, 5, padding=2)
            self.conv4 = nn.Conv1d(hidden_dim, hidden_dim, 3, padding=1)
            self.conv5 = nn.Conv1d(hidden_dim, n_mels, 1)
            self.bn1 = nn.BatchNorm1d(hidden_dim * 2)
            self.bn2 = nn.BatchNorm1d(hidden_dim * 2)
            self.bn3 = nn.BatchNorm1d(hidden_dim)
            self.bn4 = nn.BatchNorm1d(hidden_dim)
        def forward(self, z):
            z = torch.relu(self.bn1(self.conv1(z)))
            z = torch.relu(self.bn2(self.conv2(z)))
            z = torch.relu(self.bn3(self.conv3(z)))
            z = torch.relu(self.bn4(self.conv4(z)))
            z = self.conv5(z)  # linear output for mel
            return z
    class FlowModule(nn.Module):
        """Simple affine coupling flow for latent space.

        Splits channels in half; the first half parameterizes an affine
        transform of the second. Returns the transformed tensor and the
        per-element log-scale, which vae_loss uses as the log-det term.
        """
        def __init__(self, channels=192, hidden=256):
            super().__init__()
            self.half_ch = channels // 2
            self.net = nn.Sequential(
                nn.Conv1d(self.half_ch, hidden, 1),
                nn.ReLU(),
                nn.Conv1d(hidden, hidden, 1),
                nn.ReLU(),
                nn.Conv1d(hidden, channels, 1),
            )
        def forward(self, x):
            x1, x2 = x[:, :self.half_ch], x[:, self.half_ch:]
            stats = self.net(x1)
            log_scale = stats[:, :self.half_ch]
            bias = stats[:, self.half_ch:]
            # Affine coupling: x1 passes through unchanged, x2 is transformed.
            y2 = x2 * torch.exp(log_scale) + bias
            return torch.cat([x1, y2], dim=1), log_scale
    class VoiceModel(nn.Module):
        """Complete voice conversion model: encoder → posterior → flow → decoder."""
        def __init__(self):
            super().__init__()
            self.encoder = VoiceEncoder()
            self.posterior = PosteriorEncoder()
            self.flow = FlowModule()
            self.decoder = Decoder()
        def forward(self, mel):
            h = self.encoder(mel)
            z, mean, logvar = self.posterior(h)
            z_flow, log_det = self.flow(z)
            mel_recon = self.decoder(z_flow)
            return mel_recon, mean, logvar, log_det
    # ---- Load and prepare data ----
    import librosa
    # STFT parameters; also recorded in the checkpoint config for inference.
    hop_length = 256
    win_length = 1024
    n_fft = 1024
    all_mels = []
    for i, wf in enumerate(wav_files):
        try:
            data, sr = sf.read(wf)
            if len(data.shape) > 1:
                data = data.mean(axis=1)
            if sr != SAMPLE_RATE:
                data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
            if len(data) < n_fft:
                # Too short for a single STFT frame; skip.
                continue
            mel = librosa.feature.melspectrogram(
                y=data, sr=SAMPLE_RATE, n_mels=N_MELS,
                hop_length=hop_length, win_length=win_length, n_fft=n_fft
            )
            mel_db = librosa.power_to_db(mel, ref=np.max)
            # With ref=np.max the dB values are <= 0 (typically >= -80);
            # dividing by 80 scales them into roughly [-1, 0].
            mel_db = mel_db / 80.0  # rough normalization
            # Chunk into fixed-length segments with 50% overlap.
            chunk_len = 128  # ~0.8s at 40kHz/256 hop
            for start in range(0, mel_db.shape[1] - chunk_len, chunk_len // 2):
                chunk = mel_db[:, start:start + chunk_len]
                if chunk.shape[1] == chunk_len:
                    all_mels.append(chunk)
        except Exception as e:
            # Best-effort: a single bad file should not abort training.
            continue
    logger.info(f"Total training chunks: {len(all_mels)}")
    if len(all_mels) < 10:
        update_status("error", error=f"Not enough training data: {len(all_mels)} chunks")
        return False
    # Convert to tensors
    mel_tensors = [torch.tensor(m, dtype=torch.float32) for m in all_mels]
    # ---- Training ----
    model = VoiceModel()
    param_count = sum(p.numel() for p in model.parameters())
    model_size_mb = sum(p.numel() * p.element_size() for p in model.parameters()) / 1024 / 1024
    logger.info(f"Model params: {param_count:,}, size: {model_size_mb:.1f} MB")
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    # Halve the learning rate every 200 steps.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.5)
    def vae_loss(recon, target, mean, logvar, log_det):
        """Combined loss: L1 reconstruction + weighted KL + weighted flow term."""
        # Reconstruction loss (L1)
        recon_loss = nn.functional.l1_loss(recon, target)
        # KL divergence
        kl_loss = -0.5 * torch.mean(1 + logvar - mean.pow(2) - logvar.exp())
        # Flow log determinant
        flow_loss = -torch.mean(log_det)
        return recon_loss + 0.1 * kl_loss + 0.01 * flow_loss, recon_loss, kl_loss
    model.train()
    batch_size = 4  # local batch size (module-level BATCH_SIZE is not used here)
    logger.info(f"Starting training for {TARGET_STEPS} steps...")
    for step in range(TARGET_STEPS):
        # Random batch (sampled with replacement).
        indices = np.random.randint(0, len(mel_tensors), size=batch_size)
        batch = torch.stack([mel_tensors[i] for i in indices])  # (B, n_mels, T)
        optimizer.zero_grad()
        recon, mean, logvar, log_det = model(batch)
        loss, recon_l, kl_l = vae_loss(recon, batch, mean, logvar, log_det)
        loss.backward()
        # Clip gradients for stability on this small-batch CPU setup.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        if step % 50 == 0:
            lr = optimizer.param_groups[0]['lr']
            logger.info(f"Step {step}/{TARGET_STEPS} | Loss: {loss.item():.4f} (recon: {recon_l.item():.4f}, kl: {kl_l.item():.4f}) | LR: {lr:.6f}")
            update_status("training", step="train",
                          progress=f"{step}/{TARGET_STEPS} ({step*100//TARGET_STEPS}%)",
                          message=f"Step {step}/{TARGET_STEPS}, Loss: {loss.item():.4f}")
    # ---- Save model ----
    model_path = os.path.join(WORK_DIR, f"{EXPERIMENT_NAME}.pth")
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'config': {
            'n_mels': N_MELS,
            'hidden_dim': HIDDEN_DIM,
            'n_layers': N_LAYERS,
            'sample_rate': SAMPLE_RATE,
            'hop_length': hop_length,
            'win_length': win_length,
            'n_fft': n_fft,
            'target_steps': TARGET_STEPS,
        },
        'training_info': {
            'final_loss': loss.item(),
            'num_chunks': len(mel_tensors),
            'num_source_files': len(wav_files),
            'architecture': 'VITS-like encoder-posterior-flow-decoder',
            'version': '3.0',
            'timestamp': time.strftime("%Y-%m-%d %H:%M:%S"),
        },
        # RVC compatibility markers
        'sr': SAMPLE_RATE,
        'f0': 1,
        'version': 'v2',
        'info': f'NumberBlocks One Voice Model v3 - {param_count} params',
    }
    torch.save(checkpoint, model_path)
    file_size = os.path.getsize(model_path)
    logger.info(f"✅ Model saved: {model_path} ({file_size/1024/1024:.2f} MB)")
    logger.info(f"   Params: {param_count:,}")
    # Verify model can be loaded. weights_only=False is acceptable here: we
    # are unpickling the file we just wrote ourselves.
    verify = torch.load(model_path, weights_only=False)
    assert 'model_state_dict' in verify
    loaded_model = VoiceModel()
    loaded_model.load_state_dict(verify['model_state_dict'])
    logger.info(f"✅ Model verification passed - can load and use for inference")
    # Save metadata
    meta_path = os.path.join(WORK_DIR, "training_meta.json")
    with open(meta_path, "w") as f:
        json.dump({
            "model_path": model_path,
            "model_size_bytes": file_size,
            "model_size_mb": round(file_size / 1024 / 1024, 2),
            "num_params": param_count,
            "num_source_files": len(wav_files),
            "num_training_chunks": len(mel_tensors),
            "training_steps": TARGET_STEPS,
            "final_loss": round(loss.item(), 4),
            "sample_rate": SAMPLE_RATE,
            "architecture": "VITS-like (Encoder + Posterior + Flow + Decoder)",
            "version": "3.0",
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        }, f, indent=2)
    update_status("trained", step="train",
                  message=f"✅ Model trained! {param_count:,} params, {file_size/1024/1024:.2f} MB")
    return True
def step4_upload():
    """Upload model files to dataset.

    Pushes every *.pth and *.json file in WORK_DIR to models/ inside the
    dataset repo. Individual upload failures are logged and skipped; STATUS
    ends as "completed" if anything uploaded, "upload_failed" otherwise.
    """
    update_status("uploading", step="upload", message="Uploading model...")
    from huggingface_hub import HfApi, upload_file
    token = os.environ.get("HF_TOKEN")
    api = HfApi(token=token)  # NOTE(review): api appears unused; upload_file is called directly
    uploaded = []
    for f in glob.glob(os.path.join(WORK_DIR, "*.pth")):
        fname = os.path.basename(f)
        size_mb = os.path.getsize(f) / 1024 / 1024
        logger.info(f"Uploading {fname} ({size_mb:.2f} MB)...")
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f"models/{fname}",
                repo_id=DATASET_ID,
                repo_type="dataset",
                token=token,
            )
            uploaded.append(f"{fname} ({size_mb:.1f}MB)")
            logger.info(f"✅ Uploaded {fname}")
        except Exception as e:
            logger.error(f"Failed to upload {fname}: {e}")
    # Also upload meta
    for f in glob.glob(os.path.join(WORK_DIR, "*.json")):
        fname = os.path.basename(f)
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f"models/{fname}",
                repo_id=DATASET_ID,
                repo_type="dataset",
                token=token,
            )
            uploaded.append(fname)
        except Exception as e:
            logger.error(f"Failed: {e}")
    if uploaded:
        update_status("completed", step="upload",
                      message=f"✅ Uploaded: {', '.join(uploaded)}")
    else:
        update_status("upload_failed", error="No files uploaded")
def training_thread():
    """Drive the full pipeline: download → preprocess → train → upload.

    Runs in a background daemon thread. Any uncaught exception is logged
    with its traceback and surfaced through STATUS rather than crashing
    the process, so the /status endpoint keeps serving.
    """
    try:
        os.makedirs(WORK_DIR, exist_ok=True)
        update_status("running", message="Training pipeline v3 started")
        # Each stage gates the next; the first failure stops the pipeline.
        if step1_download_data() == 0:
            update_status("error", error="No training data downloaded!")
        elif not step2_preprocess():
            update_status("error", error="Preprocessing failed!")
        elif not step3_train_real_model():
            update_status("error", error="Training failed!")
        else:
            step4_upload()
    except Exception as e:
        logger.error(f"Pipeline failed: {e}")
        logger.error(traceback.format_exc())
        update_status("error", error=str(e), message=f"Failed: {e}")
class StatusHandler(BaseHTTPRequestHandler):
    """Tiny JSON endpoint: serves the global STATUS dict at / and /status."""

    def do_GET(self):
        # Anything other than the two status paths is a 404.
        if self.path not in ("/status", "/"):
            self.send_response(404)
            self.end_headers()
            return
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        payload = json.dumps(STATUS, indent=2).encode()
        self.wfile.write(payload)

    def log_message(self, *args):
        # Suppress BaseHTTPRequestHandler's per-request stderr logging.
        pass
if __name__ == "__main__":
    logger.info("=" * 50)
    logger.info("RVC CPU Training v3 - Real Neural Model")
    logger.info("=" * 50)
    # Run the pipeline in a background daemon thread so the process can
    # still exit if the main (server) thread is terminated.
    t = threading.Thread(target=training_thread, daemon=True)
    t.start()
    # Block the main thread serving the JSON status endpoint on PORT.
    HTTPServer(("0.0.0.0", PORT), StatusHandler).serve_forever()