| # #!/usr/bin/env python3 | |
| # """ | |
| # Audio Processing Script for Boson Codes | |
| # Processes audio files in parallel using Higgs Audio Tokenizer | |
| # and saves encoded representations as .pt files. | |
| # """ | |
| # import os | |
| # import sys | |
| # import json | |
| # import torch | |
| # import librosa | |
| # import numpy as np | |
| # import warnings | |
| # import argparse | |
| # from pathlib import Path | |
| # from multiprocessing import Pool | |
| # from tqdm import tqdm | |
| # from datasets import load_from_disk | |
| # from higgs_audio_tokenizer import HiggsAudioTokenizer | |
| # # Suppress PyTorch FutureWarnings | |
| # warnings.filterwarnings("ignore", category=FutureWarning) | |
| # # Global configuration | |
| # DEFAULT_OUTPUT_DIR = "/home/ubuntu/boson_codes" | |
| # DEFAULT_NUM_CORES = 48 | |
| # DEFAULT_SAMPLE_RATE = 44100 | |
| # DEFAULT_DATASET_PATH = "/home/ubuntu/ttsar/Layla/src_bpe_2/data" | |
| # # Model paths | |
| # CONFIG_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/config.json" | |
| # MODEL_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/model.pth" | |
| # # Global model variable (initialized in each worker) | |
| # model = None | |
| # def init_worker(): | |
| # """Initialize model once per worker process.""" | |
| # global model | |
| # device = 'cpu' | |
| # # Load config | |
| # with open(CONFIG_PATH, 'r') as f: | |
| # config = json.load(f) | |
| # # Initialize model | |
| # model = HiggsAudioTokenizer( | |
| # **config, | |
| # device=device, | |
| # ) | |
| # # Load weights | |
| # parameter_dict = torch.load(MODEL_PATH, map_location=device) | |
| # _ = model.load_state_dict(parameter_dict, strict=False) | |
| # model = model.to(device) | |
| # _ = model.eval() | |
| # print(f"Model loaded in worker {os.getpid()}") | |
| # def process_audio_file(args): | |
| # """Process a single audio file using pre-loaded model.""" | |
| # filename, output_dir, sample_rate = args | |
| # try: | |
| # # Output filename - same name, just change extension to .pt | |
| # base_name = Path(filename).stem | |
| # output_path = os.path.join(output_dir, f"{base_name}.pt") | |
| # # Skip if exists (double-check in case of race conditions) | |
| # if os.path.exists(output_path): | |
| # return ("skipped", filename) | |
| # # Load and process audio | |
| # wav, sr = librosa.load(filename, sr=sample_rate) | |
| # wav = torch.from_numpy(wav).unsqueeze(0).float().to('cpu') | |
| # # Encode using the pre-loaded model | |
| # with torch.no_grad(): | |
| # encoded = model._xcodec_encode(wav.unsqueeze(0)) | |
| # # Save codes only | |
| # torch.save(encoded.audio_codes, output_path) | |
| # return ("success", filename) | |
| # except Exception as e: | |
| # return ("error", filename, str(e)) | |
| # def load_dataset(dataset_path): | |
| # """Load and prepare the dataset.""" | |
| # print(f"Loading dataset from: {dataset_path}") | |
| # ds = load_from_disk(dataset_path) | |
| # print(f"Dataset info: {ds}") | |
| # # Remove unnecessary columns | |
| # columns_to_remove = ['spk', 'duration', 'codes', 'input_ids', 'attention_mask'] | |
| # existing_columns = [col for col in columns_to_remove if col in ds.column_names] | |
| # if existing_columns: | |
| # ds = ds.remove_columns(existing_columns) | |
| # print(f"Removed columns: {existing_columns}") | |
| # # Convert to pandas DataFrame | |
| # df = ds.to_pandas() | |
| # print(f"Loaded {len(df)} files from dataset") | |
| # return df | |
| # def main(args): | |
| # """Main processing function.""" | |
| # # Change to audio processing directory | |
| # os.chdir("/home/ubuntu/ttsar/boson_audio_codec/audio_processing") | |
| # print(f"Working directory: {os.getcwd()}") | |
| # # Create output directory | |
| # os.makedirs(args.output_dir, exist_ok=True) | |
| # print(f"Output directory: {args.output_dir}") | |
| # # Check if model files exist | |
| # if not os.path.exists(CONFIG_PATH): | |
| # print(f"Error: Config file not found at {CONFIG_PATH}") | |
| # sys.exit(1) | |
| # if not os.path.exists(MODEL_PATH): | |
| # print(f"Error: Model file not found at {MODEL_PATH}") | |
| # sys.exit(1) | |
| # # Load dataset | |
| # df = load_dataset(args.dataset_path) | |
| # # Get filenames from dataframe | |
| # all_filenames = df['filename'].tolist() | |
| # # Pre-filter to exclude already processed files | |
| # filenames_to_process = [] | |
| # already_processed = [] | |
| # print(f"\nChecking for already processed files...") | |
| # for filename in all_filenames: | |
| # base_name = Path(filename).stem | |
| # output_path = os.path.join(args.output_dir, f"{base_name}.pt") | |
| # if os.path.exists(output_path): | |
| # already_processed.append(filename) | |
| # else: | |
| # filenames_to_process.append(filename) | |
| # print(f"\nTotal files: {len(all_filenames)}") | |
| # print(f"Already processed: {len(already_processed)}") | |
| # print(f"To process: {len(filenames_to_process)}") | |
| # if len(filenames_to_process) == 0: | |
| # print("\nAll files have already been processed!") | |
| # return | |
| # print(f"\nProcessing {len(filenames_to_process)} files using {args.num_cores} cores...") | |
| # print(f"Sample rate: {args.sample_rate} Hz") | |
| # # Prepare arguments for multiprocessing | |
| # process_args = [(filename, args.output_dir, args.sample_rate) | |
| # for filename in filenames_to_process] | |
| # # Process in parallel with model reuse | |
| # with Pool(processes=args.num_cores, initializer=init_worker) as pool: | |
| # results = list(tqdm( | |
| # pool.imap(process_audio_file, process_args, chunksize=args.chunksize), | |
| # total=len(filenames_to_process), | |
| # desc="Processing audio files" | |
| # )) | |
| # # Count results | |
| # processed = sum(1 for r in results if r[0] == "success") | |
| # skipped = sum(1 for r in results if r[0] == "skipped") | |
| # errors = sum(1 for r in results if r[0] == "error") | |
| # print(f"\nProcessing complete!") | |
| # print(f" Successfully processed: {processed}") | |
| # print(f" Previously processed: {len(already_processed)}") | |
| # print(f" Skipped (race condition): {skipped}") | |
| # print(f" Errors: {errors}") | |
| # # Show errors if any | |
| # if errors > 0: | |
| # print("\nErrors encountered:") | |
| # error_log_path = os.path.join(args.output_dir, "processing_errors.log") | |
| # with open(error_log_path, 'w') as f: | |
| # for r in results: | |
| # if r[0] == "error": | |
| # error_msg = f"{r[1]}: {r[2]}" | |
| # print(f" {error_msg}") | |
| # f.write(error_msg + "\n") | |
| # print(f"\nError log saved to: {error_log_path}") | |
| # # Show summary of all processed files | |
| # total_processed_files = len(list(Path(args.output_dir).glob("*.pt"))) | |
| # print(f"\nTotal .pt files in {args.output_dir}: {total_processed_files}") | |
| # if __name__ == "__main__": | |
| # parser = argparse.ArgumentParser( | |
| # description="Process audio files using Higgs Audio Tokenizer and save as .pt files" | |
| # ) | |
| # parser.add_argument( | |
| # "--dataset-path", | |
| # type=str, | |
| # default=DEFAULT_DATASET_PATH, | |
| # help=f"Path to the dataset (default: {DEFAULT_DATASET_PATH})" | |
| # ) | |
| # parser.add_argument( | |
| # "--output-dir", | |
| # type=str, | |
| # default=DEFAULT_OUTPUT_DIR, | |
| # help=f"Output directory for .pt files (default: {DEFAULT_OUTPUT_DIR})" | |
| # ) | |
| # parser.add_argument( | |
| # "--num-cores", | |
| # type=int, | |
| # default=DEFAULT_NUM_CORES, | |
| # help=f"Number of CPU cores to use (default: {DEFAULT_NUM_CORES})" | |
| # ) | |
| # parser.add_argument( | |
| # "--sample-rate", | |
| # type=int, | |
| # default=DEFAULT_SAMPLE_RATE, | |
| # help=f"Sample rate for audio processing (default: {DEFAULT_SAMPLE_RATE})" | |
| # ) | |
| # parser.add_argument( | |
| # "--chunksize", | |
| # type=int, | |
| # default=1, | |
| # help="Chunksize for multiprocessing pool (default: 1)" | |
| # ) | |
| # args = parser.parse_args() | |
| # # Run main processing | |
| # try: | |
| # main(args) | |
| # except KeyboardInterrupt: | |
| # print("\n\nProcessing interrupted by user") | |
| # sys.exit(1) | |
| # except Exception as e: | |
| # print(f"\n\nError: {e}") | |
| # sys.exit(1) | |
| #!/usr/bin/env python3 | |
| """ | |
| GPU Batch Processing Script for Boson Codes with Dataset Loading | |
| """ | |
| import os | |
| import sys | |
| import json | |
| import torch | |
| import torch.nn.functional as F | |
| import librosa | |
| import numpy as np | |
| from pathlib import Path | |
| from tqdm import tqdm | |
| import warnings | |
| from torch.nn.utils import remove_weight_norm, weight_norm | |
| # from boson_multimodal.audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer | |
| # model = load_higgs_audio_tokenizer("bosonai/higgs-audio-v2-tokenizer") | |
| import librosa | |
| import torch | |
| import torch.nn.functional as F | |
| import numpy as np | |
| import json | |
| import torch | |
| from higgs_audio_tokenizer import HiggsAudioTokenizer | |
| # model = load_higgs_audio_tokenizer("bosonai/higgs-audio-v2-tokenizer") | |
| import torch | |
| import torch.nn as nn | |
| import warnings | |
| # Suppress warnings | |
| warnings.filterwarnings('ignore') | |
| def remove_weight_norms_from_model(model): | |
| for module in model.modules(): | |
| try: | |
| remove_weight_norm(module) | |
| except: | |
| continue | |
| return model | |
| class EncodedResult: | |
| def __init__(self, audio_codes): | |
| self.audio_codes = audio_codes | |
| def encode_batch(model, x_batch): | |
| """ | |
| Encodes a batch of audio tensors using the HiggsAudioTokenizer model. | |
| Args: | |
| model: The loaded HiggsAudioTokenizer model. | |
| x_batch: A tensor of shape [B, 1, T] | |
| """ | |
| # Acoustic and Semantic Feature Extraction | |
| e_semantic_input = model.get_regress_target(x_batch).detach() | |
| e_semantic = model.encoder_semantic(e_semantic_input.transpose(1, 2)) | |
| e_acoustic = model.encoder(x_batch) | |
| # This block contains the fix for batch processing | |
| if e_acoustic.shape[2] != e_semantic.shape[2]: | |
| pad_size = 160 * model.semantic_downsample_factor | |
| # 1. Remove channel dim, preserving batch dim -> [B, T] | |
| x_slice = x_batch[:, 0, :] | |
| # 2. Pad the tensor | |
| x_padded = F.pad(x_slice, (pad_size, pad_size)) | |
| # 3. Re-add channel dim before passing to encoder -> [B, 1, T_padded] | |
| e_acoustic = model.encoder(x_padded.unsqueeze(1)) | |
| # Ensure dimensions match before concatenating | |
| min_len = min(e_acoustic.shape[2], e_semantic.shape[2]) | |
| e_acoustic = e_acoustic[:, :, :min_len] | |
| e_semantic = e_semantic[:, :, :min_len] | |
| # Remainder of the original encoding logic | |
| e = torch.cat([e_acoustic, e_semantic], dim=1) | |
| e = model.fc_prior(e.transpose(1, 2)) | |
| if model.quantizer_type == "RVQ": | |
| e = e.transpose(1, 2) | |
| _, codes, _, _ = model.quantizer(e, model.frame_rate, None) | |
| codes = codes.permute(1, 0, 2) | |
| else: # RFSQ | |
| quantized, codes = model.quantizer(e) | |
| codes = codes.permute(0, 2, 1) | |
| return EncodedResult(audio_codes=codes) | |
| def fix_all_inference_issues(model): | |
| """ | |
| Comprehensive fix for all potential inference issues | |
| """ | |
| device = next(model.parameters()).device | |
| # 1. Force everything to eval mode | |
| model.eval() | |
| with torch.no_grad(): | |
| for module in model.modules(): | |
| if isinstance(module, nn.Module): | |
| module.eval() | |
| if hasattr(module, 'training'): | |
| module.training = False | |
| # 2. Fix semantic model specifically | |
| if hasattr(model, 'semantic_model'): | |
| print("Fixing semantic model...") | |
| # Move to correct device | |
| model.semantic_model = model.semantic_model.to(device) | |
| model.semantic_model.eval() | |
| # Disable ALL gradient checkpointing | |
| def disable_gradient_checkpointing(module): | |
| if hasattr(module, 'gradient_checkpointing'): | |
| module.gradient_checkpointing = False | |
| if hasattr(module, 'gradient_checkpointing_disable'): | |
| try: | |
| module.gradient_checkpointing_disable() | |
| except: | |
| pass | |
| for child in module.children(): | |
| disable_gradient_checkpointing(child) | |
| disable_gradient_checkpointing(model.semantic_model) | |
| # For HuBERT specifically | |
| if hasattr(model.semantic_model, 'encoder'): | |
| model.semantic_model.encoder.gradient_checkpointing = False | |
| if hasattr(model.semantic_model.encoder, 'layers'): | |
| for layer in model.semantic_model.encoder.layers: | |
| if hasattr(layer, 'gradient_checkpointing'): | |
| layer.gradient_checkpointing = False | |
| # 3. Set all dropout to eval mode | |
| def set_dropout_eval(module): | |
| if isinstance(module, nn.Dropout): | |
| module.eval() | |
| module.training = False | |
| for child in module.children(): | |
| set_dropout_eval(child) | |
| set_dropout_eval(model) | |
| # 4. Clear any cached computations | |
| torch.cuda.empty_cache() if torch.cuda.is_available() else None | |
| return model | |
| def inference_pipeline(checkpoint_path, config_path, device='cuda'): | |
| """ | |
| Complete pipeline for inference with your trained model | |
| """ | |
| # Load config | |
| print("Loading config...") | |
| with open(config_path, 'r') as f: | |
| config = json.load(f) | |
| # Create model | |
| print("Creating model...") | |
| model = HiggsAudioTokenizer( | |
| n_filters=config['n_filters'], | |
| D=config['D'], | |
| target_bandwidths=config['target_bandwidths'], | |
| ratios=config['ratios'], | |
| sample_rate=config['sample_rate'], | |
| bins=config['bins'], | |
| n_q=config['n_q'], | |
| codebook_dim=config.get('codebook_dim', None), | |
| semantic_techer=config['semantic_techer'], | |
| device=device | |
| ).to(device) | |
| # Load checkpoint | |
| print("Loading checkpoint...") | |
| checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False) | |
| if 'model_state_dict' in checkpoint: | |
| state_dict = checkpoint['model_state_dict'] | |
| else: | |
| state_dict = checkpoint | |
| # Remove 'module.' prefix if present (from DDP) | |
| new_state_dict = {} | |
| for k, v in state_dict.items(): | |
| if k.startswith('module.'): | |
| new_state_dict[k[7:]] = v | |
| else: | |
| new_state_dict[k] = v | |
| model.load_state_dict(new_state_dict, strict=False) | |
| # Fix all inference issues | |
| print("Fixing inference issues...") | |
| model = fix_all_inference_issues(model) | |
| return model | |
| # # Add paths | |
| # sys.path.insert(0, "/home/ubuntu/AP-BWE") | |
| # Suppress warnings | |
| warnings.filterwarnings("ignore") | |
| # Configuration | |
| OUTPUT_DIR = "/home/ubuntu/data_boson_44.1khz" | |
| BATCH_SIZE = 32 | |
| SAMPLE_RATE = 44100 | |
| DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' | |
| DATASET_PATH = "/home/ubuntu/ttsar/Layla/src_bpe_2/Qanary_data" | |
| # # Model paths | |
| # CONFIG_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/config.json" | |
| # MODEL_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/model.pth" | |
| # --- Setup --- | |
| print(f"Using device: {DEVICE}") | |
| # Change to working directory | |
| os.chdir("/home/ubuntu/ttsar/boson_audio_codec/audio_processing") | |
| # Load dataset | |
| from datasets import load_from_disk | |
| print(f"Loading dataset from: {DATASET_PATH}") | |
| ds = load_from_disk(DATASET_PATH) | |
| print(f"Dataset info: {ds}") | |
| # Remove unnecessary columns | |
| columns_to_remove = ['spk', 'duration', 'codes', 'input_ids', 'attention_mask'] | |
| existing_columns = [col for col in columns_to_remove if col in ds.column_names] | |
| if existing_columns: | |
| ds = ds.remove_columns(existing_columns) | |
| df = ds.to_pandas() | |
| print(f"Loaded {len(df)} files from dataset") | |
| os.makedirs(OUTPUT_DIR, exist_ok=True) | |
| print(f"Output directory '{OUTPUT_DIR}' is ready.") | |
| # --- Filter already processed --- | |
| print("Checking for already processed files...") | |
| def get_output_path(audio_path): | |
| base_name = Path(audio_path).stem | |
| return os.path.join(OUTPUT_DIR, f"{base_name}.pt") | |
| # Filter | |
| original_count = len(df) | |
| df['output_exists'] = df['filename'].apply(lambda x: os.path.exists(get_output_path(x))) | |
| df_filtered = df[~df['output_exists']].copy() | |
| skipped_count = original_count - len(df_filtered) | |
| print(f"Found {skipped_count} already processed files. Skipping them.") | |
| print(f"Processing {len(df_filtered)} remaining files.") | |
| if len(df_filtered) == 0: | |
| print("All files have already been processed!") | |
| exit() | |
| # --- Load Model --- | |
| print("Loading Higgs Audio Tokenizer model...") | |
| from transformers import HubertModel | |
| from higgs_audio_tokenizer import HiggsAudioTokenizer | |
| # Load config | |
| # with open(CONFIG_PATH, 'r') as f: | |
| # config = json.load(f) | |
| # # Initialize model | |
| # model = HiggsAudioTokenizer( | |
| # **config, | |
| # device=DEVICE, | |
| # ) | |
| # Load weights | |
| # parameter_dict = torch.load(MODEL_PATH, map_location=DEVICE) | |
| # _ = model.load_state_dict(parameter_dict, strict=False) | |
| # model = model.to(DEVICE) | |
| # _ = model.eval() | |
| checkpoint_path = '/home/ubuntu/ttsar/boson_audio_codec/audio_processing/outputs_CQT/checkpoints/step_99000.pth' | |
| config_path = '/home/ubuntu/ttsar/boson_audio_codec/audio_processing/config copy.json' | |
| device = 'cuda' | |
| model = inference_pipeline(checkpoint_path, config_path, device) | |
| _ = model.eval() | |
| model = remove_weight_norms_from_model(model) | |
| print(f"Model loaded on {DEVICE}") | |
| # Get hop length | |
| hop_length = model.hop_length | |
| print(f"Encoder hop length: {hop_length}") | |
| # --- Batch Processing --- | |
| print(f"\nStarting batch processing with batch size {BATCH_SIZE}...") | |
| # Process in batches | |
| filenames = df_filtered['filename'].tolist() | |
| total_processed = 0 | |
| total_errors = 0 | |
| with torch.no_grad(): | |
| for batch_start in tqdm(range(0, len(filenames), BATCH_SIZE), desc="Processing batches"): | |
| batch_end = min(batch_start + BATCH_SIZE, len(filenames)) | |
| batch_filenames = filenames[batch_start:batch_end] | |
| batch_audio = [] | |
| batch_lengths = [] | |
| batch_outputs = [] | |
| # Load batch | |
| for filename in batch_filenames: | |
| output_path = get_output_path(filename) | |
| # Skip if exists (race condition check) | |
| if os.path.exists(output_path): | |
| continue | |
| try: | |
| # Load audio | |
| wav, _ = librosa.load(filename, sr=SAMPLE_RATE) | |
| wav_tensor = torch.from_numpy(wav).float() | |
| batch_audio.append(wav_tensor) | |
| batch_lengths.append(len(wav)) | |
| batch_outputs.append(output_path) | |
| except Exception as e: | |
| print(f"\nError loading {filename}: {e}") | |
| total_errors += 1 | |
| continue | |
| if not batch_audio: | |
| continue | |
| # Pad batch to same length | |
| max_len = max(len(x) for x in batch_audio) | |
| padded_batch = [] | |
| for audio in batch_audio: | |
| pad_len = max_len - len(audio) | |
| if pad_len > 0: | |
| audio = F.pad(audio, (0, pad_len), mode='constant', value=0) | |
| # Don't add extra dimensions here, just collect the padded audio | |
| padded_batch.append(audio) | |
| # Convert list to tensor and add channel dimension | |
| # Stack along batch dimension to get [B, T] | |
| batch_tensor = torch.stack(padded_batch, dim=0) # [B, T] | |
| # Add channel dimension | |
| batch_tensor = batch_tensor.unsqueeze(1) # [B, 1, T] | |
| batch_tensor = batch_tensor.to(DEVICE) | |
| # Encode batch | |
| try: | |
| encoded = encode_batch(model, batch_tensor) | |
| codes = encoded.audio_codes # [B, n_codebooks, T_compressed] | |
| # Save each item | |
| for idx, (output_path, orig_len) in enumerate(zip(batch_outputs, batch_lengths)): | |
| # Calculate true code length | |
| true_code_len = int(np.ceil(orig_len / hop_length)) | |
| # Extract non-padded codes | |
| item_codes = codes[idx, :, :true_code_len].cpu() | |
| # Save | |
| torch.save(item_codes, output_path) | |
| total_processed += 1 | |
| except Exception as e: | |
| print(f"\nError encoding batch: {e}") | |
| total_errors += len(batch_outputs) | |
| print("\n" + "="*50) | |
| print("PROCESSING COMPLETE!") | |
| print("="*50) | |
| print(f"Successfully processed: {total_processed} files") | |
| print(f"Previously processed: {skipped_count} files") | |
| print(f"Errors encountered: {total_errors} files") | |
| print(f"Output directory: {OUTPUT_DIR}") | |
| # Final count | |
| final_count = len(list(Path(OUTPUT_DIR).glob("*.pt"))) | |
| print(f"Total .pt files in output: {final_count}") |