# #!/usr/bin/env python3
# """
# Audio Processing Script for Boson Codes
# Processes audio files in parallel using Higgs Audio Tokenizer
# and saves encoded representations as .pt files.
# """

# import os
# import sys
# import json
# import torch
# import librosa
# import numpy as np
# import warnings
# import argparse
# from pathlib import Path
# from multiprocessing import Pool
# from tqdm import tqdm

# from datasets import load_from_disk
# from higgs_audio_tokenizer import HiggsAudioTokenizer

# # Suppress PyTorch FutureWarnings
# warnings.filterwarnings("ignore", category=FutureWarning)

# # Global configuration
# DEFAULT_OUTPUT_DIR = "/home/ubuntu/boson_codes"
# DEFAULT_NUM_CORES = 48
# DEFAULT_SAMPLE_RATE = 44100
# DEFAULT_DATASET_PATH = "/home/ubuntu/ttsar/Layla/src_bpe_2/data"

# # Model paths
# CONFIG_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/config.json"
# MODEL_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/model.pth"

# # Global model variable (initialized in each worker)
# model = None


# def init_worker():
#     """Initialize model once per worker process."""
#     global model
#     device = 'cpu'
    
#     # Load config
#     with open(CONFIG_PATH, 'r') as f:
#         config = json.load(f)
    
#     # Initialize model
#     model = HiggsAudioTokenizer(
#         **config,
#         device=device,
#     )
    
#     # Load weights
#     parameter_dict = torch.load(MODEL_PATH, map_location=device)
#     _ = model.load_state_dict(parameter_dict, strict=False)
#     model = model.to(device)
#     _ = model.eval()
    
#     print(f"Model loaded in worker {os.getpid()}")


# def process_audio_file(args):
#     """Process a single audio file using pre-loaded model."""
#     filename, output_dir, sample_rate = args
    
#     try:
#         # Output filename - same name, just change extension to .pt
#         base_name = Path(filename).stem
#         output_path = os.path.join(output_dir, f"{base_name}.pt")
        
#         # Skip if exists (double-check in case of race conditions)
#         if os.path.exists(output_path):
#             return ("skipped", filename)
        
#         # Load and process audio
#         wav, sr = librosa.load(filename, sr=sample_rate)
#         wav = torch.from_numpy(wav).unsqueeze(0).float().to('cpu')
        
#         # Encode using the pre-loaded model
#         with torch.no_grad():
#             encoded = model._xcodec_encode(wav.unsqueeze(0))
        
#         # Save codes only
#         torch.save(encoded.audio_codes, output_path)
        
#         return ("success", filename)
        
#     except Exception as e:
#         return ("error", filename, str(e))


# def load_dataset(dataset_path):
#     """Load and prepare the dataset."""
#     print(f"Loading dataset from: {dataset_path}")
#     ds = load_from_disk(dataset_path)
#     print(f"Dataset info: {ds}")
    
#     # Remove unnecessary columns
#     columns_to_remove = ['spk', 'duration', 'codes', 'input_ids', 'attention_mask']
#     existing_columns = [col for col in columns_to_remove if col in ds.column_names]
#     if existing_columns:
#         ds = ds.remove_columns(existing_columns)
#         print(f"Removed columns: {existing_columns}")
    
#     # Convert to pandas DataFrame
#     df = ds.to_pandas()
#     print(f"Loaded {len(df)} files from dataset")
#     return df


# def main(args):
#     """Main processing function."""
#     # Change to audio processing directory
#     os.chdir("/home/ubuntu/ttsar/boson_audio_codec/audio_processing")
#     print(f"Working directory: {os.getcwd()}")
    
#     # Create output directory
#     os.makedirs(args.output_dir, exist_ok=True)
#     print(f"Output directory: {args.output_dir}")
    
#     # Check if model files exist
#     if not os.path.exists(CONFIG_PATH):
#         print(f"Error: Config file not found at {CONFIG_PATH}")
#         sys.exit(1)
#     if not os.path.exists(MODEL_PATH):
#         print(f"Error: Model file not found at {MODEL_PATH}")
#         sys.exit(1)
    
#     # Load dataset
#     df = load_dataset(args.dataset_path)
    
#     # Get filenames from dataframe
#     all_filenames = df['filename'].tolist()
    
#     # Pre-filter to exclude already processed files
#     filenames_to_process = []
#     already_processed = []
    
#     print(f"\nChecking for already processed files...")
#     for filename in all_filenames:
#         base_name = Path(filename).stem
#         output_path = os.path.join(args.output_dir, f"{base_name}.pt")
#         if os.path.exists(output_path):
#             already_processed.append(filename)
#         else:
#             filenames_to_process.append(filename)
    
#     print(f"\nTotal files: {len(all_filenames)}")
#     print(f"Already processed: {len(already_processed)}")
#     print(f"To process: {len(filenames_to_process)}")
    
#     if len(filenames_to_process) == 0:
#         print("\nAll files have already been processed!")
#         return
    
#     print(f"\nProcessing {len(filenames_to_process)} files using {args.num_cores} cores...")
#     print(f"Sample rate: {args.sample_rate} Hz")
    
#     # Prepare arguments for multiprocessing
#     process_args = [(filename, args.output_dir, args.sample_rate) 
#                     for filename in filenames_to_process]
    
#     # Process in parallel with model reuse
#     with Pool(processes=args.num_cores, initializer=init_worker) as pool:
#         results = list(tqdm(
#             pool.imap(process_audio_file, process_args, chunksize=args.chunksize),
#             total=len(filenames_to_process),
#             desc="Processing audio files"
#         ))
    
#     # Count results
#     processed = sum(1 for r in results if r[0] == "success")
#     skipped = sum(1 for r in results if r[0] == "skipped")
#     errors = sum(1 for r in results if r[0] == "error")
    
#     print(f"\nProcessing complete!")
#     print(f"  Successfully processed: {processed}")
#     print(f"  Previously processed: {len(already_processed)}")
#     print(f"  Skipped (race condition): {skipped}")
#     print(f"  Errors: {errors}")
    
#     # Show errors if any
#     if errors > 0:
#         print("\nErrors encountered:")
#         error_log_path = os.path.join(args.output_dir, "processing_errors.log")
#         with open(error_log_path, 'w') as f:
#             for r in results:
#                 if r[0] == "error":
#                     error_msg = f"{r[1]}: {r[2]}"
#                     print(f"  {error_msg}")
#                     f.write(error_msg + "\n")
#         print(f"\nError log saved to: {error_log_path}")
    
#     # Show summary of all processed files
#     total_processed_files = len(list(Path(args.output_dir).glob("*.pt")))
#     print(f"\nTotal .pt files in {args.output_dir}: {total_processed_files}")


# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(
#         description="Process audio files using Higgs Audio Tokenizer and save as .pt files"
#     )
    
#     parser.add_argument(
#         "--dataset-path", 
#         type=str, 
#         default=DEFAULT_DATASET_PATH,
#         help=f"Path to the dataset (default: {DEFAULT_DATASET_PATH})"
#     )
    
#     parser.add_argument(
#         "--output-dir", 
#         type=str, 
#         default=DEFAULT_OUTPUT_DIR,
#         help=f"Output directory for .pt files (default: {DEFAULT_OUTPUT_DIR})"
#     )
    
#     parser.add_argument(
#         "--num-cores", 
#         type=int, 
#         default=DEFAULT_NUM_CORES,
#         help=f"Number of CPU cores to use (default: {DEFAULT_NUM_CORES})"
#     )
    
#     parser.add_argument(
#         "--sample-rate", 
#         type=int, 
#         default=DEFAULT_SAMPLE_RATE,
#         help=f"Sample rate for audio processing (default: {DEFAULT_SAMPLE_RATE})"
#     )
    
#     parser.add_argument(
#         "--chunksize", 
#         type=int, 
#         default=1,
#         help="Chunksize for multiprocessing pool (default: 1)"
#     )
    
#     args = parser.parse_args()
    
#     # Run main processing
#     try:
#         main(args)
#     except KeyboardInterrupt:
#         print("\n\nProcessing interrupted by user")
#         sys.exit(1)
#     except Exception as e:
#         print(f"\n\nError: {e}")
#         sys.exit(1)

#!/usr/bin/env python3
"""
GPU Batch Processing Script for Boson Codes with Dataset Loading
"""

import os
import sys
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import numpy as np
from pathlib import Path
from tqdm import tqdm
import warnings
from torch.nn.utils import remove_weight_norm, weight_norm

from datasets import load_from_disk
from higgs_audio_tokenizer import HiggsAudioTokenizer

# Alternative loader for the pretrained tokenizer (kept for reference):
# from boson_multimodal.audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer
# model = load_higgs_audio_tokenizer("bosonai/higgs-audio-v2-tokenizer")

# Suppress warnings
warnings.filterwarnings('ignore')

def remove_weight_norms_from_model(model):
    """Strip the weight-norm reparametrization from every module that has it."""
    for module in model.modules():
        try:
            remove_weight_norm(module)
        except ValueError:
            # Raised when the module has no weight norm applied; skip it.
            continue
    return model
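
# Minimal sanity check for the helper above (a sketch on a hypothetical layer;
# removing weight norm folds the weight_g/weight_v parametrization back into a
# plain .weight tensor, avoiding the reparametrization work at inference):
#
#   conv = weight_norm(nn.Conv1d(1, 8, kernel_size=3))
#   conv = remove_weight_norms_from_model(conv)
#   assert not hasattr(conv, 'weight_g')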


class EncodedResult:
    """Lightweight container mirroring the tokenizer's encoded output."""
    def __init__(self, audio_codes):
        self.audio_codes = audio_codes

def encode_batch(model, x_batch):
    """
    Encodes a batch of audio tensors using the HiggsAudioTokenizer model.

    Args:
        model: The loaded HiggsAudioTokenizer model.
        x_batch: A tensor of shape [B, 1, T].

    Returns:
        EncodedResult whose audio_codes has shape [B, n_q, T_codes].
    """
    # Acoustic and Semantic Feature Extraction
    e_semantic_input = model.get_regress_target(x_batch).detach()
    e_semantic = model.encoder_semantic(e_semantic_input.transpose(1, 2))
    e_acoustic = model.encoder(x_batch)

    # This block contains the fix for batch processing
    if e_acoustic.shape[2] != e_semantic.shape[2]:
        pad_size = 160 * model.semantic_downsample_factor
        
        # 1. Remove channel dim, preserving batch dim -> [B, T]
        x_slice = x_batch[:, 0, :]
        
        # 2. Pad the tensor
        x_padded = F.pad(x_slice, (pad_size, pad_size))
        
        # 3. Re-add channel dim before passing to encoder -> [B, 1, T_padded]
        e_acoustic = model.encoder(x_padded.unsqueeze(1))

    # Ensure dimensions match before concatenating
    min_len = min(e_acoustic.shape[2], e_semantic.shape[2])
    e_acoustic = e_acoustic[:, :, :min_len]
    e_semantic = e_semantic[:, :, :min_len]

    # Remainder of the original encoding logic
    e = torch.cat([e_acoustic, e_semantic], dim=1)
    e = model.fc_prior(e.transpose(1, 2))

    if model.quantizer_type == "RVQ":
        e = e.transpose(1, 2)
        _, codes, _, _ = model.quantizer(e, model.frame_rate, None)
        codes = codes.permute(1, 0, 2)
    else: # RFSQ
        quantized, codes = model.quantizer(e)
        codes = codes.permute(0, 2, 1)

    return EncodedResult(audio_codes=codes)
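
# Hedged smoke test for encode_batch (commented out; assumes `model` is already
# loaded on DEVICE, and exact code shapes depend on the checkpoint's quantizer):
#
#   dummy = torch.zeros(2, 1, SAMPLE_RATE).to(DEVICE)   # two 1-second clips
#   with torch.no_grad():
#       out = encode_batch(model, dummy)
#   print(out.audio_codes.shape)                        # expected [2, n_q, T_codes]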


def fix_all_inference_issues(model):
    """
    Comprehensive fix for all potential inference issues
    """
    device = next(model.parameters()).device
    
    # 1. Force everything to eval mode
    model.eval()
    with torch.no_grad():
        for module in model.modules():
            if isinstance(module, nn.Module):
                module.eval()
                if hasattr(module, 'training'):
                    module.training = False
    
    # 2. Fix semantic model specifically
    if hasattr(model, 'semantic_model'):
        print("Fixing semantic model...")
        
        # Move to correct device
        model.semantic_model = model.semantic_model.to(device)
        model.semantic_model.eval()
        
        # Disable ALL gradient checkpointing
        def disable_gradient_checkpointing(module):
            if hasattr(module, 'gradient_checkpointing'):
                module.gradient_checkpointing = False
            if hasattr(module, 'gradient_checkpointing_disable'):
                try:
                    module.gradient_checkpointing_disable()
                except Exception:
                    # Some transformers versions raise here; safe to ignore.
                    pass
            for child in module.children():
                disable_gradient_checkpointing(child)
        
        disable_gradient_checkpointing(model.semantic_model)
        
        # For HuBERT specifically
        if hasattr(model.semantic_model, 'encoder'):
            model.semantic_model.encoder.gradient_checkpointing = False
            if hasattr(model.semantic_model.encoder, 'layers'):
                for layer in model.semantic_model.encoder.layers:
                    if hasattr(layer, 'gradient_checkpointing'):
                        layer.gradient_checkpointing = False
    
    # 3. Set all dropout to eval mode
    def set_dropout_eval(module):
        if isinstance(module, nn.Dropout):
            module.eval()
            module.training = False
        for child in module.children():
            set_dropout_eval(child)
    
    set_dropout_eval(model)
    
    # 4. Clear any cached computations
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    return model
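
# Quick post-fix check (a sketch; optional): after fix_all_inference_issues,
# no submodule should still report training mode, so dropout and gradient
# checkpointing stay off at inference:
#
#   assert not any(m.training for m in model.modules())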

def inference_pipeline(checkpoint_path, config_path, device='cuda'):
    """
    Complete pipeline for inference with your trained model
    """
    # Load config
    print("Loading config...")
    with open(config_path, 'r') as f:
        config = json.load(f)
    
    # Create model
    print("Creating model...")
    model = HiggsAudioTokenizer(
        n_filters=config['n_filters'],
        D=config['D'],
        target_bandwidths=config['target_bandwidths'],
        ratios=config['ratios'],
        sample_rate=config['sample_rate'],
        bins=config['bins'],
        n_q=config['n_q'],
        codebook_dim=config.get('codebook_dim', None),
        semantic_techer=config['semantic_techer'],
        device=device
    ).to(device)
    
    # Load checkpoint
    print("Loading checkpoint...")
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    
    if 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
    else:
        state_dict = checkpoint
    
    # Remove 'module.' prefix if present (from DDP)
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith('module.'):
            new_state_dict[k[7:]] = v
        else:
            new_state_dict[k] = v
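
    # Example of the renaming above (hypothetical key from a DDP-saved checkpoint):
    #   'module.encoder.conv.weight' -> 'encoder.conv.weight'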
    
    model.load_state_dict(new_state_dict, strict=False)
    
    # Fix all inference issues
    print("Fixing inference issues...")
    model = fix_all_inference_issues(model)

    return model



# # Add paths
# sys.path.insert(0, "/home/ubuntu/AP-BWE")

# Configuration
OUTPUT_DIR = "/home/ubuntu/data_boson_44.1khz"
BATCH_SIZE = 32
SAMPLE_RATE = 44100
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DATASET_PATH = "/home/ubuntu/ttsar/Layla/src_bpe_2/Qanary_data"

# # Model paths
# CONFIG_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/config.json"
# MODEL_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/model.pth"

# --- Setup ---
print(f"Using device: {DEVICE}")

# Change to working directory
os.chdir("/home/ubuntu/ttsar/boson_audio_codec/audio_processing")

# Load dataset
print(f"Loading dataset from: {DATASET_PATH}")
ds = load_from_disk(DATASET_PATH)
print(f"Dataset info: {ds}")

# Remove unnecessary columns
columns_to_remove = ['spk', 'duration', 'codes', 'input_ids', 'attention_mask']
existing_columns = [col for col in columns_to_remove if col in ds.column_names]
if existing_columns:
    ds = ds.remove_columns(existing_columns)

df = ds.to_pandas()
print(f"Loaded {len(df)} files from dataset")

os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory '{OUTPUT_DIR}' is ready.")

# --- Filter already processed ---
print("Checking for already processed files...")

def get_output_path(audio_path):
    base_name = Path(audio_path).stem
    return os.path.join(OUTPUT_DIR, f"{base_name}.pt")
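
# Example (hypothetical input path):
#   "/data/wavs/clip_001.wav" -> "/home/ubuntu/data_boson_44.1khz/clip_001.pt"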

# Filter
original_count = len(df)
df['output_exists'] = df['filename'].apply(lambda x: os.path.exists(get_output_path(x)))
df_filtered = df[~df['output_exists']].copy()
skipped_count = original_count - len(df_filtered)

print(f"Found {skipped_count} already processed files. Skipping them.")
print(f"Processing {len(df_filtered)} remaining files.")

if len(df_filtered) == 0:
    print("All files have already been processed!")
    exit()

# --- Load Model ---
print("Loading Higgs Audio Tokenizer model...")

# Load config
# with open(CONFIG_PATH, 'r') as f:
#     config = json.load(f)

# # Initialize model
# model = HiggsAudioTokenizer(
#     **config,
#     device=DEVICE,
# )

# Load weights
# parameter_dict = torch.load(MODEL_PATH, map_location=DEVICE)
# _ = model.load_state_dict(parameter_dict, strict=False)
# model = model.to(DEVICE)
# _ = model.eval()


checkpoint_path = '/home/ubuntu/ttsar/boson_audio_codec/audio_processing/outputs_CQT/checkpoints/step_99000.pth'
config_path = '/home/ubuntu/ttsar/boson_audio_codec/audio_processing/config copy.json'
model = inference_pipeline(checkpoint_path, config_path, DEVICE)
_ = model.eval()

model = remove_weight_norms_from_model(model)

print(f"Model loaded on {DEVICE}")

# Get hop length
hop_length = model.hop_length
print(f"Encoder hop length: {hop_length}")

# --- Batch Processing ---
print(f"\nStarting batch processing with batch size {BATCH_SIZE}...")

# Process in batches
filenames = df_filtered['filename'].tolist()
total_processed = 0
total_errors = 0

with torch.no_grad():
    for batch_start in tqdm(range(0, len(filenames), BATCH_SIZE), desc="Processing batches"):
        batch_end = min(batch_start + BATCH_SIZE, len(filenames))
        batch_filenames = filenames[batch_start:batch_end]
        
        batch_audio = []
        batch_lengths = []
        batch_outputs = []
        
        # Load batch
        for filename in batch_filenames:
            output_path = get_output_path(filename)
            
            # Skip if exists (race condition check)
            if os.path.exists(output_path):
                continue
            
            try:
                # Load audio
                wav, _ = librosa.load(filename, sr=SAMPLE_RATE)
                wav_tensor = torch.from_numpy(wav).float()
                
                batch_audio.append(wav_tensor)
                batch_lengths.append(len(wav))
                batch_outputs.append(output_path)
                
            except Exception as e:
                print(f"\nError loading {filename}: {e}")
                total_errors += 1
                continue
        
        if not batch_audio:
            continue
        
        # Pad batch to same length
        max_len = max(len(x) for x in batch_audio)
        padded_batch = []
        
        for audio in batch_audio:
            pad_len = max_len - len(audio)
            if pad_len > 0:
                audio = F.pad(audio, (0, pad_len), mode='constant', value=0)
            # Don't add extra dimensions here, just collect the padded audio
            padded_batch.append(audio)
        
        # Convert list to tensor and add channel dimension
        # Stack along batch dimension to get [B, T]
        batch_tensor = torch.stack(padded_batch, dim=0)  # [B, T]
        # Add channel dimension
        batch_tensor = batch_tensor.unsqueeze(1)  # [B, 1, T]
        batch_tensor = batch_tensor.to(DEVICE)
        
        # Encode batch
        try:
            encoded = encode_batch(model, batch_tensor)
            codes = encoded.audio_codes  # [B, n_codebooks, T_compressed]
            
            # Save each item
            for idx, (output_path, orig_len) in enumerate(zip(batch_outputs, batch_lengths)):
                # Calculate true code length
                true_code_len = int(np.ceil(orig_len / hop_length))
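                # Worked example (hypothetical numbers): with hop_length = 512,
                # a 1 s clip at 44.1 kHz has orig_len = 44100 samples, and
                # ceil(44100 / 512) = 87 code frames; padding beyond that is
                # sliced off below.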
                
                # Extract non-padded codes
                item_codes = codes[idx, :, :true_code_len].cpu()
                
                # Save
                torch.save(item_codes, output_path)
                total_processed += 1
                
        except Exception as e:
            print(f"\nError encoding batch: {e}")
            total_errors += len(batch_outputs)

print("\n" + "="*50)
print("PROCESSING COMPLETE!")
print("="*50)
print(f"Successfully processed: {total_processed} files")
print(f"Previously processed: {skipped_count} files")
print(f"Errors encountered: {total_errors} files")
print(f"Output directory: {OUTPUT_DIR}")

# Final count
final_count = len(list(Path(OUTPUT_DIR).glob("*.pt")))
print(f"Total .pt files in output: {final_count}")