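# ---------------------------------------------------------------------------
# Legacy CPU / multiprocessing version of this script, kept commented out for
# reference. The active GPU batch-processing script begins after it.
# ---------------------------------------------------------------------------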
# #!/usr/bin/env python3
# """
# Audio Processing Script for Boson Codes
# Processes audio files in parallel using Higgs Audio Tokenizer
# and saves encoded representations as .pt files.
# """
# import os
# import sys
# import json
# import torch
# import librosa
# import numpy as np
# import warnings
# import argparse
# from pathlib import Path
# from multiprocessing import Pool
# from tqdm import tqdm
# from datasets import load_from_disk
# from higgs_audio_tokenizer import HiggsAudioTokenizer
# # Suppress PyTorch FutureWarnings
# warnings.filterwarnings("ignore", category=FutureWarning)
# # Global configuration
# DEFAULT_OUTPUT_DIR = "/home/ubuntu/boson_codes"
# DEFAULT_NUM_CORES = 48
# DEFAULT_SAMPLE_RATE = 44100
# DEFAULT_DATASET_PATH = "/home/ubuntu/ttsar/Layla/src_bpe_2/data"
# # Model paths
# CONFIG_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/config.json"
# MODEL_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/model.pth"
# # Global model variable (initialized in each worker)
# model = None
# def init_worker():
# """Initialize model once per worker process."""
# global model
# device = 'cpu'
# # Load config
# with open(CONFIG_PATH, 'r') as f:
# config = json.load(f)
# # Initialize model
# model = HiggsAudioTokenizer(
# **config,
# device=device,
# )
# # Load weights
# parameter_dict = torch.load(MODEL_PATH, map_location=device)
# _ = model.load_state_dict(parameter_dict, strict=False)
# model = model.to(device)
# _ = model.eval()
# print(f"Model loaded in worker {os.getpid()}")
# def process_audio_file(args):
# """Process a single audio file using pre-loaded model."""
# filename, output_dir, sample_rate = args
# try:
# # Output filename - same name, just change extension to .pt
# base_name = Path(filename).stem
# output_path = os.path.join(output_dir, f"{base_name}.pt")
# # Skip if exists (double-check in case of race conditions)
# if os.path.exists(output_path):
# return ("skipped", filename)
# # Load and process audio
# wav, sr = librosa.load(filename, sr=sample_rate)
# wav = torch.from_numpy(wav).unsqueeze(0).float().to('cpu')
# # Encode using the pre-loaded model
# with torch.no_grad():
# encoded = model._xcodec_encode(wav.unsqueeze(0))
# # Save codes only
# torch.save(encoded.audio_codes, output_path)
# return ("success", filename)
# except Exception as e:
# return ("error", filename, str(e))
# def load_dataset(dataset_path):
# """Load and prepare the dataset."""
# print(f"Loading dataset from: {dataset_path}")
# ds = load_from_disk(dataset_path)
# print(f"Dataset info: {ds}")
# # Remove unnecessary columns
# columns_to_remove = ['spk', 'duration', 'codes', 'input_ids', 'attention_mask']
# existing_columns = [col for col in columns_to_remove if col in ds.column_names]
# if existing_columns:
# ds = ds.remove_columns(existing_columns)
# print(f"Removed columns: {existing_columns}")
# # Convert to pandas DataFrame
# df = ds.to_pandas()
# print(f"Loaded {len(df)} files from dataset")
# return df
# def main(args):
# """Main processing function."""
# # Change to audio processing directory
# os.chdir("/home/ubuntu/ttsar/boson_audio_codec/audio_processing")
# print(f"Working directory: {os.getcwd()}")
# # Create output directory
# os.makedirs(args.output_dir, exist_ok=True)
# print(f"Output directory: {args.output_dir}")
# # Check if model files exist
# if not os.path.exists(CONFIG_PATH):
# print(f"Error: Config file not found at {CONFIG_PATH}")
# sys.exit(1)
# if not os.path.exists(MODEL_PATH):
# print(f"Error: Model file not found at {MODEL_PATH}")
# sys.exit(1)
# # Load dataset
# df = load_dataset(args.dataset_path)
# # Get filenames from dataframe
# all_filenames = df['filename'].tolist()
# # Pre-filter to exclude already processed files
# filenames_to_process = []
# already_processed = []
# print(f"\nChecking for already processed files...")
# for filename in all_filenames:
# base_name = Path(filename).stem
# output_path = os.path.join(args.output_dir, f"{base_name}.pt")
# if os.path.exists(output_path):
# already_processed.append(filename)
# else:
# filenames_to_process.append(filename)
# print(f"\nTotal files: {len(all_filenames)}")
# print(f"Already processed: {len(already_processed)}")
# print(f"To process: {len(filenames_to_process)}")
# if len(filenames_to_process) == 0:
# print("\nAll files have already been processed!")
# return
# print(f"\nProcessing {len(filenames_to_process)} files using {args.num_cores} cores...")
# print(f"Sample rate: {args.sample_rate} Hz")
# # Prepare arguments for multiprocessing
# process_args = [(filename, args.output_dir, args.sample_rate)
# for filename in filenames_to_process]
# # Process in parallel with model reuse
# with Pool(processes=args.num_cores, initializer=init_worker) as pool:
# results = list(tqdm(
# pool.imap(process_audio_file, process_args, chunksize=args.chunksize),
# total=len(filenames_to_process),
# desc="Processing audio files"
# ))
# # Count results
# processed = sum(1 for r in results if r[0] == "success")
# skipped = sum(1 for r in results if r[0] == "skipped")
# errors = sum(1 for r in results if r[0] == "error")
# print(f"\nProcessing complete!")
# print(f" Successfully processed: {processed}")
# print(f" Previously processed: {len(already_processed)}")
# print(f" Skipped (race condition): {skipped}")
# print(f" Errors: {errors}")
# # Show errors if any
# if errors > 0:
# print("\nErrors encountered:")
# error_log_path = os.path.join(args.output_dir, "processing_errors.log")
# with open(error_log_path, 'w') as f:
# for r in results:
# if r[0] == "error":
# error_msg = f"{r[1]}: {r[2]}"
# print(f" {error_msg}")
# f.write(error_msg + "\n")
# print(f"\nError log saved to: {error_log_path}")
# # Show summary of all processed files
# total_processed_files = len(list(Path(args.output_dir).glob("*.pt")))
# print(f"\nTotal .pt files in {args.output_dir}: {total_processed_files}")
# if __name__ == "__main__":
# parser = argparse.ArgumentParser(
# description="Process audio files using Higgs Audio Tokenizer and save as .pt files"
# )
# parser.add_argument(
# "--dataset-path",
# type=str,
# default=DEFAULT_DATASET_PATH,
# help=f"Path to the dataset (default: {DEFAULT_DATASET_PATH})"
# )
# parser.add_argument(
# "--output-dir",
# type=str,
# default=DEFAULT_OUTPUT_DIR,
# help=f"Output directory for .pt files (default: {DEFAULT_OUTPUT_DIR})"
# )
# parser.add_argument(
# "--num-cores",
# type=int,
# default=DEFAULT_NUM_CORES,
# help=f"Number of CPU cores to use (default: {DEFAULT_NUM_CORES})"
# )
# parser.add_argument(
# "--sample-rate",
# type=int,
# default=DEFAULT_SAMPLE_RATE,
# help=f"Sample rate for audio processing (default: {DEFAULT_SAMPLE_RATE})"
# )
# parser.add_argument(
# "--chunksize",
# type=int,
# default=1,
# help="Chunksize for multiprocessing pool (default: 1)"
# )
# args = parser.parse_args()
# # Run main processing
# try:
# main(args)
# except KeyboardInterrupt:
# print("\n\nProcessing interrupted by user")
# sys.exit(1)
# except Exception as e:
# print(f"\n\nError: {e}")
# sys.exit(1)
#!/usr/bin/env python3
"""
GPU Batch Processing Script for Boson Codes with Dataset Loading
"""
import os
import sys
import json
import warnings
from pathlib import Path

import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import remove_weight_norm
from tqdm import tqdm
from datasets import load_from_disk

from higgs_audio_tokenizer import HiggsAudioTokenizer
# Alternative loader from the boson_multimodal package:
# from boson_multimodal.audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer
# model = load_higgs_audio_tokenizer("bosonai/higgs-audio-v2-tokenizer")

# Suppress warnings
warnings.filterwarnings('ignore')

def remove_weight_norms_from_model(model):
    """Strip weight-norm reparameterization from every module that has it.

    Folding weight norm back into plain weights avoids the per-forward
    recomputation and is safe once the model is frozen for inference.
    """
    for module in model.modules():
        try:
            remove_weight_norm(module)
        except ValueError:
            # Module has no weight norm applied; skip it.
            continue
    return model
class EncodedResult:
    """Minimal container mimicking the `.audio_codes` field of the tokenizer's
    native encode output."""

    def __init__(self, audio_codes):
        self.audio_codes = audio_codes
def encode_batch(model, x_batch):
    """
    Encode a batch of audio tensors with the HiggsAudioTokenizer model.

    Args:
        model: The loaded HiggsAudioTokenizer model.
        x_batch: A float tensor of shape [B, 1, T].

    Returns:
        EncodedResult whose `audio_codes` has shape [B, n_codebooks, T_compressed].
    """
# Acoustic and Semantic Feature Extraction
e_semantic_input = model.get_regress_target(x_batch).detach()
e_semantic = model.encoder_semantic(e_semantic_input.transpose(1, 2))
e_acoustic = model.encoder(x_batch)
    # Batch fix: when the acoustic and semantic branches disagree on frame
    # count, pad the waveform symmetrically and re-run the acoustic encoder so
    # both feature streams cover the same time span.
if e_acoustic.shape[2] != e_semantic.shape[2]:
pad_size = 160 * model.semantic_downsample_factor
# 1. Remove channel dim, preserving batch dim -> [B, T]
x_slice = x_batch[:, 0, :]
# 2. Pad the tensor
x_padded = F.pad(x_slice, (pad_size, pad_size))
# 3. Re-add channel dim before passing to encoder -> [B, 1, T_padded]
e_acoustic = model.encoder(x_padded.unsqueeze(1))
# Ensure dimensions match before concatenating
min_len = min(e_acoustic.shape[2], e_semantic.shape[2])
e_acoustic = e_acoustic[:, :, :min_len]
e_semantic = e_semantic[:, :, :min_len]
# Remainder of the original encoding logic
e = torch.cat([e_acoustic, e_semantic], dim=1)
e = model.fc_prior(e.transpose(1, 2))
if model.quantizer_type == "RVQ":
e = e.transpose(1, 2)
_, codes, _, _ = model.quantizer(e, model.frame_rate, None)
codes = codes.permute(1, 0, 2)
else: # RFSQ
quantized, codes = model.quantizer(e)
codes = codes.permute(0, 2, 1)
return EncodedResult(audio_codes=codes)
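
# Usage sketch for encode_batch (illustrative only; assumes `model` is already
# loaded and in eval mode):
#   dummy = torch.randn(2, 1, 44100)  # two one-second clips at 44.1 kHz
#   with torch.no_grad():
#       out = encode_batch(model, dummy.to(next(model.parameters()).device))
#   out.audio_codes.shape  # -> [2, n_codebooks, T_compressed]
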
def fix_all_inference_issues(model):
"""
Comprehensive fix for all potential inference issues
"""
device = next(model.parameters()).device
    # 1. Force everything into eval mode (modules() already yields nn.Modules,
    # and eval() sets .training = False on each one)
    model.eval()
    for module in model.modules():
        module.eval()
# 2. Fix semantic model specifically
if hasattr(model, 'semantic_model'):
print("Fixing semantic model...")
# Move to correct device
model.semantic_model = model.semantic_model.to(device)
model.semantic_model.eval()
# Disable ALL gradient checkpointing
def disable_gradient_checkpointing(module):
if hasattr(module, 'gradient_checkpointing'):
module.gradient_checkpointing = False
if hasattr(module, 'gradient_checkpointing_disable'):
try:
module.gradient_checkpointing_disable()
                except Exception:
                    pass
for child in module.children():
disable_gradient_checkpointing(child)
disable_gradient_checkpointing(model.semantic_model)
# For HuBERT specifically
if hasattr(model.semantic_model, 'encoder'):
model.semantic_model.encoder.gradient_checkpointing = False
if hasattr(model.semantic_model.encoder, 'layers'):
for layer in model.semantic_model.encoder.layers:
if hasattr(layer, 'gradient_checkpointing'):
layer.gradient_checkpointing = False
# 3. Set all dropout to eval mode
def set_dropout_eval(module):
if isinstance(module, nn.Dropout):
module.eval()
module.training = False
for child in module.children():
set_dropout_eval(child)
set_dropout_eval(model)
    # 4. Clear any cached GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
return model
def inference_pipeline(checkpoint_path, config_path, device='cuda'):
"""
Complete pipeline for inference with your trained model
"""
# Load config
print("Loading config...")
with open(config_path, 'r') as f:
config = json.load(f)
# Create model
print("Creating model...")
model = HiggsAudioTokenizer(
n_filters=config['n_filters'],
D=config['D'],
target_bandwidths=config['target_bandwidths'],
ratios=config['ratios'],
sample_rate=config['sample_rate'],
bins=config['bins'],
n_q=config['n_q'],
codebook_dim=config.get('codebook_dim', None),
        semantic_techer=config['semantic_techer'],  # sic: parameter name as spelled upstream
device=device
).to(device)
# Load checkpoint
print("Loading checkpoint...")
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
if 'model_state_dict' in checkpoint:
state_dict = checkpoint['model_state_dict']
else:
state_dict = checkpoint
# Remove 'module.' prefix if present (from DDP)
new_state_dict = {}
for k, v in state_dict.items():
if k.startswith('module.'):
new_state_dict[k[7:]] = v
else:
new_state_dict[k] = v
model.load_state_dict(new_state_dict, strict=False)
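    # strict=False tolerates missing or unexpected keys (e.g. weights of the
    # frozen semantic teacher that may not be stored in the checkpoint).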
# Fix all inference issues
print("Fixing inference issues...")
model = fix_all_inference_issues(model)
return model
# # Add paths
# sys.path.insert(0, "/home/ubuntu/AP-BWE")
# Configuration
OUTPUT_DIR = "/home/ubuntu/data_boson_44.1khz"
BATCH_SIZE = 32
SAMPLE_RATE = 44100
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DATASET_PATH = "/home/ubuntu/ttsar/Layla/src_bpe_2/Qanary_data"
# # Model paths
# CONFIG_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/config.json"
# MODEL_PATH = "/home/ubuntu/.cache/huggingface/hub/models--bosonai--higgs-audio-v2-tokenizer/snapshots/9d4988fbd4ad07b4cac3a5fa462741a41810dbec/model.pth"
# --- Setup ---
print(f"Using device: {DEVICE}")
# Change to working directory
os.chdir("/home/ubuntu/ttsar/boson_audio_codec/audio_processing")
# Load dataset
print(f"Loading dataset from: {DATASET_PATH}")
ds = load_from_disk(DATASET_PATH)
print(f"Dataset info: {ds}")
# Remove unnecessary columns
columns_to_remove = ['spk', 'duration', 'codes', 'input_ids', 'attention_mask']
existing_columns = [col for col in columns_to_remove if col in ds.column_names]
if existing_columns:
ds = ds.remove_columns(existing_columns)
df = ds.to_pandas()
print(f"Loaded {len(df)} files from dataset")
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory '{OUTPUT_DIR}' is ready.")
# --- Filter already processed ---
print("Checking for already processed files...")
def get_output_path(audio_path):
base_name = Path(audio_path).stem
return os.path.join(OUTPUT_DIR, f"{base_name}.pt")
# Filter
original_count = len(df)
df['output_exists'] = df['filename'].apply(lambda x: os.path.exists(get_output_path(x)))
df_filtered = df[~df['output_exists']].copy()
skipped_count = original_count - len(df_filtered)
print(f"Found {skipped_count} already processed files. Skipping them.")
print(f"Processing {len(df_filtered)} remaining files.")
if len(df_filtered) == 0:
    print("All files have already been processed!")
    sys.exit(0)
# --- Load Model ---
print("Loading Higgs Audio Tokenizer model...")
# Load config
# with open(CONFIG_PATH, 'r') as f:
# config = json.load(f)
# # Initialize model
# model = HiggsAudioTokenizer(
# **config,
# device=DEVICE,
# )
# Load weights
# parameter_dict = torch.load(MODEL_PATH, map_location=DEVICE)
# _ = model.load_state_dict(parameter_dict, strict=False)
# model = model.to(DEVICE)
# _ = model.eval()
checkpoint_path = '/home/ubuntu/ttsar/boson_audio_codec/audio_processing/outputs_CQT/checkpoints/step_99000.pth'
config_path = '/home/ubuntu/ttsar/boson_audio_codec/audio_processing/config copy.json'
device = DEVICE
model = inference_pipeline(checkpoint_path, config_path, device)
_ = model.eval()
model = remove_weight_norms_from_model(model)
print(f"Model loaded on {DEVICE}")
# Get hop length
hop_length = model.hop_length
print(f"Encoder hop length: {hop_length}")
# --- Batch Processing ---
print(f"\nStarting batch processing with batch size {BATCH_SIZE}...")
# Process in batches
filenames = df_filtered['filename'].tolist()
total_processed = 0
total_errors = 0
with torch.no_grad():
for batch_start in tqdm(range(0, len(filenames), BATCH_SIZE), desc="Processing batches"):
batch_end = min(batch_start + BATCH_SIZE, len(filenames))
batch_filenames = filenames[batch_start:batch_end]
batch_audio = []
batch_lengths = []
batch_outputs = []
# Load batch
for filename in batch_filenames:
output_path = get_output_path(filename)
            # Skip if the output appeared after the initial filter pass
            # (e.g. written by an earlier partial run)
if os.path.exists(output_path):
continue
try:
# Load audio
wav, _ = librosa.load(filename, sr=SAMPLE_RATE)
wav_tensor = torch.from_numpy(wav).float()
batch_audio.append(wav_tensor)
batch_lengths.append(len(wav))
batch_outputs.append(output_path)
except Exception as e:
print(f"\nError loading {filename}: {e}")
total_errors += 1
continue
if not batch_audio:
continue
# Pad batch to same length
max_len = max(len(x) for x in batch_audio)
padded_batch = []
for audio in batch_audio:
pad_len = max_len - len(audio)
if pad_len > 0:
audio = F.pad(audio, (0, pad_len), mode='constant', value=0)
# Don't add extra dimensions here, just collect the padded audio
padded_batch.append(audio)
# Convert list to tensor and add channel dimension
# Stack along batch dimension to get [B, T]
batch_tensor = torch.stack(padded_batch, dim=0) # [B, T]
# Add channel dimension
batch_tensor = batch_tensor.unsqueeze(1) # [B, 1, T]
batch_tensor = batch_tensor.to(DEVICE)
# Encode batch
try:
encoded = encode_batch(model, batch_tensor)
codes = encoded.audio_codes # [B, n_codebooks, T_compressed]
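            # The batch was zero-padded to a common length, so `codes` contains
            # trailing frames produced from padding; each item is trimmed back
            # to ceil(orig_len / hop_length) frames before saving.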
# Save each item
for idx, (output_path, orig_len) in enumerate(zip(batch_outputs, batch_lengths)):
# Calculate true code length
true_code_len = int(np.ceil(orig_len / hop_length))
# Extract non-padded codes
item_codes = codes[idx, :, :true_code_len].cpu()
# Save
torch.save(item_codes, output_path)
total_processed += 1
except Exception as e:
print(f"\nError encoding batch: {e}")
total_errors += len(batch_outputs)
print("\n" + "="*50)
print("PROCESSING COMPLETE!")
print("="*50)
print(f"Successfully processed: {total_processed} files")
print(f"Previously processed: {skipped_count} files")
print(f"Errors encountered: {total_errors} files")
print(f"Output directory: {OUTPUT_DIR}")
# Final count
final_count = len(list(Path(OUTPUT_DIR).glob("*.pt")))
print(f"Total .pt files in output: {final_count}")