import os
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from datasets import Audio, load_dataset
from safetensors.torch import save_file
from tqdm import tqdm
from transformers import AutoFeatureExtractor, WhisperModel

from .config import *
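
# `.config` is assumed to provide the settings referenced throughout this
# script: ENABLED_MODELS, MAX_SAMPLES, USE_HALF_PRECISION,
# HALF_PRECISION_DTYPE, AGGRESSIVE_CLEANUP, BATCH_SIZE, TRAINING_STEPS,
# LEARNING_RATE, OUTPUT_DIR, SAVE_PLOTS, and PLOT_DPI. Because of the
# relative import, run this module with `python -m <package>.<module>`.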
model_ids = ENABLED_MODELS

dataset = load_dataset("JacobLinCool/cv161-en-zh-subset-200", split="train")
if MAX_SAMPLES is not None:
    dataset = dataset.select(range(min(MAX_SAMPLES, len(dataset))))
    print(f"Limited dataset to {len(dataset)} samples for testing")

# Whisper feature extractors expect 16 kHz input.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f"Using device: {device}")


def extract_layer_reps_generator(model_id, batch_size=4):
    """
    Extract encoder hidden states with a generator so that only one sample's
    hidden states are in flight at a time. Samples are selected from the
    dataset in chunks of `batch_size` but run through the model one by one.
    Yields (sample_idx, layer_reps) pairs, where layer_reps is the list of all
    layer representations for that sample, including the embedding output.
    """
    model = WhisperModel.from_pretrained(model_id).to(device)
    feat_ext = AutoFeatureExtractor.from_pretrained(model_id)
    model.eval()

    for i in tqdm(
        range(0, len(dataset), batch_size), desc=f"Processing {model_id} in batches"
    ):
        batch_end = min(i + batch_size, len(dataset))
        batch_samples = dataset.select(range(i, batch_end))

        for j, sample in enumerate(batch_samples):
            audio = sample["audio"]
            samples = audio["array"]
            sr = audio["sampling_rate"]

            inputs = feat_ext(
                samples, sampling_rate=sr, return_tensors="pt"
            ).input_features.to(device)
            with torch.no_grad():
                outputs = model.encoder(
                    inputs, return_dict=True, output_hidden_states=True
                )

            layer_reps_for_sample = []
            for hs in outputs.hidden_states:
                # Drop the batch dimension: (1, time, dim) -> (time, dim).
                layer_rep = hs.squeeze(0)
                if USE_HALF_PRECISION:
                    layer_rep = layer_rep.to(HALF_PRECISION_DTYPE)
                layer_reps_for_sample.append(layer_rep)

            yield i + j, layer_reps_for_sample

            # Free the full encoder outputs before moving to the next sample.
            del outputs, inputs
            if AGGRESSIVE_CLEANUP and torch.cuda.is_available():
                torch.cuda.empty_cache()

    del model, feat_ext
    if AGGRESSIVE_CLEANUP and torch.cuda.is_available():
        torch.cuda.empty_cache()
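
# Usage sketch (illustrative; the checkpoint name is an example):
#
#     gen = extract_layer_reps_generator("openai/whisper-tiny", batch_size=4)
#     idx, reps = next(gen)
#     # reps[k] has shape (time, dim); len(reps) == num_encoder_layers + 1,
#     # since hidden_states includes the embedding output.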


def compute_linear_mse_matrix_temporal_memory_efficient(
    model_a_id, model_b_id, n_steps=200, lr=1e-3, batch_size=4
):
    """
    For each layer pair (i, j), trains a 1x1 convolution as a linear probe
    from model A's layer i to model B's layer j and records the final MSE.
    Each model's representations are extracted in a single pass and cached;
    memory is kept in check by that single pass, optional half-precision
    storage, and aggressive per-pair cleanup rather than by streaming.
    Returns an MSE matrix of shape (layers_a, layers_b) and all trained probes.
    """
    print(f"Computing alignment between {model_a_id} and {model_b_id}...")

    # Probe one sample per model just to learn the layer counts, then close
    # each generator so the Whisper model it loaded can be released.
    sample_gen_a = extract_layer_reps_generator(model_a_id, batch_size=1)
    _, sample_reps_a = next(sample_gen_a)
    layers_a = len(sample_reps_a)
    sample_gen_a.close()

    sample_gen_b = extract_layer_reps_generator(model_b_id, batch_size=1)
    _, sample_reps_b = next(sample_gen_b)
    layers_b = len(sample_reps_b)
    sample_gen_b.close()

    mse_mat = np.zeros((layers_a, layers_b))
    trained_probes = {}

    pbar = tqdm(total=layers_a * layers_b, desc="Comparing layer pairs")

    gen_a = extract_layer_reps_generator(model_a_id, batch_size=batch_size)
    gen_b = extract_layer_reps_generator(model_b_id, batch_size=batch_size)

    # Cache every layer of every sample up front so each model only runs over
    # the dataset once. The cached tensors live on `device`, so
    # USE_HALF_PRECISION roughly halves this footprint.
    reps_a_dict_all = {}
    for sample_idx, layer_reps in gen_a:
        reps_a_dict_all[sample_idx] = layer_reps

    reps_b_dict_all = {}
    for sample_idx, layer_reps in gen_b:
        reps_b_dict_all[sample_idx] = layer_reps

    for i in range(layers_a):
        for j in range(layers_b):
            # Pull out layer i of model A and layer j of model B per sample.
            reps_a_dict = {}
            for sample_idx, layer_reps in reps_a_dict_all.items():
                if i < len(layer_reps):
                    reps_a_dict[sample_idx] = layer_reps[i]

            reps_b_dict = {}
            for sample_idx, layer_reps in reps_b_dict_all.items():
                if j < len(layer_reps):
                    reps_b_dict[sample_idx] = layer_reps[j]

            # Concatenate all samples along the time axis, sorted by sample
            # index so the two models' frames stay aligned.
            X_list = [reps_a_dict[idx] for idx in sorted(reps_a_dict.keys())]
            Y_list = [reps_b_dict[idx] for idx in sorted(reps_b_dict.keys())]

            X_cat = torch.cat(X_list, dim=0).to(device)
            Y_cat = torch.cat(Y_list, dim=0).to(device)

            dim_a = X_cat.shape[1]
            dim_b = Y_cat.shape[1]

            # Conv1d expects (batch, channels, time): (1, dim, total_time).
            X = X_cat.T.unsqueeze(0)
            Y = Y_cat.T.unsqueeze(0)
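
            # A kernel-size-1 Conv1d with no bias applies the same
            # (dim_b, dim_a) matrix at every time step: probe(X) below equals
            # X_cat @ probe.weight.squeeze(-1).T up to the (1, dim, time)
            # layout, so the probe is a purely linear per-frame map.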
            probe = nn.Conv1d(
                in_channels=dim_a, out_channels=dim_b, kernel_size=1, bias=False
            ).to(device=device, dtype=X.dtype)  # match the cached reps' dtype
            probe.train()

            optimizer = torch.optim.Adam(probe.parameters(), lr=lr)
            loss_fn = nn.MSELoss()

            for step in tqdm(
                range(n_steps), desc=f"Training probe {i}->{j}", leave=False
            ):
                optimizer.zero_grad()
                Y_pred = probe(X)
                loss = loss_fn(Y_pred, Y)
                loss.backward()
                optimizer.step()

            # MSE of the last training step serves as the alignment score.
            final_mse = loss.item()
            mse_mat[i, j] = final_mse
            # Store the weight on CPU so the GPU copy is freed with the probe.
            trained_probes[f"layer_{i}_to_{j}"] = (
                probe.weight.detach().cpu().contiguous()
            )

            # Drop everything pair-specific before the next iteration.
            del (
                X_cat,
                Y_cat,
                X,
                Y,
                probe,
                optimizer,
                reps_a_dict,
                reps_b_dict,
                X_list,
                Y_list,
            )
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            pbar.update(1)
            pbar.set_postfix({"layer_a": i, "layer_b": j, "mse": f"{final_mse:.4f}"})

    pbar.close()
    return mse_mat, trained_probes
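
# The saved probes can be reloaded later, e.g. (a sketch; the file name
# follows the pattern used under __main__ below):
#
#     from safetensors.torch import load_file
#     probes = load_file(f"{OUTPUT_DIR}/tiny-to-base-probes.safetensors")
#     W = probes["layer_0_to_0"]  # shape: (dim_b, dim_a, 1)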


if __name__ == "__main__":
    print("Memory optimization settings:")
    print(f"  Batch size: {BATCH_SIZE}")
    print(f"  Training steps: {TRAINING_STEPS}")
    if USE_HALF_PRECISION:
        dtype_name = (
            "bfloat16" if HALF_PRECISION_DTYPE == torch.bfloat16 else "float16"
        )
        print(f"  Half precision: {USE_HALF_PRECISION} ({dtype_name})")
    else:
        print(f"  Half precision: {USE_HALF_PRECISION}")
    print(f"  Aggressive cleanup: {AGGRESSIVE_CLEANUP}")
    print(f"  Models: {list(model_ids.keys())}")
    print(f"  Dataset size: {len(dataset)} samples")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    model_names = list(model_ids.keys())
    all_pairs = list(combinations(model_names, 2))

    print(
        f"\nProcessing {len(all_pairs)} model pairs with memory-efficient approach..."
    )

    for pair_idx, (model_a, model_b) in enumerate(all_pairs):
        print(
            f"\n[{pair_idx + 1}/{len(all_pairs)}] Computing temporal linear MSE for whisper-{model_a} vs whisper-{model_b}..."
        )

        mse_mat_temporal, trained_probes = (
            compute_linear_mse_matrix_temporal_memory_efficient(
                model_ids[model_a],
                model_ids[model_b],
                n_steps=TRAINING_STEPS,
                lr=LEARNING_RATE,
                batch_size=BATCH_SIZE,
            )
        )

        # safetensors metadata values must be strings.
        model_save_path = f"{OUTPUT_DIR}/{model_a}-to-{model_b}-probes.safetensors"
        save_file(
            trained_probes,
            model_save_path,
            {
                "from_model": model_a,
                "to_model": model_b,
                "from_layers": str(len(mse_mat_temporal)),
                "to_layers": str(len(mse_mat_temporal[0])),
            },
        )
        print(f"Saved trained probes to: {model_save_path}")

        if SAVE_PLOTS:
            # Plot on a log scale; eps avoids log10(0) for near-zero MSE.
            eps = 1e-10
            log_mse_mat = -np.log10(mse_mat_temporal + eps)
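
            # Example: an MSE of 1e-3 maps to -log10(1e-3 + 1e-10) ≈ 3.0;
            # larger plotted values therefore mean tighter linear alignment.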

            plt.figure(figsize=(8, 6))
            plt.imshow(log_mse_mat, aspect="auto", origin="lower")
            plt.colorbar(label="-log10(MSE)")
            plt.title(
                f"Temporal Linear MSE (log scale): whisper-{model_a} vs whisper-{model_b}"
            )
            plt.xlabel(f"whisper-{model_b} layers")
            plt.ylabel(f"whisper-{model_a} layers")
            plt.tight_layout()

            plot_save_path = (
                f"{OUTPUT_DIR}/{model_a}-vs-{model_b}-temporal-linear-mse-log.png"
            )
            plt.savefig(plot_save_path, dpi=PLOT_DPI)
            plt.close()
            print(f"Saved plot to: {plot_save_path}")
| | print(f"\nAll experiments complete! Results saved to '{OUTPUT_DIR}' directory") |
| | print( |
| | f"Generated {len(all_pairs)} visualization plots and {len(all_pairs)} trained probe models" |
| | ) |
| |
|