# Testk / low.py
# MoeZilla's picture
# Create low.py
# 4cfe38d verified
import torch
from safetensors.torch import load_file, save_file
def fp16_to_fp8(tensor):
    """Quantize a tensor to INT8 using a single symmetric per-tensor scale.

    Despite the name, no real FP8 dtype is produced: values are divided by
    ``max(|tensor|) / 127``, rounded to the nearest integer, clamped to
    ``[-127, 127]`` and stored as ``torch.int8``.

    Args:
        tensor: Tensor to quantize (FP16 in this file's usage).

    Returns:
        A ``(quantized, scale)`` tuple. ``quantized`` is a ``torch.int8``
        tensor with the same shape as ``tensor``; ``scale`` is a 0-dim tensor
        such that ``quantized * scale`` approximates ``tensor``.
    """
    if tensor.numel() == 0:
        # max() cannot reduce an empty tensor; there is nothing to quantize,
        # so return an empty INT8 tensor with a neutral scale.
        neutral = torch.ones((), dtype=tensor.dtype, device=tensor.device)
        return tensor.to(torch.int8), neutral

    # 127 is the largest magnitude kept, so the INT8 range stays symmetric.
    scale = tensor.abs().max() / 127.0
    if scale == 0:
        # All-zero input (or a scale that underflowed to zero): dividing by
        # it would produce NaN/inf, so fall back to a neutral scale of 1.
        scale = torch.ones_like(scale)

    # Round to nearest instead of letting the integer cast truncate toward
    # zero. The rounding is done in float32, which represents every value in
    # the clamped range exactly.
    scaled = (tensor / scale).to(torch.float32)
    tensor_fp8 = scaled.round().clamp(-127, 127).to(torch.int8)
    return tensor_fp8, scale
def fp8_to_fp16(tensor_fp8, scale):
    """Dequantize: cast the stored tensor to FP16 and multiply by its scale.

    Args:
        tensor_fp8: Quantized tensor to restore.
        scale: Scale factor to multiply the cast tensor by.

    Returns:
        The product of the FP16-cast tensor and ``scale``.
    """
    restored = tensor_fp8.to(dtype=torch.float16)
    restored = restored * scale
    return restored
def convert_model_to_fp8(model_path, output_path):
    """Quantize every FP16 tensor of a safetensors file and save the result.

    Each ``torch.float16`` tensor loaded from ``model_path`` is passed through
    ``fp16_to_fp8``; tensors of any other dtype are carried over unchanged.
    The resulting tensors are written to ``output_path``. The per-tensor
    scales are stored in the output file's metadata, one
    ``scale:<tensor name>`` entry per converted tensor, so the file can be
    dequantized later without the return value of this call.

    Args:
        model_path: Path of the input safetensors file.
        output_path: Path the converted safetensors file is written to.

    Returns:
        A dict mapping the name of each converted tensor to the scale that
        ``fp16_to_fp8`` returned for it.
    """
    tensors = load_file(model_path)

    converted_tensors = {}
    scales = {}
    for key, tensor in tensors.items():
        if tensor.dtype != torch.float16:
            # Keep non-FP16 tensors as is.
            converted_tensors[key] = tensor
            continue

        tensor_fp8, scale = fp16_to_fp8(tensor)
        converted_tensors[key] = tensor_fp8
        scales[key] = scale  # needed to map the integers back to real values
        print(f"Converted tensor {key} from FP16 to FP8.")

    # safetensors metadata values must be strings, so each scale is stored as
    # the repr of its Python float. Without the scales the saved integers
    # could not be converted back.
    metadata = {f"scale:{key}": repr(float(scale)) for key, scale in scales.items()}
    save_file(converted_tensors, output_path, metadata=metadata)
    print(f"Model saved in FP8 format to {output_path}")

    return scales
# Example usage: quantize the FP16 checkpoint and write the converted copy
# next to it.
convert_model_to_fp8(
    model_path='flowgram.safetensors',
    output_path='flowgram_fp8.safetensors',
)