# SecureBERT2.0-coreml / convert_via_torch_export.py
# chrslrssn's picture
# Upload 10 files
# d9f49f0 verified
"""
Convert cisco-ai/SecureBERT2.0-cross_encoder to Core ML via torch.export.

torch.export is PyTorch's newer export mechanism (available since PyTorch 2.1),
which handles dynamic shapes and integer ops better than torch.jit.trace.

Output:
    output/SecureBERT2_CrossEncoder_FP32.mlpackage
    output/SecureBERT2_CrossEncoder_FP16.mlpackage
"""
import os
import sys
import time
import numpy as np
import torch
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# ----------------------------------------------------------------------------
# CONFIGURATION
# ----------------------------------------------------------------------------
# Hugging Face Hub identifier of the cross-encoder checkpoint to convert.
MODEL_ID: str = "cisco-ai/SecureBERT2.0-cross_encoder"
# Directory that receives the generated .mlpackage bundles.
OUTPUT_DIR: str = "./output"
# Fixed token length: used for tokenizer padding/truncation and as the second
# dimension of the Core ML input tensors.
MAX_SEQUENCE_LENGTH: int = 512
# ----------------------------------------------------------------------------
# STEP 1: LOAD MODEL
# ----------------------------------------------------------------------------
# Opening banner: rule, title, rule, blank line.
print(
    "=" * 70,
    "Fas 1 (via torch.export): SecureBERT2.0-cross_encoder → Core ML",
    "=" * 70,
    "",
    sep="\n",
)

# The output directory must exist before any .mlpackage is saved into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("[1/6] Laddar tokenizer och modell...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# NOTE(review): eager attention and float32 weights are presumably requested so
# that the exported graph consists of plain ops the converter can handle —
# confirm against the model's supported attention implementations.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    attn_implementation="eager",
    torch_dtype=torch.float32,
)
# Inference mode for the module (affects layers such as dropout).
model.eval()
print(" ✓ Modell laddad")
print(f" Architecture: {model.config.architectures}")
print()
# ----------------------------------------------------------------------------
# STEP 2: PREPARE EXAMPLE INPUT
# ----------------------------------------------------------------------------
print("[2/6] Förbereder example input...")

# A representative (query, document) pair; it is used both as the reference
# input for the PyTorch model and as the example input for the export.
example_query = "How do I configure vPC peer-link on Nexus 9000?"
example_document = (
    "vPC peer-link must be configured as a port-channel "
    "with all VLANs allowed. "
    "Use LACP active mode and spanning-tree port type network "
    "for fast convergence between peer switches."
)

# Pad/truncate to exactly MAX_SEQUENCE_LENGTH tokens and return PyTorch tensors.
example_inputs = tokenizer(
    example_query,
    example_document,
    max_length=MAX_SEQUENCE_LENGTH,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

# torch.export needs explicit int64 tensors (the transformers default); the
# dtype is printed below so this can be checked in the log.
input_ids, attention_mask = (
    example_inputs["input_ids"],
    example_inputs["attention_mask"],
)
print(f" ✓ Input shape: {input_ids.shape}, dtype: {input_ids.dtype}")
print()
# ----------------------------------------------------------------------------
# STEP 3: VERIFY PYTORCH
# ----------------------------------------------------------------------------
print("[3/6] Verifierar PyTorch-modellen...")

# Reference score from the unmodified PyTorch model; later steps compare the
# exported program against this value.
with torch.no_grad():
    pytorch_output = model(attention_mask=attention_mask, input_ids=input_ids)
    pytorch_score = pytorch_output.logits.sigmoid().item()

print(f" PyTorch score: {pytorch_score:.4f}")
print()
# ----------------------------------------------------------------------------
# STEP 4: WRAP MODEL
# ----------------------------------------------------------------------------
print("[4/6] Wrappar modell med sigmoid...")
class CrossEncoderWrapper(torch.nn.Module):
    """Adapt a sequence-classification model to a tensor-in / tensor-out module.

    The wrapped model is called with ``input_ids`` and ``attention_mask`` as
    keyword arguments; its ``logits`` are passed through a sigmoid and returned
    as a plain tensor instead of a model-output object.
    """

    def __init__(self, model):
        """Store ``model``, the callable whose output exposes ``.logits``."""
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Return ``sigmoid(logits)`` for the given token ids and mask."""
        logits = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits
        return logits.sigmoid()
# Wrap the loaded model so it returns sigmoid scores; keep it in eval mode for
# the export. (nn.Module.eval() returns the module itself.)
wrapped_model = CrossEncoderWrapper(model).eval()
print(" ✓ Wrapper skapad")
print()
# ----------------------------------------------------------------------------
# STEP 5: EXPORT WITH torch.export
# ----------------------------------------------------------------------------
print("[5/6] Exporterar med torch.export.export...")
print(" (Detta tar 30-60 sekunder)")
start = time.time()
try:
    # torch.export.export returns an ExportedProgram; it copes with dynamic
    # ops better than torch.jit.trace. strict=False (non-strict mode) allows
    # more flexibility in the ops the model may use.
    exported_program = torch.export.export(
        wrapped_model,
        args=(input_ids, attention_mask),
        strict=False,
    )
    export_time = time.time() - start
    print(f" ✓ Export klar på {export_time:.1f}s")
    print(f" Type: {type(exported_program).__name__}")

    # Decompose from the training dialect to the ATen dialect that
    # coremltools understands.
    print(" Dekomponerar till ATEN-dialekt för coremltools...")
    decomp_start = time.time()
    exported_program = exported_program.run_decompositions({})
    print(f" ✓ Dekomposition klar på {time.time() - decomp_start:.1f}s")

    # Sanity check: the exported program should reproduce the PyTorch score
    # computed in step 3 for the same example input.
    with torch.no_grad():
        exported_output = exported_program.module()(input_ids, attention_mask)
        exported_score = exported_output.item()
    print(f" PyTorch: {pytorch_score:.6f}")
    print(f" Exported: {exported_score:.6f}")
    print(f" Diff: {abs(pytorch_score - exported_score):.6f}")
    print()
except Exception as e:
    # Top-level boundary of the script: explain the failure and stop, since
    # the Core ML conversion cannot proceed without an ExportedProgram.
    for line in (
        f" ✗ torch.export.export misslyckades: {e}",
        "",
        "Detta är oväntat eftersom ONNX-export fungerade.",
        "Möjlig orsak: torch.export är striktare än ONNX-export.",
        "",
        "Fallback: prova med dynamic_shapes-spec eller andra args.",
    ):
        print(line)
    sys.exit(1)
# ----------------------------------------------------------------------------
# STEP 6: CONVERT TO CORE ML
# ----------------------------------------------------------------------------
print("[6/6] Konverterar ExportedProgram → Core ML...")
print()
print(" a) FP32-version...")
start = time.time()
try:
    mlmodel_fp32 = ct.convert(
        exported_program,
        # Both inputs share the same fixed (1, MAX_SEQUENCE_LENGTH) int32 spec.
        inputs=[
            ct.TensorType(
                name=input_name,
                shape=(1, MAX_SEQUENCE_LENGTH),
                dtype=np.int32,
            )
            for input_name in ("input_ids", "attention_mask")
        ],
        outputs=[ct.TensorType(name="score")],
        convert_to="mlprogram",
        compute_precision=ct.precision.FLOAT32,
        compute_units=ct.ComputeUnit.ALL,
        minimum_deployment_target=ct.target.macOS14,
    )
    fp32_time = time.time() - start
    print(f" ✓ FP32-konvertering klar på {fp32_time:.1f}s")

    # Model metadata stored inside the package.
    mlmodel_fp32.author = "Cisco AI (konverterad för Nomad via torch.export)"
    mlmodel_fp32.short_description = (
        "SecureBERT 2.0 Cross-Encoder for text reranking. "
        "Input: query + document pair. "
        "Output: similarity score (0-1)."
    )
    mlmodel_fp32.version = "1.0"

    fp32_path = os.path.join(OUTPUT_DIR, "SecureBERT2_CrossEncoder_FP32.mlpackage")
    mlmodel_fp32.save(fp32_path)

    # The package is walked as a directory tree; report its total size in MiB.
    fp32_size = sum(
        os.path.getsize(os.path.join(folder, filename))
        for folder, _, filenames in os.walk(fp32_path)
        for filename in filenames
    ) / (2 ** 20)
    print(f" ✓ Sparad: {fp32_path}")
    print(f" Storlek: {fp32_size:.1f} MB")
    print()
except Exception as e:
    # The FP32 model is required by the verification step, so any failure here
    # is fatal for the script.
    for line in (
        f" ✗ FP32-konvertering misslyckades: {e}",
        "",
        "Vid detta läge har vi uttömt de vanligaste vägarna:",
        " ✗ torch.jit.trace + coremltools",
        " ✗ ONNX som input till coremltools 9.0",
        " ✗ torch.export + coremltools",
        "",
        "Nästa steg vore att försöka optimum-cli från HuggingFace.",
    ):
        print(line)
    sys.exit(1)
# ----------------------------------------------------------------------------
# FP16
# ----------------------------------------------------------------------------
# Optional half-precision variant. Unlike the FP32 conversion, a failure here
# is not fatal: the script reports it and continues with the FP32 model.
print(" b) FP16-version...")
start = time.time()
try:
    mlmodel_fp16 = ct.convert(
        exported_program,
        inputs=[
            ct.TensorType(
                name="input_ids",
                shape=(1, MAX_SEQUENCE_LENGTH),
                dtype=np.int32,
            ),
            ct.TensorType(
                name="attention_mask",
                shape=(1, MAX_SEQUENCE_LENGTH),
                dtype=np.int32,
            ),
        ],
        outputs=[ct.TensorType(name="score")],
        convert_to="mlprogram",
        compute_precision=ct.precision.FLOAT16,
        compute_units=ct.ComputeUnit.ALL,
        minimum_deployment_target=ct.target.macOS14,
    )
    fp16_time = time.time() - start
    print(f" ✓ FP16-konvertering klar på {fp16_time:.1f}s")

    # Model metadata stored inside the package.
    mlmodel_fp16.author = "Cisco AI (konverterad för Nomad via torch.export, FP16)"
    mlmodel_fp16.short_description = (
        "SecureBERT 2.0 Cross-Encoder for text reranking (FP16). "
        "Input: query + document pair. Output: similarity score (0-1)."
    )
    mlmodel_fp16.version = "1.0"

    # Keep the destination in a private name until save() has succeeded.
    # The final summary prints `fp16_path` whenever that name exists, so
    # binding it before the save would advertise a package that was never
    # written if the save raises.
    _fp16_target = os.path.join(OUTPUT_DIR, "SecureBERT2_CrossEncoder_FP16.mlpackage")
    mlmodel_fp16.save(_fp16_target)
    fp16_path = _fp16_target

    # The package is walked as a directory tree; report its total size in MiB.
    fp16_size = sum(
        os.path.getsize(os.path.join(dp, f))
        for dp, _, files in os.walk(fp16_path)
        for f in files
    ) / (1024 * 1024)
    print(f" ✓ Sparad: {fp16_path}")
    print(f" Storlek: {fp16_size:.1f} MB")
    print()
except Exception as e:
    # Deliberately best-effort: report and carry on with the FP32 model.
    print(f" ✗ FP16-konvertering misslyckades: {e}")
    print(" FP32 är OK, vi fortsätter med den")
    print()
# ----------------------------------------------------------------------------
# VERIFICATION
# ----------------------------------------------------------------------------
# Compare the FP32 Core ML model against the PyTorch model on a few
# (query, document) pairs and report the largest absolute score difference.
print("=" * 70)
print("VERIFIERING: Core ML vs PyTorch")
print("=" * 70)
print()

# One query paired with three documents; the label is the expected relevance
# level and is only used for display.
test_cases = [
    ("How do I configure vPC peer-link on Nexus 9000?", document, label)
    for document, label in (
        (
            "vPC peer-link must be configured as a port-channel with all VLANs allowed. Use LACP active mode and spanning-tree port type network for fast convergence.",
            "Hög (relaterat)",
        ),
        (
            "BGP neighbor configuration requires remote-as statement and update-source for stable peering between routers.",
            "Mellan (samma domän)",
        ),
        (
            "How to bake a chocolate cake: mix flour, sugar, cocoa powder and butter.",
            "Låg (orelaterat)",
        ),
    )
]

print(f"{'Förväntan':<30} {'PyTorch':<12} {'CoreML FP32':<14} {'Diff':<10}")
print("-" * 66)

max_diff = 0.0
for query, doc, expected in test_cases:
    # Same tokenisation settings as the example input used for the export.
    inputs = tokenizer(
        query,
        doc,
        max_length=MAX_SEQUENCE_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Reference score from PyTorch.
    with torch.no_grad():
        pt_score = model(**inputs).logits.sigmoid().item()

    # The Core ML model was converted with int32 inputs, so cast accordingly.
    cm_inputs = {
        key: inputs[key].numpy().astype(np.int32)
        for key in ("input_ids", "attention_mask")
    }
    cm_output = mlmodel_fp32.predict(cm_inputs)
    cm_score = float(cm_output["score"].ravel()[0])

    diff = abs(pt_score - cm_score)
    if diff > max_diff:
        max_diff = diff
    print(f"{expected:<30} {pt_score:<12.4f} {cm_score:<14.4f} {diff:<10.6f}")

print()
print(f"Max diff: {max_diff:.6f}")
if max_diff < 0.001:
    print("✓ EXCELLENT: Core ML är numeriskt identisk med PyTorch")
elif max_diff < 0.01:
    print("✓ OK: Mindre numerisk drift, acceptabel")
else:
    print("⚠️ Märkbar drift, verifiera rangordning")
print()

print("=" * 70)
print("KLART")
print("=" * 70)
print()
print("Output:")
print(f" {fp32_path}")
# `fp16_path` only exists if the FP16 step got far enough to define it.
if "fp16_path" in globals():
    print(f" {fp16_path}")