""" Konverterar cisco-ai/SecureBERT2.0-cross_encoder till Core ML via torch.export. torch.export är PyTorchs nyare export-mekanism (sedan PyTorch 2.1) som hanterar dynamiska shapes och int-ops bättre än torch.jit.trace. Output: output/SecureBERT2_CrossEncoder_FP32.mlpackage output/SecureBERT2_CrossEncoder_FP16.mlpackage """ import os import sys import time import numpy as np import torch import coremltools as ct from transformers import AutoTokenizer, AutoModelForSequenceClassification # ============================================================================ # KONFIGURATION # ============================================================================ MODEL_ID = "cisco-ai/SecureBERT2.0-cross_encoder" OUTPUT_DIR = "./output" MAX_SEQUENCE_LENGTH = 512 # ============================================================================ # STEG 1: LADDA MODELL # ============================================================================ print("=" * 70) print("Fas 1 (via torch.export): SecureBERT2.0-cross_encoder → Core ML") print("=" * 70) print() os.makedirs(OUTPUT_DIR, exist_ok=True) print(f"[1/6] Laddar tokenizer och modell...") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = AutoModelForSequenceClassification.from_pretrained( MODEL_ID, attn_implementation="eager", torch_dtype=torch.float32 ) model.eval() print(f" ✓ Modell laddad") print(f" Architecture: {model.config.architectures}") print() # ============================================================================ # STEG 2: FÖRBERED EXAMPLE INPUT # ============================================================================ print(f"[2/6] Förbereder example input...") example_query = "How do I configure vPC peer-link on Nexus 9000?" example_document = ( "vPC peer-link must be configured as a port-channel with all VLANs " "allowed. Use LACP active mode and spanning-tree port type network " "for fast convergence between peer switches." ) example_inputs = tokenizer( example_query, example_document, padding="max_length", truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt" ) # torch.export behöver explicit int64 (default i transformers) input_ids = example_inputs['input_ids'] attention_mask = example_inputs['attention_mask'] print(f" ✓ Input shape: {input_ids.shape}, dtype: {input_ids.dtype}") print() # ============================================================================ # STEG 3: VERIFIERA PYTORCH # ============================================================================ print(f"[3/6] Verifierar PyTorch-modellen...") with torch.no_grad(): pytorch_output = model(input_ids=input_ids, attention_mask=attention_mask) pytorch_score = torch.sigmoid(pytorch_output.logits).item() print(f" PyTorch score: {pytorch_score:.4f}") print() # ============================================================================ # STEG 4: WRAPPA MODELL # ============================================================================ print(f"[4/6] Wrappar modell med sigmoid...") class CrossEncoderWrapper(torch.nn.Module): def __init__(self, model): super().__init__() self.model = model def forward(self, input_ids, attention_mask): outputs = self.model( input_ids=input_ids, attention_mask=attention_mask ) return torch.sigmoid(outputs.logits) wrapped_model = CrossEncoderWrapper(model) wrapped_model.eval() print(f" ✓ Wrapper skapad") print() # ============================================================================ # STEG 5: EXPORTERA MED torch.export # ============================================================================ print(f"[5/6] Exporterar med torch.export.export...") print(f" (Detta tar 30-60 sekunder)") start = time.time() try: # torch.export.export returnerar en ExportedProgram # Den hanterar dynamiska ops bättre än jit.trace exported_program = torch.export.export( wrapped_model, args=(input_ids, attention_mask), # Strict=False tillåter mer flexibilitet med ops strict=False, ) export_time = time.time() - start print(f" ✓ Export klar på {export_time:.1f}s") print(f" Type: {type(exported_program).__name__}") # Dekomponera TRAINING-dialekt till ATEN-dialekt som coremltools förstår print(f" Dekomponerar till ATEN-dialekt för coremltools...") decomp_start = time.time() exported_program = exported_program.run_decompositions({}) print(f" ✓ Dekomposition klar på {time.time() - decomp_start:.1f}s") # Verifiera att exporterad modell ger samma output with torch.no_grad(): exported_output = exported_program.module()(input_ids, attention_mask) exported_score = exported_output.item() print(f" PyTorch: {pytorch_score:.6f}") print(f" Exported: {exported_score:.6f}") print(f" Diff: {abs(pytorch_score - exported_score):.6f}") print() except Exception as e: print(f" ✗ torch.export.export misslyckades: {e}") print() print(f"Detta är oväntat eftersom ONNX-export fungerade.") print(f"Möjlig orsak: torch.export är striktare än ONNX-export.") print() print(f"Fallback: prova med dynamic_shapes-spec eller andra args.") sys.exit(1) # ============================================================================ # STEG 6: KONVERTERA TILL CORE ML # ============================================================================ print(f"[6/6] Konverterar ExportedProgram → Core ML...") print() print(f" a) FP32-version...") start = time.time() try: mlmodel_fp32 = ct.convert( exported_program, inputs=[ ct.TensorType( name="input_ids", shape=(1, MAX_SEQUENCE_LENGTH), dtype=np.int32 ), ct.TensorType( name="attention_mask", shape=(1, MAX_SEQUENCE_LENGTH), dtype=np.int32 ), ], outputs=[ct.TensorType(name="score")], convert_to="mlprogram", compute_precision=ct.precision.FLOAT32, compute_units=ct.ComputeUnit.ALL, minimum_deployment_target=ct.target.macOS14, ) fp32_time = time.time() - start print(f" ✓ FP32-konvertering klar på {fp32_time:.1f}s") mlmodel_fp32.author = "Cisco AI (konverterad för Nomad via torch.export)" mlmodel_fp32.short_description = ( "SecureBERT 2.0 Cross-Encoder for text reranking. " "Input: query + document pair. Output: similarity score (0-1)." ) mlmodel_fp32.version = "1.0" fp32_path = os.path.join(OUTPUT_DIR, "SecureBERT2_CrossEncoder_FP32.mlpackage") mlmodel_fp32.save(fp32_path) fp32_size = sum( os.path.getsize(os.path.join(dp, f)) for dp, _, files in os.walk(fp32_path) for f in files ) / (1024 * 1024) print(f" ✓ Sparad: {fp32_path}") print(f" Storlek: {fp32_size:.1f} MB") print() except Exception as e: print(f" ✗ FP32-konvertering misslyckades: {e}") print() print(f"Vid detta läge har vi uttömt de vanligaste vägarna:") print(f" ✗ torch.jit.trace + coremltools") print(f" ✗ ONNX som input till coremltools 9.0") print(f" ✗ torch.export + coremltools") print() print(f"Nästa steg vore att försöka optimum-cli från HuggingFace.") sys.exit(1) # ============================================================================ # FP16 # ============================================================================ print(f" b) FP16-version...") start = time.time() try: mlmodel_fp16 = ct.convert( exported_program, inputs=[ ct.TensorType( name="input_ids", shape=(1, MAX_SEQUENCE_LENGTH), dtype=np.int32 ), ct.TensorType( name="attention_mask", shape=(1, MAX_SEQUENCE_LENGTH), dtype=np.int32 ), ], outputs=[ct.TensorType(name="score")], convert_to="mlprogram", compute_precision=ct.precision.FLOAT16, compute_units=ct.ComputeUnit.ALL, minimum_deployment_target=ct.target.macOS14, ) fp16_time = time.time() - start print(f" ✓ FP16-konvertering klar på {fp16_time:.1f}s") mlmodel_fp16.author = "Cisco AI (konverterad för Nomad via torch.export, FP16)" mlmodel_fp16.short_description = ( "SecureBERT 2.0 Cross-Encoder for text reranking (FP16). " "Input: query + document pair. Output: similarity score (0-1)." ) mlmodel_fp16.version = "1.0" fp16_path = os.path.join(OUTPUT_DIR, "SecureBERT2_CrossEncoder_FP16.mlpackage") mlmodel_fp16.save(fp16_path) fp16_size = sum( os.path.getsize(os.path.join(dp, f)) for dp, _, files in os.walk(fp16_path) for f in files ) / (1024 * 1024) print(f" ✓ Sparad: {fp16_path}") print(f" Storlek: {fp16_size:.1f} MB") print() except Exception as e: print(f" ✗ FP16-konvertering misslyckades: {e}") print(f" FP32 är OK, vi fortsätter med den") print() # ============================================================================ # VERIFIERING # ============================================================================ print(f"=" * 70) print(f"VERIFIERING: Core ML vs PyTorch") print(f"=" * 70) print() test_cases = [ ("How do I configure vPC peer-link on Nexus 9000?", "vPC peer-link must be configured as a port-channel with all VLANs allowed. Use LACP active mode and spanning-tree port type network for fast convergence.", "Hög (relaterat)"), ("How do I configure vPC peer-link on Nexus 9000?", "BGP neighbor configuration requires remote-as statement and update-source for stable peering between routers.", "Mellan (samma domän)"), ("How do I configure vPC peer-link on Nexus 9000?", "How to bake a chocolate cake: mix flour, sugar, cocoa powder and butter.", "Låg (orelaterat)"), ] print(f"{'Förväntan':<30} {'PyTorch':<12} {'CoreML FP32':<14} {'Diff':<10}") print("-" * 66) max_diff = 0.0 for query, doc, expected in test_cases: inputs = tokenizer( query, doc, padding="max_length", truncation=True, max_length=MAX_SEQUENCE_LENGTH, return_tensors="pt" ) with torch.no_grad(): pt_score = torch.sigmoid(model(**inputs).logits).item() cm_inputs = { "input_ids": inputs["input_ids"].numpy().astype(np.int32), "attention_mask": inputs["attention_mask"].numpy().astype(np.int32), } cm_output = mlmodel_fp32.predict(cm_inputs) cm_score = float(cm_output["score"].flatten()[0]) diff = abs(pt_score - cm_score) max_diff = max(max_diff, diff) print(f"{expected:<30} {pt_score:<12.4f} {cm_score:<14.4f} {diff:<10.6f}") print() print(f"Max diff: {max_diff:.6f}") if max_diff < 0.001: print(f"✓ EXCELLENT: Core ML är numeriskt identisk med PyTorch") elif max_diff < 0.01: print(f"✓ OK: Mindre numerisk drift, acceptabel") else: print(f"⚠️ Märkbar drift, verifiera rangordning") print() print(f"=" * 70) print(f"KLART") print(f"=" * 70) print() print(f"Output:") print(f" {fp32_path}") if 'fp16_path' in dir(): print(f" {fp16_path}")