# File size: 11,547 Bytes
#
# d9f49f0

"""
Konverterar cisco-ai/SecureBERT2.0-cross_encoder till Core ML via torch.export.

torch.export är PyTorchs nyare export-mekanism (sedan PyTorch 2.1) som hanterar
dynamiska shapes och int-ops bättre än torch.jit.trace.

Output:
    output/SecureBERT2_CrossEncoder_FP32.mlpackage
    output/SecureBERT2_CrossEncoder_FP16.mlpackage
"""

import os
import sys
import time
import traceback

import numpy as np
import torch
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ============================================================================
# CONFIGURATION
# ============================================================================

# Model identifier handed to the `from_pretrained` loaders below.
MODEL_ID = "cisco-ai/SecureBERT2.0-cross_encoder"
# Directory that receives the generated .mlpackage bundles.
OUTPUT_DIR = "./output"
# Token length that every tokenized pair is padded/truncated to.
MAX_SEQUENCE_LENGTH = 512

# ============================================================================
# STEP 1: LOAD MODEL
# ============================================================================

# Opening banner, followed by one empty line.
for banner_line in (
    "=" * 70,
    "Fas 1 (via torch.export): SecureBERT2.0-cross_encoder → Core ML",
    "=" * 70,
    "",
):
    print(banner_line)

os.makedirs(OUTPUT_DIR, exist_ok=True)

print("[1/6] Laddar tokenizer och modell...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Eager attention and float32 weights are requested explicitly.
# NOTE(review): presumably so the exporter only sees plain ops — confirm.
model_load_options = {
    "attn_implementation": "eager",
    "torch_dtype": torch.float32,
}
# `.eval()` returns the module itself, so `model` is the network in
# inference mode.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID, **model_load_options
).eval()
print("      ✓ Modell laddad")
print(f"      Architecture: {model.config.architectures}")
print()

# ============================================================================
# STEP 2: PREPARE EXAMPLE INPUT
# ============================================================================

print("[2/6] Förbereder example input...")

# One (query, document) pair; it serves as the example input for the export
# and as the reference case for the PyTorch sanity check.
example_query = "How do I configure vPC peer-link on Nexus 9000?"
example_document = (
    "vPC peer-link must be configured as a port-channel with all VLANs allowed. "
    "Use LACP active mode and spanning-tree port type network for fast "
    "convergence between peer switches."
)

# Pad/truncate to the fixed sequence length and return PyTorch tensors.
tokenizer_options = {
    "padding": "max_length",
    "truncation": True,
    "max_length": MAX_SEQUENCE_LENGTH,
    "return_tensors": "pt",
}
example_inputs = tokenizer(example_query, example_document, **tokenizer_options)

# torch.export is fed int64 ids. No cast is applied here: the tokenizer is
# expected to return int64 already, and the dtype is printed below so that
# can be checked.
input_ids = example_inputs["input_ids"]
attention_mask = example_inputs["attention_mask"]

print(f"      ✓ Input shape: {input_ids.shape}, dtype: {input_ids.dtype}")
print()

# ============================================================================
# STEP 3: VERIFY PYTORCH
# ============================================================================

print("[3/6] Verifierar PyTorch-modellen...")

# Reference score from the unmodified model; later steps compare against it.
with torch.no_grad():
    pytorch_output = model(input_ids=input_ids, attention_mask=attention_mask)
    reference_probability = torch.sigmoid(pytorch_output.logits)
    # .item() requires a single-element tensor, i.e. one logit for one pair.
    pytorch_score = reference_probability.item()

print(f"      PyTorch score: {pytorch_score:.4f}")
print()

# ============================================================================
# STEP 4: WRAP MODEL
# ============================================================================

# Progress line for the wrapping step.
print("[4/6] Wrappar modell med sigmoid...")

class CrossEncoderWrapper(torch.nn.Module):
    """Module that returns the sigmoid of the wrapped model's logits.

    The wrapped model is called with ``input_ids`` and ``attention_mask`` as
    keyword arguments and must return an object exposing ``.logits``.
    """

    def __init__(self, model):
        super().__init__()
        # Registered as a submodule under the attribute name `model`.
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Return ``sigmoid(logits)`` for the given token ids and mask."""
        logits = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits
        return torch.sigmoid(logits)

# Module.eval() returns the module itself, so the wrapper can be built and
# switched to inference mode in a single expression.
wrapped_model = CrossEncoderWrapper(model).eval()
print("      ✓ Wrapper skapad")
print()

# ============================================================================
# STEP 5: EXPORT WITH torch.export
# ============================================================================

print("[5/6] Exporterar med torch.export.export...")
print("      (Detta tar 30-60 sekunder)")

# perf_counter is a monotonic clock, so the reported durations cannot be
# skewed by wall-clock adjustments.
start = time.perf_counter()

try:
    # torch.export.export returns an ExportedProgram. strict=False selects
    # non-strict export, which traces through the Python interpreter
    # instead of TorchDynamo.
    exported_program = torch.export.export(
        wrapped_model,
        args=(input_ids, attention_mask),
        strict=False,
    )

    export_time = time.perf_counter() - start
    print(f"      ✓ Export klar på {export_time:.1f}s")
    print(f"      Type: {type(exported_program).__name__}")

    # Lower the TRAINING dialect to the ATEN dialect that coremltools accepts.
    print("      Dekomponerar till ATEN-dialekt för coremltools...")
    decomp_start = time.perf_counter()
    exported_program = exported_program.run_decompositions({})
    print(f"      ✓ Dekomposition klar på {time.perf_counter() - decomp_start:.1f}s")

    # Check that the exported program reproduces the eager PyTorch score.
    with torch.no_grad():
        exported_output = exported_program.module()(input_ids, attention_mask)
        exported_score = exported_output.item()

    print(f"      PyTorch:  {pytorch_score:.6f}")
    print(f"      Exported: {exported_score:.6f}")
    print(f"      Diff:     {abs(pytorch_score - exported_score):.6f}")
    print()

except Exception as e:
    print(f"      ✗ torch.export.export misslyckades: {e}")
    # The try block covers export, decomposition and the self-check; the
    # one-line message alone does not reveal which of them failed, so the
    # full traceback is emitted on the same stream as the rest of the log.
    traceback.print_exc(file=sys.stdout)
    print()
    print("Detta är oväntat eftersom ONNX-export fungerade.")
    print("Möjlig orsak: torch.export är striktare än ONNX-export.")
    print()
    print("Fallback: prova med dynamic_shapes-spec eller andra args.")
    sys.exit(1)
# ============================================================================
# STEP 6: CONVERT TO CORE ML
# ============================================================================


def _package_size_mb(package_path):
    """Return the combined size, in MiB, of every file below ``package_path``."""
    total_bytes = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(package_path)
        for filename in filenames
    )
    return total_bytes / (1024 * 1024)


def _convert_and_save(program, precision, label, author, description, filename):
    """Convert ``program`` to a Core ML ML Program and save it in OUTPUT_DIR.

    Args:
        program: The ExportedProgram handed to ``ct.convert``.
        precision: ``ct.precision`` member used as ``compute_precision``.
        label: Short tag ("FP32" / "FP16") used in the progress messages.
        author: Value stored in the model's ``author`` metadata.
        description: Value stored in the model's ``short_description``.
        filename: Name of the ``.mlpackage`` created inside ``OUTPUT_DIR``.

    Returns:
        ``(mlmodel, package_path)``. Nothing is returned unless both the
        conversion and the save completed, so a caller that binds the path
        from the return value never sees the path of an unsaved package.

    Raises:
        Any exception raised by ``ct.convert`` or ``mlmodel.save``; the
        caller decides whether the failure is fatal.
    """
    convert_started = time.perf_counter()
    mlmodel = ct.convert(
        program,
        # Both inputs share the same static (1, MAX_SEQUENCE_LENGTH) int32 spec.
        inputs=[
            ct.TensorType(
                name=input_name,
                shape=(1, MAX_SEQUENCE_LENGTH),
                dtype=np.int32,
            )
            for input_name in ("input_ids", "attention_mask")
        ],
        outputs=[ct.TensorType(name="score")],
        convert_to="mlprogram",
        compute_precision=precision,
        compute_units=ct.ComputeUnit.ALL,
        minimum_deployment_target=ct.target.macOS14,
    )
    elapsed = time.perf_counter() - convert_started
    print(f"         ✓ {label}-konvertering klar på {elapsed:.1f}s")

    mlmodel.author = author
    mlmodel.short_description = description
    mlmodel.version = "1.0"

    package_path = os.path.join(OUTPUT_DIR, filename)
    mlmodel.save(package_path)

    print(f"         ✓ Sparad: {package_path}")
    print(f"         Storlek: {_package_size_mb(package_path):.1f} MB")
    print()
    return mlmodel, package_path


print("[6/6] Konverterar ExportedProgram → Core ML...")
print()

print("      a) FP32-version...")

try:
    mlmodel_fp32, fp32_path = _convert_and_save(
        exported_program,
        precision=ct.precision.FLOAT32,
        label="FP32",
        author="Cisco AI (konverterad för Nomad via torch.export)",
        description=(
            "SecureBERT 2.0 Cross-Encoder for text reranking. "
            "Input: query + document pair. Output: similarity score (0-1)."
        ),
        filename="SecureBERT2_CrossEncoder_FP32.mlpackage",
    )

except Exception as e:
    # The FP32 package is required by the verification step, so this is fatal.
    print(f"         ✗ FP32-konvertering misslyckades: {e}")
    traceback.print_exc(file=sys.stdout)
    print()
    print("Vid detta läge har vi uttömt de vanligaste vägarna:")
    print("  ✗ torch.jit.trace + coremltools")
    print("  ✗ ONNX som input till coremltools 9.0")
    print("  ✗ torch.export + coremltools")
    print()
    print("Nästa steg vore att försöka optimum-cli från HuggingFace.")
    sys.exit(1)

# ============================================================================
# FP16
# ============================================================================

print("      b) FP16-version...")

try:
    # `fp16_path` is bound only when conversion AND save succeeded; the final
    # summary uses the existence of that name to decide whether to list it.
    mlmodel_fp16, fp16_path = _convert_and_save(
        exported_program,
        precision=ct.precision.FLOAT16,
        label="FP16",
        author="Cisco AI (konverterad för Nomad via torch.export, FP16)",
        description=(
            "SecureBERT 2.0 Cross-Encoder for text reranking (FP16). "
            "Input: query + document pair. Output: similarity score (0-1)."
        ),
        filename="SecureBERT2_CrossEncoder_FP16.mlpackage",
    )

except Exception as e:
    # Best effort: the FP16 variant is optional, so the script carries on.
    print(f"         ✗ FP16-konvertering misslyckades: {e}")
    traceback.print_exc(file=sys.stdout)
    print("         FP32 är OK, vi fortsätter med den")
    print()

# ============================================================================
# VERIFICATION
# ============================================================================

rule = "=" * 70
print(rule)
print("VERIFIERING: Core ML vs PyTorch")
print(rule)
print()

# (query, document, label) triples. The label is only used for display; the
# same query is paired with three different documents.
vpc_query = "How do I configure vPC peer-link on Nexus 9000?"
test_cases = [
    (
        vpc_query,
        "vPC peer-link must be configured as a port-channel with all VLANs "
        "allowed. Use LACP active mode and spanning-tree port type network "
        "for fast convergence.",
        "Hög (relaterat)",
    ),
    (
        vpc_query,
        "BGP neighbor configuration requires remote-as statement and "
        "update-source for stable peering between routers.",
        "Mellan (samma domän)",
    ),
    (
        vpc_query,
        "How to bake a chocolate cake: mix flour, sugar, cocoa powder and butter.",
        "Låg (orelaterat)",
    ),
]

# Table header: left-aligned columns separated by single spaces.
header_cells = (("Förväntan", 30), ("PyTorch", 12), ("CoreML FP32", 14), ("Diff", 10))
print(" ".join(title.ljust(width) for title, width in header_cells))
print("-" * 66)

max_diff = 0.0
for query, doc, expected in test_cases:
    inputs = tokenizer(
        query,
        doc,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        return_tensors="pt",
    )

    # Reference score from the PyTorch model.
    with torch.no_grad():
        reference_logits = model(**inputs).logits
        pt_score = torch.sigmoid(reference_logits).item()

    # The Core ML model is fed the same tensors, cast to int32.
    cm_inputs = {
        input_name: inputs[input_name].numpy().astype(np.int32)
        for input_name in ("input_ids", "attention_mask")
    }
    cm_output = mlmodel_fp32.predict(cm_inputs)
    cm_score = float(cm_output["score"].flatten()[0])

    # Track the largest absolute deviation seen so far.
    diff = abs(pt_score - cm_score)
    if diff > max_diff:
        max_diff = diff

    print(f"{expected:<30} {pt_score:<12.4f} {cm_score:<14.4f} {diff:<10.6f}")

print()
print(f"Max diff: {max_diff:.6f}")

# Classify the worst-case deviation into one of three verdicts.
if max_diff < 0.001:
    verdict = "✓ EXCELLENT: Core ML är numeriskt identisk med PyTorch"
elif max_diff < 0.01:
    verdict = "✓ OK: Mindre numerisk drift, acceptabel"
else:
    verdict = "⚠️  Märkbar drift, verifiera rangordning"
print(verdict)

# Closing banner and the list of packages that were produced.
for summary_line in ("", "=" * 70, "KLART", "=" * 70, "", "Output:"):
    print(summary_line)
print(f"  {fp32_path}")
# `fp16_path` exists as a module-level name only if the FP16 section bound
# it, so its presence decides whether the FP16 package is listed.
if "fp16_path" in globals():
    print(f"  {fp16_path}")