#!/usr/bin/env python
"""EC-SimToken 2-step smoke test.

Verifies three core invariants before committing to 9-hour full training:
  1. exist_loss > 0  — is_null is reaching model_forward and BCE is computed
  2. mask_loss ≈ 0   — null gate skips mask loss for null samples
  3. exist_logit.shape[0] == batch_size — tensor shapes are consistent

Expected runtime: ~3-4 minutes (model load dominates), 2 forward passes.

Usage:
    cd /workspace/SimToken && conda activate simtoken
    python tools/ec_simtoken_smoke_test.py 2>&1 | tee runs/ec_simtoken_smoke.log
"""
from __future__ import annotations

import os
import random
import sys
from argparse import Namespace
from functools import partial

# Pin the run to GPU 0. CUDA_VISIBLE_DEVICES is only honoured if it is set
# before CUDA is initialised, so assign it *before* importing torch /
# transformers / peft rather than relying on none of them touching CUDA at
# import time.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import torch
import transformers
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader
from transformers import AutoConfig

# ROOT is the parent of the directory that contains this script. It is put at
# the front of sys.path so the project's own `datasets` and `models` packages
# are resolved first by the imports below.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)

from datasets.dataset_refavs import REFAVS
from models.ec_simtoken_model import ECSimtoken_ForCausalLM

# ── Paths & constants ─────────────────────────────────────────────────────────
# Absolute locations of the checkpoints and data this smoke test expects.
MLLM = "/workspace/hf_models/Chat-UniVi-7B-v1.5"
SAM_CKPT = "/workspace/SimToken/models/segment_anything/sam_vit_h_4b8939.pth"
SIMTOKEN_CKPT = "/workspace/SimToken/checkpoints/simtoken_pretrained.pth"
DATA_DIR = "/workspace/SimToken/data"
VISION_TOWER = "/workspace/hf_models/clip-vit-large-patch14"

BATCH_SIZE = 4

# Sentinel token ids. IMAGE_TOKEN_INDEX and AUDIO_TOKEN_INDEX are the default
# placeholder ids used by tokenizer_image_audio_token() further down.
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
AUDIO_TOKEN_INDEX = -300

# ── Minimal args namespace ─────────────────────────────────────────────────────
# Stand-in for parsed command-line arguments, populated from the constants
# above plus fixed values for this smoke test.
args = Namespace(
    mllm=MLLM,
    vision_pretrained=SAM_CKPT,
    vision_tower=VISION_TOWER,
    data_dir=DATA_DIR,
    compress=True,
    start=0,
    batch_size=BATCH_SIZE,
    exist_loss_weight=1.0,
    frame_n=10,
    text_max_len=25,
    input_type="refer",
    ct_weight=0.0,  # disable contrastive for smoke test
    conv_template=1,
)

# ── Collate (mirrors train_ec_simtoken.py) ────────────────────────────────────
import re def tokenizer_image_audio_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, audio_token_index=AUDIO_TOKEN_INDEX, num_frames=10, return_tensors=None): prompt_chunks = re.split(r'(|