AniFileBERT / reports /training_lineage.json
ModerRAS's picture
Train virtual-shard anime parser
359ff82
{
"published_checkpoint": "repository_root",
"summary": "The published checkpoint was produced in two stages: a full 10-epoch CUDA fine-tune over Rust-generated virtual BIO shards, followed by a light thin-runtime hard-case focus fine-tune.",
"summary_zh": "当前发布 checkpoint 是两阶段产物:先用 Rust 生成的虚拟 BIO shard 做完整 10 epoch CUDA 微调,再做轻量薄层运行时困难样本微调。",
"stages": [
{
"name": "dmhy-char-virtual-sps32-10epoch-lr1e5",
"type": "full_dataset_finetune_with_rust_virtual_shards",
"machine": "adqew@192.168.63.157",
"data_file": "datasets/AnimeName/dmhy_weak_char.jsonl",
"virtual_source_file": "data/generated/virtual_source_train_seed105.jsonl",
"virtual_dataset_dir": "data/generated/virtual_char_sps32_seed105",
"tokenizer_variant": "char",
"vocab_file": "datasets/AnimeName/vocab.char.json",
"vocab_size": 6199,
"max_seq_length": 128,
"source_rows": 619361,
"special_fixture_rows": 935,
"virtual_train_samples": 20439848,
"eval_samples": 12641,
"epochs": 10.0,
"optimizer_steps": 114070,
"batch_size": 1792,
"learning_rate": 0.00001,
"warmup_steps": 2000,
"seed": 105,
"device": "cuda",
"mixed_precision": "bf16",
"tf32": true,
"dataloader_num_workers": 4,
"virtual_generation": {
"samples_per_source": 32,
"separator_mode": "per-gap",
"bracket_mode": "per-part",
"include_original": true,
"include_special_fixtures": true,
"shard_size": 25000,
"shards": 881,
"elapsed_seconds": 31.55
},
"eval_f1": 0.9902097153862615,
"eval_accuracy": 0.9978861640315251,
"fixed_regression_model_only": "22/26",
"fixed_regression_normalized_only": "23/26",
"heldout_model_only": "1994/2048",
"heldout_normalized_only": "2008/2048",
"train_runtime_seconds": 21181.32,
"train_tokens_per_second": 1236288.9470061918,
"perf_gpu_util_avg": 96.14912280701755,
"perf_gpu_util_max": 100.0,
"role": "Base checkpoint for the final light hard-case focus stage. This is the full >100k-step virtual-shard training run."
},
{
"name": "dmhy-char-virtual-sps32-10epoch-lightfocus",
"type": "light_hard_case_focus_finetune",
"machine": "adqew@192.168.63.157",
"data_file": "data/generated/focus_after_virtual_sps32_char.jsonl",
"tokenizer_variant": "char",
"vocab_file": "datasets/AnimeName/vocab.char.json",
"vocab_size": 6199,
"max_seq_length": 128,
"focus_source_rows": 140660,
"train_samples": 133627,
"eval_samples": 7033,
"epochs": 1.0,
"batch_size": 1792,
"learning_rate": 0.000002,
"warmup_steps": 20,
"seed": 208,
"device": "cuda",
"mixed_precision": "bf16",
"tf32": true,
"eval_f1": 0.9843520993189067,
"eval_accuracy": 0.9961191832100342,
"fixed_regression_model_only": "24/26",
"fixed_regression_normalized_only": "26/26",
"heldout_model_only": "1962/2048",
"heldout_normalized_only": "1988/2048",
"perf_tokens_per_second_avg": 997645.0850819343,
"perf_gpu_util_avg": 100.0,
"role": "Published repository-root checkpoint. The default thin runtime also includes narrow postprocessing for bracketed search notes and release-promo title prefixes."
}
]
}