training_sources.json · temsa/IrishCore-DiffMask-135M-v1-rc2 at main

IrishCore-DiffMask-135M-v1-rc2 / training_sources.json

Add files using upload-large-folder tool

ed10267 verified 6 days ago

3.58 kB

	{
	"release": "IrishCore-DiffMask-135M-v1-rc2",
	"base_model": "OpenMed/OpenMed-PII-mLiteClinical-Base-135M-v1",
	"public_references": {
	"rc5": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc5",
	"rc8": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc8"
	},
	"task": "Irish core PII detection and masking in English and Irish Gaelic",
	"coverage": [
	"PPSN",
	"ACCOUNT_NUMBER",
	"BANK_ROUTING_NUMBER",
	"CREDIT_DEBIT_CARD",
	"PASSPORT_NUMBER",
	"POSTCODE",
	"PHONE_NUMBER",
	"EMAIL",
	"FIRST_NAME",
	"LAST_NAME",
	"SWIFT_BIC"
	],
	"architecture": {
	"family": "DistilBERT-size token-span extractor",
	"diffusion_style_training": true,
	"runtime_diffusion": false,
	"scanner_free": true,
	"validator_free": true,
	"heads": [
	"token_presence_head",
	"typed_start_boundary_head",
	"typed_end_boundary_head"
	]
	},
	"training_data": {
	"published": [
	"temsa/OpenMed-Irish-CorePII-TrainMix-v1",
	"temsa/OpenMed-Irish-PPSN-Eircode-Spec-v1",
	"joelniklaus/mapa",
	"gretelai/synthetic_pii_finance_multilingual"
	],
	"local_synthetic_hardening_sets": [
	"irish_dllm_hardening_v1",
	"dllm_gap_patch_v1",
	"dllm_gap_patch_v2",
	"dllm_gap_patch_v3",
	"dllm_gap_patch_v4",
	"dllm_uat_replay_v1",
	"dllm_uat_patch_v2",
	"irish_core_diffmask_v2_mix",
	"irish_core_diffmask_v3_mix",
	"irish_core_diffmask_v4_mix",
	"irish_core_diffmask_v5_mix",
	"irish_core_diffmask_focus_v1"
	],
	"selection_note": "The published checkpoint was selected from multiple continuation and interpolation runs to balance Irish core, multilingual PPSN, hardening performance, and the UAT replay exact suite after fixing label contamination in the v5 mix."
	},
	"training_recipe": {
	"noise_schedule_family": "linear masked denoising schedule",
	"runtime_diffusion": false,
	"train_time_diffusion_steps": 4,
	"start_noise_fraction": 0.65,
	"end_noise_fraction": 0.05,
	"loss": "average BCE losses over token presence and typed boundaries across noised passes"
	},
	"release_selection": {
	"published_checkpoint": "selected interpolation blend used for rc2",
	"selection_strategy": "interpolation blend between the stronger broad-coverage DiffMask checkpoint and the cleaned v5 continuation",
	"reason": "This blend gave the best overall deployment-path tradeoff once the new UAT replay exact suite was added."
	},
	"known_remaining_misses": [
	"Second phone number inside the long Client Identity Services sentence: 071 967 2616",
	"Postcode inside the longer allocation-centre block: R93 EC57",
	"Email mailbox form: EPStamp4@enterprise.gov.ie",
	"One D02 XY45 address form from the UAT replay suite"
	],
	"references": [
	{
	"title": "BERT",
	"url": "https://arxiv.org/abs/1810.04805"
	},
	{
	"title": "DistilBERT",
	"url": "https://arxiv.org/abs/1910.01108"
	},
	{
	"title": "Boundary Smoothing for Named Entity Recognition",
	"url": "https://aclanthology.org/2022.acl-long.490/"
	},
	{
	"title": "SpanNER: Named Entity Re-/Recognition as Span Prediction",
	"url": "https://aclanthology.org/2021.acl-long.558/"
	},
	{
	"title": "LLaDA 2.0: Scaling Up Diffusion Language Models to 100B",
	"url": "https://arxiv.org/abs/2512.15745"
	},
	{
	"title": "Scaling Diffusion Language Models via Adaptation from Autoregressive Models",
	"url": "https://arxiv.org/abs/2410.17891"
	}
	]
	}