training_sources.json · temsa/IrishCore-DiffMask-135M-v1-rc3 at main

IrishCore-DiffMask-135M-v1-rc3 / training_sources.json

Add files using upload-large-folder tool

85b0a00 verified 27 days ago

3.59 kB

	{
	"release": "IrishCore-DiffMask-135M-v1-rc3",
	"base_model": "OpenMed/OpenMed-PII-mLiteClinical-Base-135M-v1",
	"public_references": {
	"rc5": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc5",
	"rc8": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc8"
	},
	"task": "Irish core PII detection and masking in English and Irish Gaelic",
	"coverage": [
	"PPSN",
	"ACCOUNT_NUMBER",
	"BANK_ROUTING_NUMBER",
	"CREDIT_DEBIT_CARD",
	"PASSPORT_NUMBER",
	"POSTCODE",
	"PHONE_NUMBER",
	"EMAIL",
	"FIRST_NAME",
	"LAST_NAME",
	"SWIFT_BIC"
	],
	"architecture": {
	"family": "DistilBERT-size token-span extractor",
	"diffusion_style_training": true,
	"runtime_diffusion": false,
	"scanner_free": true,
	"validator_free": true,
	"heads": [
	"token_presence_head",
	"typed_start_boundary_head",
	"typed_end_boundary_head"
	]
	},
	"training_data": {
	"published": [
	"temsa/OpenMed-Irish-CorePII-TrainMix-v1",
	"temsa/OpenMed-Irish-PPSN-Eircode-Spec-v1",
	"joelniklaus/mapa",
	"gretelai/synthetic_pii_finance_multilingual"
	],
	"local_synthetic_hardening_sets": [
	"irish_dllm_hardening_v1",
	"dllm_gap_patch_v1",
	"dllm_gap_patch_v2",
	"dllm_gap_patch_v3",
	"dllm_gap_patch_v4",
	"dllm_uat_replay_v1",
	"dllm_uat_patch_v3",
	"irish_core_diffmask_v2_mix",
	"irish_core_diffmask_v3_mix",
	"irish_core_diffmask_v4_mix",
	"irish_core_diffmask_v5_mix",
	"irish_core_diffmask_focus_v3"
	],
	"selection_note": "The published rc3 checkpoint keeps the stronger focusv3 continuation and then publishes a narrower decoder profile tuned for email continuation and q8 passport recovery."
	},
	"training_recipe": {
	"noise_schedule_family": "linear masked denoising schedule",
	"runtime_diffusion": false,
	"train_time_diffusion_steps": 4,
	"start_noise_fraction": 0.65,
	"end_noise_fraction": 0.05,
	"loss": "average BCE losses over token presence and typed boundaries across noised passes"
	},
	"release_selection": {
	"published_checkpoint": "focusv3 continuation with rc3 decoder profile",
	"selection_strategy": "publish the stronger focusv3 checkpoint while retuning the decoder thresholds for the public config",
	"reason": "This preserved rc2-level Irish core quality while improving the UAT replay exact suite and multilingual PPSN coverage."
	},
	"benchmark_caveats": [
	"user_raw_regression_cases_v1 is a legacy PPSN-only negative suite. In rc3 the counted false positive is 0871234567, now intentionally masked as PHONE_NUMBER.",
	"The remaining UAT misses are the second phone in the long Client Identity Services sentence, R93 EC57 inside the longer centre block, and EPStamp4@enterprise.gov.ie."
	],
	"references": [
	{
	"title": "BERT",
	"url": "https://arxiv.org/abs/1810.04805"
	},
	{
	"title": "DistilBERT",
	"url": "https://arxiv.org/abs/1910.01108"
	},
	{
	"title": "Boundary Smoothing for Named Entity Recognition",
	"url": "https://aclanthology.org/2022.acl-long.490/"
	},
	{
	"title": "SpanNER: Named Entity Re-/Recognition as Span Prediction",
	"url": "https://aclanthology.org/2021.acl-long.558/"
	},
	{
	"title": "LLaDA 2.0: Scaling Up Diffusion Language Models to 100B",
	"url": "https://arxiv.org/abs/2512.15745"
	},
	{
	"title": "Scaling Diffusion Language Models via Adaptation from Autoregressive Models",
	"url": "https://arxiv.org/abs/2410.17891"
	}
	]
	}