IrishCore-DiffMask-135M-v1-rc3 / training_sources.json
temsa's picture
Add files using upload-large-folder tool
85b0a00 verified
{
"release": "IrishCore-DiffMask-135M-v1-rc3",
"base_model": "OpenMed/OpenMed-PII-mLiteClinical-Base-135M-v1",
"public_references": {
"rc5": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc5",
"rc8": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc8"
},
"task": "Irish core PII detection and masking in English and Irish Gaelic",
"coverage": [
"PPSN",
"ACCOUNT_NUMBER",
"BANK_ROUTING_NUMBER",
"CREDIT_DEBIT_CARD",
"PASSPORT_NUMBER",
"POSTCODE",
"PHONE_NUMBER",
"EMAIL",
"FIRST_NAME",
"LAST_NAME",
"SWIFT_BIC"
],
"architecture": {
"family": "DistilBERT-size token-span extractor",
"diffusion_style_training": true,
"runtime_diffusion": false,
"scanner_free": true,
"validator_free": true,
"heads": [
"token_presence_head",
"typed_start_boundary_head",
"typed_end_boundary_head"
]
},
"training_data": {
"published": [
"temsa/OpenMed-Irish-CorePII-TrainMix-v1",
"temsa/OpenMed-Irish-PPSN-Eircode-Spec-v1",
"joelniklaus/mapa",
"gretelai/synthetic_pii_finance_multilingual"
],
"local_synthetic_hardening_sets": [
"irish_dllm_hardening_v1",
"dllm_gap_patch_v1",
"dllm_gap_patch_v2",
"dllm_gap_patch_v3",
"dllm_gap_patch_v4",
"dllm_uat_replay_v1",
"dllm_uat_patch_v3",
"irish_core_diffmask_v2_mix",
"irish_core_diffmask_v3_mix",
"irish_core_diffmask_v4_mix",
"irish_core_diffmask_v5_mix",
"irish_core_diffmask_focus_v3"
],
"selection_note": "The published rc3 checkpoint keeps the stronger focusv3 continuation and pairs it with a narrower decoder profile tuned for email continuation and q8 passport recovery."
},
"training_recipe": {
"noise_schedule_family": "linear masked denoising schedule",
"runtime_diffusion": false,
"train_time_diffusion_steps": 4,
"start_noise_fraction": 0.65,
"end_noise_fraction": 0.05,
"loss": "average BCE losses over token presence and typed boundaries across noised passes"
},
"release_selection": {
"published_checkpoint": "focusv3 continuation with rc3 decoder profile",
"selection_strategy": "publish the stronger focusv3 checkpoint while retuning the decoder thresholds for the public config",
"reason": "This preserved rc2-level Irish core quality while improving the UAT replay exact suite and multilingual PPSN coverage."
},
"benchmark_caveats": [
"user_raw_regression_cases_v1 is a legacy PPSN-only negative suite. In rc3 the counted false positive is 0871234567, now intentionally masked as PHONE_NUMBER.",
"The remaining UAT misses are the second phone in the long Client Identity Services sentence, R93 EC57 inside the longer centre block, and EPStamp4@enterprise.gov.ie."
],
"references": [
{
"title": "BERT",
"url": "https://arxiv.org/abs/1810.04805"
},
{
"title": "DistilBERT",
"url": "https://arxiv.org/abs/1910.01108"
},
{
"title": "Boundary Smoothing for Named Entity Recognition",
"url": "https://aclanthology.org/2022.acl-long.490/"
},
{
"title": "SpanNER: Named Entity Re-/Recognition as Span Prediction",
"url": "https://aclanthology.org/2021.acl-long.558/"
},
{
"title": "LLaDA 2.0: Scaling Up Diffusion Language Models to 100B",
"url": "https://arxiv.org/abs/2512.15745"
},
{
"title": "Scaling Diffusion Language Models via Adaptation from Autoregressive Models",
"url": "https://arxiv.org/abs/2410.17891"
}
]
}