{ "release": "IrishCore-DiffMask-135M-v1-rc3", "base_model": "OpenMed/OpenMed-PII-mLiteClinical-Base-135M-v1", "public_references": { "rc5": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc5", "rc8": "temsa/OpenMed-mLiteClinical-IrishCorePII-135M-v2-rc8" }, "task": "Irish core PII detection and masking in English and Irish Gaelic", "coverage": [ "PPSN", "ACCOUNT_NUMBER", "BANK_ROUTING_NUMBER", "CREDIT_DEBIT_CARD", "PASSPORT_NUMBER", "POSTCODE", "PHONE_NUMBER", "EMAIL", "FIRST_NAME", "LAST_NAME", "SWIFT_BIC" ], "architecture": { "family": "DistilBERT-size token-span extractor", "diffusion_style_training": true, "runtime_diffusion": false, "scanner_free": true, "validator_free": true, "heads": [ "token_presence_head", "typed_start_boundary_head", "typed_end_boundary_head" ] }, "training_data": { "published": [ "temsa/OpenMed-Irish-CorePII-TrainMix-v1", "temsa/OpenMed-Irish-PPSN-Eircode-Spec-v1", "joelniklaus/mapa", "gretelai/synthetic_pii_finance_multilingual" ], "local_synthetic_hardening_sets": [ "irish_dllm_hardening_v1", "dllm_gap_patch_v1", "dllm_gap_patch_v2", "dllm_gap_patch_v3", "dllm_gap_patch_v4", "dllm_uat_replay_v1", "dllm_uat_patch_v3", "irish_core_diffmask_v2_mix", "irish_core_diffmask_v3_mix", "irish_core_diffmask_v4_mix", "irish_core_diffmask_v5_mix", "irish_core_diffmask_focus_v3" ], "selection_note": "The published rc3 checkpoint keeps the stronger focusv3 continuation and then publishes a narrower decoder profile tuned for email continuation and q8 passport recovery." 
}, "training_recipe": { "noise_schedule_family": "linear masked denoising schedule", "runtime_diffusion": false, "train_time_diffusion_steps": 4, "start_noise_fraction": 0.65, "end_noise_fraction": 0.05, "loss": "average BCE losses over token presence and typed boundaries across noised passes" }, "release_selection": { "published_checkpoint": "focusv3 continuation with rc3 decoder profile", "selection_strategy": "publish the stronger focusv3 checkpoint while retuning the decoder thresholds for the public config", "reason": "This preserved rc2-level Irish core quality while improving the UAT replay exact suite and multilingual PPSN coverage." }, "benchmark_caveats": [ "user_raw_regression_cases_v1 is a legacy PPSN-only negative suite. In rc3 the counted false positive is 0871234567, now intentionally masked as PHONE_NUMBER.", "The remaining UAT misses are the second phone in the long Client Identity Services sentence, R93 EC57 inside the longer centre block, and EPStamp4@enterprise.gov.ie." ], "references": [ { "title": "BERT", "url": "https://arxiv.org/abs/1810.04805" }, { "title": "DistilBERT", "url": "https://arxiv.org/abs/1910.01108" }, { "title": "Boundary Smoothing for Named Entity Recognition", "url": "https://aclanthology.org/2022.acl-long.490/" }, { "title": "SpanNER: Named Entity Re-/Recognition as Span Prediction", "url": "https://aclanthology.org/2021.acl-long.558/" }, { "title": "LLaDA 2.0: Scaling Up Diffusion Language Models to 100B", "url": "https://arxiv.org/abs/2512.15745" }, { "title": "Scaling Diffusion Language Models via Adaptation from Autoregressive Models", "url": "https://arxiv.org/abs/2410.17891" } ] }