insureos-models / data /gen_ner.py
piyushptiwari's picture
Upload folder using huggingface_hub
2cc32a5 verified
"""
InsureOS — Synthetic NER (Named Entity Recognition) Data Generator
Generates 8K token-labelled insurance text examples in IOB2 format for ModernBERT NER.
"""
import json
import os
import random
from datetime import timedelta
from faker import Faker
from tqdm import tqdm
from data.constants import (
UK_INSURERS, LLOYDS_SYNDICATES, MGAS, UK_REGIONS,
NER_ENTITY_TYPES, FCA_REFERENCES,
)
# Shared Faker instance configured for the en_GB locale; used by the helpers
# below for person names, company names and dates.
fake = Faker("en_GB")
# Seed both Faker and the stdlib RNG with a fixed value so that repeated runs
# draw the same pseudo-random sequence.
Faker.seed(46)
random.seed(46)
# Entity types with IOB2 labels:
# PERSON, ORG, INSURER, MGA, SYNDICATE, POLICY_NUMBER, CLAIM_NUMBER,
# MONEY, DATE, POSTCODE, LOB, REGULATION, PERIL, VEHICLE, ADDRESS
# NOTE(review): ADDRESS is listed here, but no generator or template slot for
# it is visible in this file — confirm whether it is still intended.
def _postcode() -> str:
    """Return a synthetic UK-style postcode string.

    The result has the shape ``<prefix><1-29> <1-9><letter><letter>``, where
    the prefix is drawn from a randomly chosen ``UK_REGIONS`` entry.

    NOTE(review): assumes every ``UK_REGIONS`` value is a non-empty sequence
    of postcode-area prefixes (e.g. ``"SW"``) — confirm against
    ``data.constants``.
    """
    # Letters allowed in the final two positions of the inward code.
    inward_letters = "ABCDEFGHJKLMNPRSTUVWXY"

    # Only the prefix lists are needed, so pick from the values directly
    # rather than unpacking (and discarding) the region name.
    region = random.choice(list(UK_REGIONS.values()))
    prefix = random.choice(region)

    # Draw order (district, sector, letter, letter) is kept stable so seeded
    # runs stay reproducible.
    district = random.randint(1, 29)
    sector = random.randint(1, 9)
    first_letter = random.choice(inward_letters)
    second_letter = random.choice(inward_letters)
    return f"{prefix}{district} {sector}{first_letter}{second_letter}"
def _policy_ref() -> str:
    """Return a synthetic policy reference: ``POL-`` followed by six digits."""
    serial = random.randint(100000, 999999)
    return "POL-" + str(serial)
def _claim_ref() -> str:
    """Return a synthetic claim reference: ``CLM-`` plus a number in 200000-999999."""
    serial = random.randint(200000, 999999)
    return "CLM-" + str(serial)
def _amount() -> str:
    """Return a random sterling amount such as ``"£4,250"``.

    One candidate is drawn from each of four magnitude bands (3, 4, 5 and 6
    digits) and one of the four is then picked uniformly, so each order of
    magnitude is equally likely. The value is rendered with a leading pound
    sign and comma thousands separators.
    """
    # All four bands are sampled before choosing, so every call consumes the
    # same number of RNG draws.
    val = random.choice([
        random.randint(100, 999),
        random.randint(1000, 9999),
        random.randint(10000, 99999),
        random.randint(100000, 999999),
    ])
    # The currency prefix must be the single character U+00A3 (POUND SIGN);
    # a mis-decoded two-character sequence would corrupt every MONEY token.
    return f"£{val:,}"
def _date_str() -> str:
    """Return a random date rendered in one of three textual formats.

    The date comes from ``fake.date_between`` with the window ``"-3y"`` to
    ``"+1y"``; the format is picked at random from ``DD/MM/YYYY``,
    ``DD Month YYYY`` and ISO ``YYYY-MM-DD``.
    """
    layouts = ("%d/%m/%Y", "%d %B %Y", "%Y-%m-%d")
    # Draw the date first, then the layout, to keep the RNG call order fixed.
    when = fake.date_between(start_date="-3y", end_date="+1y")
    layout = random.choice(layouts)
    return when.strftime(layout)
def _vehicle() -> str:
    """Return a random vehicle make/model name from a fixed list of ten."""
    models = (
        "Ford Fiesta",
        "VW Golf",
        "BMW 3 Series",
        "Toyota Yaris",
        "Kia Sportage",
        "Vauxhall Corsa",
        "Mercedes A-Class",
        "Tesla Model 3",
        "Nissan Qashqai",
        "Audi A3",
    )
    picked = random.choice(models)
    return picked
def _peril() -> str:
    """Return a random peril (cause of loss) phrase from a fixed list."""
    perils = (
        "escape of water",
        "storm damage",
        "theft",
        "fire",
        "flood",
        "accidental damage",
        "subsidence",
        "malicious damage",
        "collision",
        "burst pipe",
        "lightning strike",
        "impact damage",
        "vandalism",
    )
    picked = random.choice(perils)
    return picked
def _regulation() -> str:
    """Return a random regulatory reference.

    Candidates are every value of ``FCA_REFERENCES`` followed by a fixed set
    of additional rule and statute names defined here.
    """
    candidates = [
        *FCA_REFERENCES.values(),
        "ICOBS 8.1.1R",
        "DISP 1.3",
        "PRIN 2A",
        "Consumer Duty",
        "FCA PS21/5",
        "Equality Act 2010",
        "GDPR Article 6",
    ]
    return random.choice(candidates)
def _lob() -> str:
    """Return a random line-of-business name from a fixed list."""
    lines_of_business = (
        "motor insurance",
        "home insurance",
        "commercial combined",
        "employers' liability",
        "public liability",
        "professional indemnity",
        "property insurance",
        "cyber insurance",
        "D&O insurance",
    )
    picked = random.choice(lines_of_business)
    return picked
# ── Sentence templates with entity slots ──
# Raw sentence templates. Each ``{LABEL}`` marker is an entity slot that
# ``_build`` fills with a generated value and tags as B-LABEL / I-LABEL.
_TEMPLATE_TEXTS = (
    # 0 — claim notification
    (
        "{PERSON} reported a {PERIL} claim ({CLAIM_NUMBER}) on {DATE}. "
        "The loss occurred at {POSTCODE} and is covered under {LOB} policy {POLICY_NUMBER} "
        "with {INSURER}. Estimated value: {MONEY}."
    ),
    # 1 — subrogation
    (
        "{INSURER} is pursuing subrogation recovery of {MONEY} against {ORG} "
        "in respect of claim {CLAIM_NUMBER} dated {DATE}. "
        "The policyholder {PERSON} resides at {POSTCODE}."
    ),
    # 2 — Lloyd's placement
    (
        "{SYNDICATE} has written a {MONEY} line on the {LOB} facility "
        "brokered for {ORG} by {MGA}. Inception date {DATE}."
    ),
    # 3 — regulatory
    (
        "Under {REGULATION}, {INSURER} must provide {PERSON} with a final response "
        "to their {PERIL} claim ({CLAIM_NUMBER}) by {DATE}. "
        "The claim value is {MONEY}."
    ),
    # 4 — renewal
    (
        "{PERSON}'s {LOB} policy {POLICY_NUMBER} with {INSURER} is due for renewal on {DATE}. "
        "Current premium: {MONEY}. Property at {POSTCODE}."
    ),
    # 5 — vehicle claim
    (
        "{PERSON} was driving a {VEHICLE} when the {PERIL} incident occurred on {DATE} "
        "near {POSTCODE}. Claim {CLAIM_NUMBER} has been opened with {INSURER} for {MONEY}."
    ),
    # 6 — MGA bordereaux
    (
        "{MGA} submitted the {DATE} bordereaux to {SYNDICATE} showing {MONEY} GWP "
        "across {LOB} business. Contact: {PERSON}."
    ),
    # 7 — complaint
    (
        "{PERSON} has filed a complaint against {INSURER} regarding claim {CLAIM_NUMBER}. "
        "Per {REGULATION}, we must respond by {DATE}. Claim relates to {PERIL} at {POSTCODE}. "
        "Amount disputed: {MONEY}."
    ),
    # 8 — loss adjuster
    (
        "Loss adjuster {PERSON} from {ORG} inspected the {PERIL} damage at {POSTCODE} on {DATE}. "
        "They recommend a settlement of {MONEY} on claim {CLAIM_NUMBER} under {LOB} cover."
    ),
    # 9 — medical
    (
        "Dr {PERSON} examined the claimant in connection with claim {CLAIM_NUMBER} "
        "dated {DATE}. The {PERIL} incident at {POSTCODE} resulted in injuries. "
        "{INSURER} has reserved {MONEY} under the {LOB} policy."
    ),
    # 10 — endorsement
    (
        "Endorsement applied to {POLICY_NUMBER}: {PERSON} has changed vehicle to {VEHICLE}. "
        "Effective {DATE}. Additional premium: {MONEY}. Insurer: {INSURER}."
    ),
    # 11 — fraud referral
    (
        "Claim {CLAIM_NUMBER} by {PERSON} for {PERIL} ({MONEY}) has been referred to the fraud team. "
        "Policy {POLICY_NUMBER} with {INSURER} started on {DATE}. "
        "Property postcode: {POSTCODE}. Cf. {REGULATION}."
    ),
)

# One zero-argument generator per template. The default argument binds each
# text at definition time (avoiding late-binding of the loop variable), and
# ``_build`` is only looked up when a generator is actually called.
TEMPLATES = [lambda text=text: _build(text) for text in _TEMPLATE_TEXTS]
def _build(template: str) -> tuple[list[str], list[str]]:
    """Fill the entity slots of *template* and return ``(tokens, iob_tags)``.

    A fresh value is generated for every supported entity type on each call,
    whether or not the template uses it. The template is then consumed left
    to right: text outside slots is whitespace-split and tagged ``"O"``; each
    ``{LABEL}`` slot is replaced by its value, whitespace-split, with the
    first token tagged ``B-LABEL`` and the rest ``I-LABEL``.

    Returns:
        Two parallel lists of equal length: the tokens and their IOB2 tags.
    """
    # Values are created in this fixed order so seeded runs are reproducible.
    values = {
        "PERSON": fake.name(),
        "ORG": fake.company(),
        "INSURER": random.choice(UK_INSURERS),
        "MGA": random.choice(MGAS),
        "SYNDICATE": random.choice(LLOYDS_SYNDICATES),
        "POLICY_NUMBER": _policy_ref(),
        "CLAIM_NUMBER": _claim_ref(),
        "MONEY": _amount(),
        "DATE": _date_str(),
        "POSTCODE": _postcode(),
        "LOB": _lob(),
        "REGULATION": _regulation(),
        "PERIL": _peril(),
        "VEHICLE": _vehicle(),
    }
    # Map each literal slot marker, e.g. "{PERSON}", back to its label.
    slots = {"{" + label + "}": label for label in values}

    tokens: list[str] = []
    tags: list[str] = []
    rest = template
    while rest:
        # Positions of every slot marker still present in the unread text.
        found = [(at, slot) for slot in slots if (at := rest.find(slot)) != -1]
        if not found:
            # No slots left: everything remaining is plain text.
            tail = rest.split()
            tokens.extend(tail)
            tags.extend("O" for _ in tail)
            break

        # Earliest marker wins; two different markers cannot share a start.
        start, slot = min(found)
        label = slots[slot]

        # Plain text preceding the slot.
        plain = rest[:start].split()
        tokens.extend(plain)
        tags.extend("O" for _ in plain)

        # The entity itself: B- on the first token, I- on the remainder.
        pieces = values[label].split()
        tokens.extend(pieces)
        tags.extend(
            f"B-{label}" if k == 0 else f"I-{label}" for k in range(len(pieces))
        )

        rest = rest[start + len(slot):]
    return tokens, tags
def generate_ner_dataset(n: int = 8000, output_path: str = "data/output/insurance_ner_8k.jsonl") -> str:
    """Generate *n* NER examples in token-level IOB2 format and write them as JSONL.

    Each example is produced by a randomly chosen entry of ``TEMPLATES`` and
    stored as one JSON object per line with the keys ``tokens``, ``ner_tags``
    and ``text`` (the tokens joined by single spaces). The records are
    shuffled before writing, and a per-tag count summary is printed.

    Args:
        n: Number of examples to generate.
        output_path: Destination ``.jsonl`` file. Its parent directory is
            created if it does not exist.

    Returns:
        ``output_path``, unchanged.
    """
    # os.path.dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    records = []
    for _ in tqdm(range(n), desc="NER examples"):
        gen_fn = random.choice(TEMPLATES)
        tokens, tags = gen_fn()
        records.append({
            "tokens": tokens,
            "ner_tags": tags,
            "text": " ".join(tokens),
        })
    random.shuffle(records)

    # ensure_ascii=False emits raw non-ASCII characters (e.g. the pound sign),
    # so the encoding is pinned rather than left to the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)

    # Stats: count every non-"O" tag in a single pass over the records.
    tag_counts: dict[str, int] = {}
    for rec in records:
        for tag in rec["ner_tags"]:
            if tag != "O":
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

    print(f"\n✓ Generated {len(records)} NER examples → {output_path}")
    print(f" Entity types found: {len(tag_counts)}")
    for tag in sorted(tag_counts):
        print(f" {tag}: {tag_counts[tag]}")
    return output_path
# Script entry point: when run directly (not imported), generate the dataset
# using generate_ner_dataset's default size and output path.
if __name__ == "__main__":
    generate_ner_dataset()