"""
InsureOS — Synthetic NER (Named Entity Recognition) Data Generator
Generates 8K token-labelled insurance text examples in IOB2 format for ModernBERT NER.
"""
|
|
| import json |
| import os |
| import random |
| from datetime import timedelta |
|
|
| from faker import Faker |
| from tqdm import tqdm |
|
|
| from data.constants import ( |
| UK_INSURERS, LLOYDS_SYNDICATES, MGAS, UK_REGIONS, |
| NER_ENTITY_TYPES, FCA_REFERENCES, |
| ) |
|
|
# One fixed seed is applied to both Faker and the stdlib ``random`` module so
# that each generator starts from the same known state on every run.
_SEED = 46

fake = Faker("en_GB")
Faker.seed(_SEED)
random.seed(_SEED)
|
|
| |
| |
| |
|
|
|
|
def _postcode() -> str:
    """Return a synthetic postcode-shaped string.

    One entry of ``UK_REGIONS`` is picked at random and one prefix is drawn
    from it.  The prefix is followed by a number in 1-29, a space, a digit in
    1-9 and two letters from a fixed alphabet.
    """
    # NOTE(review): assumes every value of UK_REGIONS is a sequence of prefix
    # strings -- confirm against data.constants.
    letters = "ABCDEFGHJKLMNPRSTUVWXY"

    # The random draws are made in the order the pieces appear in the result.
    prefixes = random.choice(list(UK_REGIONS.values()))
    area = random.choice(prefixes)
    district = random.randint(1, 29)
    sector = random.randint(1, 9)
    unit = random.choice(letters) + random.choice(letters)

    return f"{area}{district} {sector}{unit}"
|
|
|
|
def _policy_ref() -> str:
    """Return a synthetic policy reference: ``POL-`` plus a random six-digit number."""
    number = random.randint(100000, 999999)
    return "POL-" + str(number)
|
|
|
|
def _claim_ref() -> str:
    """Return a synthetic claim reference: ``CLM-`` plus a number in 200000-999999."""
    number = random.randint(200000, 999999)
    return "CLM-" + str(number)
|
|
|
|
def _amount() -> str:
    """Return a synthetic sterling amount such as ``£12,345``.

    One candidate is drawn from each of four magnitude bands (100-999,
    1,000-9,999, 10,000-99,999 and 100,000-999,999) and one of the four is
    then picked at random.  The result is formatted with a leading pound sign
    and comma thousands separators.
    """
    # All four candidates are drawn before choosing, so the number of RNG
    # calls per invocation is unchanged and seeded runs keep their sequence.
    candidates = [
        random.randint(100, 999),
        random.randint(1000, 9999),
        random.randint(10000, 99999),
        random.randint(100000, 999999),
    ]
    val = random.choice(candidates)

    # U+00A3 POUND SIGN, written as an escape so the symbol survives any
    # re-encoding of this source file (the literal had been mangled into a
    # two-character sequence).
    return f"\u00a3{val:,}"
|
|
|
|
def _date_str() -> str:
    """Return a random date rendered in one of three layouts.

    The date comes from ``fake.date_between`` with bounds ``"-3y"`` and
    ``"+1y"``.  The layout is chosen at random from ``DD/MM/YYYY``,
    ``DD Month YYYY`` and ``YYYY-MM-DD``.
    """
    layouts = ("%d/%m/%Y", "%d %B %Y", "%Y-%m-%d")
    when = fake.date_between(start_date="-3y", end_date="+1y")
    layout = random.choice(layouts)
    return when.strftime(layout)
|
|
|
|
def _vehicle() -> str:
    """Return a random make-and-model string from a fixed list of ten cars."""
    return random.choice((
        "Ford Fiesta",
        "VW Golf",
        "BMW 3 Series",
        "Toyota Yaris",
        "Kia Sportage",
        "Vauxhall Corsa",
        "Mercedes A-Class",
        "Tesla Model 3",
        "Nissan Qashqai",
        "Audi A3",
    ))
|
|
|
|
def _peril() -> str:
    """Return a random peril description from a fixed list of thirteen."""
    perils = (
        "escape of water",
        "storm damage",
        "theft",
        "fire",
        "flood",
        "accidental damage",
        "subsidence",
        "malicious damage",
        "collision",
        "burst pipe",
        "lightning strike",
        "impact damage",
        "vandalism",
    )
    return random.choice(perils)
|
|
|
|
def _regulation() -> str:
    """Return a random regulation reference.

    The pool is every value of ``FCA_REFERENCES`` followed by seven extra
    hard-coded references.
    """
    # NOTE(review): assumes the values of FCA_REFERENCES are strings --
    # confirm against data.constants.
    pool = [
        *FCA_REFERENCES.values(),
        "ICOBS 8.1.1R",
        "DISP 1.3",
        "PRIN 2A",
        "Consumer Duty",
        "FCA PS21/5",
        "Equality Act 2010",
        "GDPR Article 6",
    ]
    return random.choice(pool)
|
|
|
|
def _lob() -> str:
    """Return a random line-of-business name from a fixed list of nine."""
    lines_of_business = (
        "motor insurance",
        "home insurance",
        "commercial combined",
        "employers' liability",
        "public liability",
        "professional indemnity",
        "property insurance",
        "cyber insurance",
        "D&O insurance",
    )
    return random.choice(lines_of_business)
|
|
|
|
| |
|
|
def _renderer(text: str):
    """Return a zero-argument callable that renders *text* through ``_build``."""
    return lambda: _build(text)


# Raw template texts.  Each ``{NAME}`` placeholder is replaced by ``_build``
# when the corresponding callable in ``TEMPLATES`` is invoked.
_TEMPLATE_TEXTS = (
    # Claim notification
    (
        "{PERSON} reported a {PERIL} claim ({CLAIM_NUMBER}) on {DATE}. "
        "The loss occurred at {POSTCODE} and is covered under {LOB} policy {POLICY_NUMBER} "
        "with {INSURER}. Estimated value: {MONEY}."
    ),
    # Subrogation recovery
    (
        "{INSURER} is pursuing subrogation recovery of {MONEY} against {ORG} "
        "in respect of claim {CLAIM_NUMBER} dated {DATE}. "
        "The policyholder {PERSON} resides at {POSTCODE}."
    ),
    # Syndicate line on a facility
    (
        "{SYNDICATE} has written a {MONEY} line on the {LOB} facility "
        "brokered for {ORG} by {MGA}. Inception date {DATE}."
    ),
    # Regulatory final-response deadline
    (
        "Under {REGULATION}, {INSURER} must provide {PERSON} with a final response "
        "to their {PERIL} claim ({CLAIM_NUMBER}) by {DATE}. "
        "The claim value is {MONEY}."
    ),
    # Renewal notice
    (
        "{PERSON}'s {LOB} policy {POLICY_NUMBER} with {INSURER} is due for renewal on {DATE}. "
        "Current premium: {MONEY}. Property at {POSTCODE}."
    ),
    # Motor incident
    (
        "{PERSON} was driving a {VEHICLE} when the {PERIL} incident occurred on {DATE} "
        "near {POSTCODE}. Claim {CLAIM_NUMBER} has been opened with {INSURER} for {MONEY}."
    ),
    # Bordereaux submission
    (
        "{MGA} submitted the {DATE} bordereaux to {SYNDICATE} showing {MONEY} GWP "
        "across {LOB} business. Contact: {PERSON}."
    ),
    # Complaint
    (
        "{PERSON} has filed a complaint against {INSURER} regarding claim {CLAIM_NUMBER}. "
        "Per {REGULATION}, we must respond by {DATE}. Claim relates to {PERIL} at {POSTCODE}. "
        "Amount disputed: {MONEY}."
    ),
    # Loss adjuster inspection
    (
        "Loss adjuster {PERSON} from {ORG} inspected the {PERIL} damage at {POSTCODE} on {DATE}. "
        "They recommend a settlement of {MONEY} on claim {CLAIM_NUMBER} under {LOB} cover."
    ),
    # Medical examination
    (
        "Dr {PERSON} examined the claimant in connection with claim {CLAIM_NUMBER} "
        "dated {DATE}. The {PERIL} incident at {POSTCODE} resulted in injuries. "
        "{INSURER} has reserved {MONEY} under the {LOB} policy."
    ),
    # Policy endorsement
    (
        "Endorsement applied to {POLICY_NUMBER}: {PERSON} has changed vehicle to {VEHICLE}. "
        "Effective {DATE}. Additional premium: {MONEY}. Insurer: {INSURER}."
    ),
    # Fraud referral
    (
        "Claim {CLAIM_NUMBER} by {PERSON} for {PERIL} ({MONEY}) has been referred to the fraud team. "
        "Policy {POLICY_NUMBER} with {INSURER} started on {DATE}. "
        "Property postcode: {POSTCODE}. Cf. {REGULATION}."
    ),
)

# One zero-argument generator per template text, in the same order.
TEMPLATES = [_renderer(text) for text in _TEMPLATE_TEXTS]
|
|
|
|
def _build(template: str) -> tuple[list[str], list[str]]:
    """Render *template* and label the result token by token.

    A fresh value is sampled for every supported entity type, then the
    template is scanned left to right.  Text between placeholders is split on
    whitespace and tagged ``O``.  Each ``{LABEL}`` placeholder is replaced by
    its sampled value: the first whitespace-separated token is tagged
    ``B-LABEL`` and the remaining ones ``I-LABEL``.

    Returns:
        ``(tokens, tags)`` -- two lists of equal length.
    """
    # Every value is sampled on every call, in this order, whether or not the
    # template actually uses it.
    sampled = {
        "PERSON": fake.name(),
        "ORG": fake.company(),
        "INSURER": random.choice(UK_INSURERS),
        "MGA": random.choice(MGAS),
        "SYNDICATE": random.choice(LLOYDS_SYNDICATES),
        "POLICY_NUMBER": _policy_ref(),
        "CLAIM_NUMBER": _claim_ref(),
        "MONEY": _amount(),
        "DATE": _date_str(),
        "POSTCODE": _postcode(),
        "LOB": _lob(),
        "REGULATION": _regulation(),
        "PERIL": _peril(),
        "VEHICLE": _vehicle(),
    }
    placeholder_of = {label: "{" + label + "}" for label in sampled}

    words: list[str] = []
    labels: list[str] = []

    def add_plain(chunk: str) -> None:
        # Literal template text: whitespace-split, every piece tagged "O".
        pieces = chunk.split()
        words.extend(pieces)
        labels.extend("O" for _ in pieces)

    rest = template
    while rest:
        # (offset, label) for each placeholder still present in the text that
        # has not been scanned yet.
        hits = ((rest.find(mark), label) for label, mark in placeholder_of.items())
        found = [hit for hit in hits if hit[0] != -1]

        if not found:
            # No placeholders left: the tail is plain text.
            add_plain(rest)
            break

        # Two different placeholders cannot start at the same offset, so the
        # minimum is decided by offset alone.
        offset, label = min(found)

        add_plain(rest[:offset])

        value_pieces = sampled[label].split()
        words.extend(value_pieces)
        labels.extend(
            ("B-" if index == 0 else "I-") + label
            for index, _ in enumerate(value_pieces)
        )

        # Continue scanning just past the placeholder that was consumed.
        rest = rest[offset + len(placeholder_of[label]):]

    return words, labels
|
|
|
|
def generate_ner_dataset(n: int = 8000, output_path: str = "data/output/insurance_ner_8k.jsonl") -> str:
    """Generate *n* NER examples and write them as JSON Lines.

    Each record holds ``tokens``, ``ner_tags`` (one tag per token) and
    ``text`` (the tokens joined by single spaces).  Records are shuffled
    before being written, and a per-tag count summary is printed afterwards.

    Args:
        n: Number of examples to generate.
        output_path: Destination file.  Missing parent directories are
            created; a bare filename writes into the current directory.

    Returns:
        ``output_path``, unchanged.
    """
    # Local import: only needed for the summary below.
    from collections import Counter

    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
    # so only create a directory when the path actually names one.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    records = []
    for _ in tqdm(range(n), desc="NER examples"):
        gen_fn = random.choice(TEMPLATES)
        tokens, tags = gen_fn()
        records.append({
            "tokens": tokens,
            "ner_tags": tags,
            "text": " ".join(tokens),
        })

    random.shuffle(records)

    # Records are dumped with ensure_ascii=False, so non-ASCII characters are
    # written verbatim.  Open the file as UTF-8 explicitly instead of relying
    # on the platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)

    # Single pass over all tags instead of rescanning every record per tag.
    tag_counts = Counter(
        tag for rec in records for tag in rec["ner_tags"] if tag != "O"
    )

    # The tick (U+2713) and arrow (U+2192) are written as escapes because the
    # literal glyphs had been mis-encoded in this message.
    print(f"\n\u2713 Generated {len(records)} NER examples \u2192 {output_path}")
    print(f" Entity types found: {len(tag_counts)}")
    for t in sorted(tag_counts):
        print(f" {t}: {tag_counts[t]}")

    return output_path
|
|
|
|
# Script entry point: when run directly, generate the dataset with the
# default arguments of generate_ner_dataset().
if __name__ == "__main__":
    generate_ner_dataset()
|
|