Upload folder using huggingface_hub

2cc32a5 verified 9 days ago

8.48 kB

	"""
	InsureOS — Synthetic NER (Named Entity Recognition) Data Generator
	Generates 8K token-labelled insurance text examples in IOB2 format for ModernBERT NER.
	"""

	import json
	import os
	import random
	from datetime import timedelta

	from faker import Faker
	from tqdm import tqdm

	from data.constants import (
	UK_INSURERS, LLOYDS_SYNDICATES, MGAS, UK_REGIONS,
	NER_ENTITY_TYPES, FCA_REFERENCES,
	)

	# Shared en_GB Faker instance; the helpers below use it for names, companies and dates.
	fake = Faker("en_GB")
	# Seed both Faker and the stdlib ``random`` module with the same fixed value.
	Faker.seed(46)
	random.seed(46)

	# Entity types with IOB2 labels (emitted as B-<TYPE> / I-<TYPE>, with "O" for other tokens):
	# PERSON, ORG, INSURER, MGA, SYNDICATE, POLICY_NUMBER, CLAIM_NUMBER,
	# MONEY, DATE, POSTCODE, LOB, REGULATION, PERIL, VEHICLE, ADDRESS
	# NOTE(review): ADDRESS is listed, but none of the visible generators or
	# templates fills an ADDRESS slot -- confirm whether it is still intended.


	def _postcode() -> str:
	    """Return a random postcode-shaped string: ``<prefix><1-29> <1-9><two letters>``.

	    One ``UK_REGIONS`` entry is picked at random and one element of its value
	    becomes the prefix (presumably each value is a list of postcode area
	    codes -- verify against ``data.constants``). The two trailing letters are
	    drawn from a fixed alphabet that omits I, O, Q and Z.
	    """
	    unit_letters = "ABCDEFGHJKLMNPRSTUVWXY"
	    # The region name itself is not needed, only its prefix collection.
	    _, area_codes = random.choice(list(UK_REGIONS.items()))
	    area = random.choice(area_codes)
	    district = random.randint(1, 29)
	    sector = random.randint(1, 9)
	    first_letter = random.choice(unit_letters)
	    second_letter = random.choice(unit_letters)
	    return f"{area}{district} {sector}{first_letter}{second_letter}"


	def _policy_ref() -> str:
	    """Return a synthetic policy reference: ``POL-`` followed by a six-digit number."""
	    serial = random.randint(100000, 999999)
	    return "POL-" + str(serial)


	def _claim_ref() -> str:
	    """Return a synthetic claim reference: ``CLM-`` followed by a number in 200000-999999."""
	    serial = random.randint(200000, 999999)
	    return "CLM-" + str(serial)


	def _amount() -> str:
	    """Return a random sterling amount such as ``£12,345``.

	    One candidate is drawn from each of four magnitude bands (3 to 6 digits)
	    and one of the four is then chosen, so every magnitude is equally likely.
	    """
	    magnitude_bands = (
	        (100, 999),
	        (1000, 9999),
	        (10000, 99999),
	        (100000, 999999),
	    )
	    # All four candidates are drawn, in band order, before the pick is made.
	    candidates = [random.randint(low, high) for low, high in magnitude_bands]
	    picked = random.choice(candidates)
	    return "£" + format(picked, ",")


	def _date_str() -> str:
	    """Return a random date between three years ago and one year ahead, as text.

	    The date comes from the shared Faker instance and is rendered in one of
	    three layouts: ``DD/MM/YYYY``, ``DD Month YYYY`` or ``YYYY-MM-DD``.
	    """
	    layouts = ["%d/%m/%Y", "%d %B %Y", "%Y-%m-%d"]
	    # The date is drawn first, then the layout.
	    when = fake.date_between(start_date="-3y", end_date="+1y")
	    layout = random.choice(layouts)
	    return when.strftime(layout)


	def _vehicle() -> str:
	    """Return a random vehicle make/model name from a fixed list of ten."""
	    catalogue = [
	        "Ford Fiesta",
	        "VW Golf",
	        "BMW 3 Series",
	        "Toyota Yaris",
	        "Kia Sportage",
	        "Vauxhall Corsa",
	        "Mercedes A-Class",
	        "Tesla Model 3",
	        "Nissan Qashqai",
	        "Audi A3",
	    ]
	    picked = random.choice(catalogue)
	    return picked


	def _peril() -> str:
	    """Return a random peril / cause-of-loss phrase from a fixed list."""
	    perils = [
	        "escape of water",
	        "storm damage",
	        "theft",
	        "fire",
	        "flood",
	        "accidental damage",
	        "subsidence",
	        "malicious damage",
	        "collision",
	        "burst pipe",
	        "lightning strike",
	        "impact damage",
	        "vandalism",
	    ]
	    picked = random.choice(perils)
	    return picked


	def _regulation() -> str:
	    """Return a random regulation reference.

	    The pool is every value of ``FCA_REFERENCES`` followed by a fixed list of
	    additional references; one element is chosen uniformly.
	    """
	    extra_references = [
	        "ICOBS 8.1.1R",
	        "DISP 1.3",
	        "PRIN 2A",
	        "Consumer Duty",
	        "FCA PS21/5",
	        "Equality Act 2010",
	        "GDPR Article 6",
	    ]
	    pool = list(FCA_REFERENCES.values())
	    pool.extend(extra_references)
	    return random.choice(pool)


	def _lob() -> str:
	    """Return a random line-of-business name from a fixed list."""
	    lines_of_business = [
	        "motor insurance",
	        "home insurance",
	        "commercial combined",
	        "employers' liability",
	        "public liability",
	        "professional indemnity",
	        "property insurance",
	        "cyber insurance",
	        "D&O insurance",
	    ]
	    picked = random.choice(lines_of_business)
	    return picked


	# ── Sentence templates with entity slots ──

	# Each entry is a zero-argument callable that passes one template string to
	# ``_build`` and returns its (tokens, iob_tags) result. Wrapping the call in a
	# lambda defers it until the entry is invoked, so ``_build`` (defined further
	# down the file) only has to exist at call time, and every invocation draws
	# fresh entity values. ``{NAME}`` markers are the entity slots ``_build`` fills.
	TEMPLATES = [
	    # 0 — claim notification
	    lambda: _build(
	        "{PERSON} reported a {PERIL} claim ({CLAIM_NUMBER}) on {DATE}. "
	        "The loss occurred at {POSTCODE} and is covered under {LOB} policy {POLICY_NUMBER} "
	        "with {INSURER}. Estimated value: {MONEY}."
	    ),
	    # 1 — subrogation recovery against a third-party organisation
	    lambda: _build(
	        "{INSURER} is pursuing subrogation recovery of {MONEY} against {ORG} "
	        "in respect of claim {CLAIM_NUMBER} dated {DATE}. "
	        "The policyholder {PERSON} resides at {POSTCODE}."
	    ),
	    # 2 — Lloyd's placement (syndicate line on a facility placed via an MGA)
	    lambda: _build(
	        "{SYNDICATE} has written a {MONEY} line on the {LOB} facility "
	        "brokered for {ORG} by {MGA}. Inception date {DATE}."
	    ),
	    # 3 — regulatory deadline for a final response
	    lambda: _build(
	        "Under {REGULATION}, {INSURER} must provide {PERSON} with a final response "
	        "to their {PERIL} claim ({CLAIM_NUMBER}) by {DATE}. "
	        "The claim value is {MONEY}."
	    ),
	    # 4 — policy renewal notice
	    lambda: _build(
	        "{PERSON}'s {LOB} policy {POLICY_NUMBER} with {INSURER} is due for renewal on {DATE}. "
	        "Current premium: {MONEY}. Property at {POSTCODE}."
	    ),
	    # 5 — vehicle claim
	    lambda: _build(
	        "{PERSON} was driving a {VEHICLE} when the {PERIL} incident occurred on {DATE} "
	        "near {POSTCODE}. Claim {CLAIM_NUMBER} has been opened with {INSURER} for {MONEY}."
	    ),
	    # 6 — MGA bordereaux submission to a syndicate
	    lambda: _build(
	        "{MGA} submitted the {DATE} bordereaux to {SYNDICATE} showing {MONEY} GWP "
	        "across {LOB} business. Contact: {PERSON}."
	    ),
	    # 7 — complaint with a regulatory response deadline
	    lambda: _build(
	        "{PERSON} has filed a complaint against {INSURER} regarding claim {CLAIM_NUMBER}. "
	        "Per {REGULATION}, we must respond by {DATE}. Claim relates to {PERIL} at {POSTCODE}. "
	        "Amount disputed: {MONEY}."
	    ),
	    # 8 — loss adjuster inspection and settlement recommendation
	    lambda: _build(
	        "Loss adjuster {PERSON} from {ORG} inspected the {PERIL} damage at {POSTCODE} on {DATE}. "
	        "They recommend a settlement of {MONEY} on claim {CLAIM_NUMBER} under {LOB} cover."
	    ),
	    # 9 — medical examination; the literal "Dr" precedes the PERSON slot and is tagged "O"
	    lambda: _build(
	        "Dr {PERSON} examined the claimant in connection with claim {CLAIM_NUMBER} "
	        "dated {DATE}. The {PERIL} incident at {POSTCODE} resulted in injuries. "
	        "{INSURER} has reserved {MONEY} under the {LOB} policy."
	    ),
	    # 10 — endorsement (change of vehicle)
	    lambda: _build(
	        "Endorsement applied to {POLICY_NUMBER}: {PERSON} has changed vehicle to {VEHICLE}. "
	        "Effective {DATE}. Additional premium: {MONEY}. Insurer: {INSURER}."
	    ),
	    # 11 — fraud referral
	    lambda: _build(
	        "Claim {CLAIM_NUMBER} by {PERSON} for {PERIL} ({MONEY}) has been referred to the fraud team. "
	        "Policy {POLICY_NUMBER} with {INSURER} started on {DATE}. "
	        "Property postcode: {POSTCODE}. Cf. {REGULATION}."
	    ),
	]


	def _build(template: str) -> tuple[list[str], list[str]]:
	    """Fill the ``{SLOT}`` markers of *template* and return ``(tokens, iob_tags)``.

	    A fresh value is drawn for every known entity type on each call, whether
	    or not the template uses it. The template is then consumed left to right:
	    literal text is split on whitespace and tagged ``"O"``; each slot's value
	    is split on whitespace and tagged ``B-<SLOT>`` for its first token and
	    ``I-<SLOT>`` for the rest. Text touching a slot (brackets, commas, ``'s``)
	    therefore ends up as separate ``"O"`` tokens.

	    Both returned lists have the same length.
	    """
	    # Draw order is kept fixed because every draw advances the seeded generators.
	    slot_values = {
	        "PERSON": fake.name(),
	        "ORG": fake.company(),
	        "INSURER": random.choice(UK_INSURERS),
	        "MGA": random.choice(MGAS),
	        "SYNDICATE": random.choice(LLOYDS_SYNDICATES),
	        "POLICY_NUMBER": _policy_ref(),
	        "CLAIM_NUMBER": _claim_ref(),
	        "MONEY": _amount(),
	        "DATE": _date_str(),
	        "POSTCODE": _postcode(),
	        "LOB": _lob(),
	        "REGULATION": _regulation(),
	        "PERIL": _peril(),
	        "VEHICLE": _vehicle(),
	    }

	    tokens = []
	    tags = []

	    def emit_plain(text):
	        # Literal template text: whitespace-separated words, all outside any entity.
	        for word in text.split():
	            tokens.append(word)
	            tags.append("O")

	    rest = template
	    while rest:
	        # Locate every slot marker still present in the unconsumed text.
	        hits = []
	        for slot in slot_values:
	            at = rest.find("{" + slot + "}")
	            if at != -1:
	                hits.append((at, slot))

	        if not hits:
	            # No slots left: the remainder is plain text.
	            emit_plain(rest)
	            break

	        # Leftmost marker wins; min() keeps the first entry on equal positions.
	        start, slot = min(hits, key=lambda hit: hit[0])
	        emit_plain(rest[:start])

	        for index, piece in enumerate(slot_values[slot].split()):
	            tokens.append(piece)
	            tags.append(("B-" if index == 0 else "I-") + slot)

	        # Skip past the marker: the slot name plus its two braces.
	        rest = rest[start + len(slot) + 2:]

	    return tokens, tags


	def generate_ner_dataset(n: int = 8000, output_path: str = "data/output/insurance_ner_8k.jsonl"):
	    """Generate ``n`` NER examples in token-level IOB2 format and write them as JSONL.

	    Each example comes from a randomly chosen entry of ``TEMPLATES``. It is
	    stored as one JSON object per line with the keys ``tokens``, ``ner_tags``
	    and ``text`` (the tokens joined by single spaces). The records are
	    shuffled before writing, and a per-tag count summary is printed afterwards.

	    Args:
	        n: Number of examples to generate.
	        output_path: Destination file. Its parent directory is created when
	            the path has one; a bare filename is written to the current
	            working directory.

	    Returns:
	        ``output_path``, unchanged.
	    """
	    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
	    # raises, so only create a directory when the path actually names one.
	    out_dir = os.path.dirname(output_path)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)

	    records = []
	    for _ in tqdm(range(n), desc="NER examples"):
	        gen_fn = random.choice(TEMPLATES)
	        tokens, tags = gen_fn()
	        records.append({
	            "tokens": tokens,
	            "ner_tags": tags,
	            "text": " ".join(tokens),
	        })

	    random.shuffle(records)

	    # ensure_ascii=False writes raw non-ASCII characters (e.g. "£"), so pin the
	    # file encoding instead of relying on the platform default.
	    with open(output_path, "w", encoding="utf-8") as f:
	        for rec in records:
	            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

	    # Stats: count every entity tag in a single pass over the records.
	    tag_counts = {}
	    for rec in records:
	        for tag in rec["ner_tags"]:
	            if tag != "O":
	                tag_counts[tag] = tag_counts.get(tag, 0) + 1

	    print(f"\n✓ Generated {len(records)} NER examples → {output_path}")
	    print(f" Entity types found: {len(tag_counts)}")
	    for t in sorted(tag_counts):
	        print(f" {t}: {tag_counts[t]}")

	    return output_path


	# Script entry point: running the file directly generates the dataset with
	# the default size and output path.
	if __name__ == "__main__":
	    generate_ner_dataset()