piyushptiwari committed on
Commit 2cc32a5 · verified · 1 Parent(s): 439311b

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
README.md ADDED
@@ -0,0 +1,224 @@
+ ---
+ language:
+ - en
+ license: apache-2.0
+ tags:
+ - insurance
+ - uk-insurance
+ - training-pipeline
+ - search-engine
+ - bytical
+ ---
+
+ # INSUREOS Models — Complete Insurance AI Training Pipeline
+
+ **Created by [Bytical AI](https://bytical.ai)** — AI agents that run insurance operations.
+
+ ## Overview
+
+ INSUREOS is a complete AI/ML training and inference pipeline for UK insurance operations. This repository contains all source code for data generation, model training, evaluation, data collection, and a hybrid search engine.
+
+ ### Model Suite
+
+ | Model | HuggingFace | Task | Key Metric |
+ |-------|-------------|------|------------|
+ | InsureLLM-4B | [piyushptiwari/InsureLLM-4B](https://huggingface.co/piyushptiwari/InsureLLM-4B) | Insurance domain LLM | ROUGE-1: 0.384 |
+ | InsureDocClassifier | [piyushptiwari/InsureDocClassifier](https://huggingface.co/piyushptiwari/InsureDocClassifier) | 12-class document classification | F1: 1.0 |
+ | InsureNER | [piyushptiwari/InsureNER](https://huggingface.co/piyushptiwari/InsureNER) | 13-entity NER | F1: 1.0 |
+ | InsureFraudNet | [piyushptiwari/InsureFraudNet](https://huggingface.co/piyushptiwari/InsureFraudNet) | Fraud detection (3 LoB) | AUC-ROC: 1.0 |
+ | InsurePricing | [piyushptiwari/InsurePricing](https://huggingface.co/piyushptiwari/InsurePricing) | Premium pricing (GLM + EBM) | MAE: £11,132 |
+ | InsureSearch | (included in this repo) | Hybrid search engine | 33K docs indexed |
+
+ ### Training Dataset
+
+ [piyushptiwari/insureos-training-data](https://huggingface.co/datasets/piyushptiwari/insureos-training-data) — 10K SFT, 5K DPO, 50K tabular, 10K docs, 8K NER
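+
+ A minimal way to pull the dataset locally with the `datasets` library (the split name below is an assumption; check the dataset card for the actual configs/splits):
+
+ ```python
+ from datasets import load_dataset
+
+ # "train" is assumed here — the dataset card lists the real splits.
+ ds = load_dataset("piyushptiwari/insureos-training-data", split="train")
+ print(ds[0])
+ ```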
+
+ ## Repository Structure
+
+ ```
+ insureos-models/
+ ├── data/                    # Synthetic data generation
+ │   ├── constants.py         # UK insurance constants (regions, perils, regulators)
+ │   ├── gen_sft.py           # Generate SFT instruction-response pairs
+ │   ├── gen_dpo.py           # Generate DPO preference pairs
+ │   ├── gen_documents.py     # Generate insurance documents (12 classes)
+ │   ├── gen_ner.py           # Generate NER-annotated text
+ │   ├── gen_tabular.py       # Generate claims tabular data
+ │   └── generate_all.py      # Run all generators
+ │
+ ├── collect/                 # Real-world data collection
+ │   ├── config.py            # Scraping targets and configuration
+ │   ├── scraper_base.py      # Base HTTP scraper with caching
+ │   ├── convert_sft.py       # Convert raw docs → SFT/DPO format
+ │   ├── run_fast.py          # Fast collection orchestrator
+ │   └── sources/             # Per-source scrapers
+ │       ├── wikipedia.py     # Wikipedia insurance articles
+ │       ├── legislation.py   # UK legislation (legislation.gov.uk)
+ │       ├── fca.py           # FCA Handbook
+ │       ├── hf_datasets.py   # HuggingFace insurance datasets
+ │       ├── rss_news.py      # Insurance news RSS feeds
+ │       └── education.py     # Insurance education resources
+ │
+ ├── training/                # Model training scripts
+ │   ├── qlora_finetune.py    # QLoRA fine-tuning (Qwen3-4B)
+ │   ├── dpo_train.py         # DPO alignment training
+ │   ├── retrain_realworld.py # Real-world data retraining
+ │   ├── doc_classifier.py    # ModernBERT document classifier
+ │   ├── ner_model.py         # ModernBERT NER model
+ │   ├── fraud_model.py       # XGBoost + Isolation Forest fraud
+ │   ├── pricing_glm.py       # Tweedie GLM + EBM pricing
+ │   └── distill.py           # Model distillation (experimental)
+ │
+ ├── evaluation/              # Evaluation suite
+ │   ├── run_eval.py          # Full multi-model evaluation
+ │   └── results/             # Evaluation results (JSON)
+ │
+ ├── search/                  # Hybrid search engine
+ │   ├── config.py            # Search configuration
+ │   ├── embedder.py          # BGE-small-en-v1.5 embedding service
+ │   ├── bm25.py              # Custom Okapi BM25 implementation
+ │   ├── vector_store.py      # Qdrant vector store
+ │   ├── reranker.py          # Cross-encoder reranker
+ │   ├── hybrid_engine.py     # RRF fusion (vector + BM25 + reranker)
+ │   ├── indexer.py           # Document ingestion pipeline
+ │   ├── models.py            # Pydantic data models
+ │   └── api.py               # FastAPI REST API
+ │
+ ├── serve/                   # Model serving
+ │   └── api.py               # FastAPI inference endpoints
+ │
+ └── scripts/                 # Automation
+     ├── setup.sh             # Environment setup (NVIDIA, Python, deps)
+     └── train_all.sh         # Full training pipeline script
+ ```
+
+ ## Quick Start
+
+ ### 1. Environment Setup
+
+ ```bash
+ # Create virtual environment
+ python3 -m venv .venv && source .venv/bin/activate
+
+ # Install dependencies
+ pip install torch transformers trl peft bitsandbytes
+ pip install xgboost scikit-learn interpret
+ pip install sentence-transformers qdrant-client fastapi uvicorn
+ ```
+
+ ### 2. Generate Training Data
+
+ ```bash
+ python -m data.generate_all
+ # Outputs: data/output/ (SFT, DPO, docs, NER, tabular)
+ ```
+
+ ### 3. Train Models
+
+ ```bash
+ # Train all models sequentially
+ bash scripts/train_all.sh
+
+ # Or individually:
+ python training/qlora_finetune.py   # InsureLLM QLoRA
+ python training/dpo_train.py        # InsureLLM DPO
+ python training/doc_classifier.py   # Document classifier
+ python training/ner_model.py        # NER model
+ python training/fraud_model.py      # Fraud detection
+ python training/pricing_glm.py      # Pricing models
+ ```
+
+ ### 4. Evaluate
+
+ ```bash
+ python evaluation/run_eval.py
+ # Results saved to evaluation/results/
+ ```
+
+ ### 5. Run Search Engine
+
+ ```bash
+ # Index documents
+ python search/indexer.py
+
+ # Start API
+ python search/api.py
+ # API at http://localhost:8900
+ # Endpoints: /search, /search/vector, /search/keyword, /suggest, /facets, /stats
+ ```
150
+ ## Search Engine β€” InsureSearch
151
+
152
+ A hybrid search engine rivaling Azure AI Search, built entirely on open-source components:
153
+
154
+ | Component | Technology | Details |
155
+ |-----------|-----------|---------|
156
+ | **Vector Search** | BGE-small-en-v1.5 (384-dim) + Qdrant | Semantic similarity |
157
+ | **Keyword Search** | Custom Okapi BM25 | Insurance-aware tokenization |
158
+ | **Reranking** | cross-encoder/ms-marco-MiniLM-L-6-v2 | Cross-encoder reranking |
159
+ | **Fusion** | Reciprocal Rank Fusion (RRF) | Vector 60% + BM25 40% |
160
+ | **API** | FastAPI | REST API with facets, suggestions |
161
+
162
+ **Index stats:** 33,034 chunks from 31,679 documents, 51,640 BM25 terms.
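+
+ For reference, the weighted RRF step reduces to a few lines. This is a minimal sketch rather than the code in `search/hybrid_engine.py`; the 0.6/0.4 weights come from the table above, while `k=60` is the conventional RRF smoothing constant and an assumption here:
+
+ ```python
+ def weighted_rrf(vector_ranking: list[str], bm25_ranking: list[str],
+                  w_vec: float = 0.6, w_bm25: float = 0.4, k: int = 60) -> list[str]:
+     """Fuse two ranked lists of doc IDs with weighted Reciprocal Rank Fusion."""
+     scores: dict[str, float] = {}
+     for weight, ranking in ((w_vec, vector_ranking), (w_bm25, bm25_ranking)):
+         for rank, doc_id in enumerate(ranking, start=1):
+             # Each list contributes weight / (k + rank); k damps the very top ranks.
+             scores[doc_id] = scores.get(doc_id, 0.0) + weight / (k + rank)
+     return sorted(scores, key=scores.get, reverse=True)
+
+ # "b" ranks well in both lists, so it tops the fused ranking.
+ print(weighted_rrf(["a", "b", "c"], ["b", "d", "a"]))  # ['b', 'a', 'c', 'd']
+ ```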
+
+ ## Training Pipeline
+
+ ```
+ Stage 1: Synthetic Data Generation
+ ├── 10K SFT instruction-response pairs
+ ├── 5K DPO preference pairs
+ ├── 50K tabular claims (Motor/Property/Liability)
+ ├── 10K insurance documents (12 classes)
+ └── 8K NER-annotated texts (13 entity types)
+
+ Stage 2: QLoRA Fine-Tuning → Qwen3-4B
+ ├── rank=64, alpha=128, all-linear targets
+ ├── 2 epochs, batch=2, grad_accum=4
+ ├── Final: train_loss=0.012, eval_loss=0.118
+ └── Token accuracy: 95.88%
+
+ Stage 3: DPO Alignment
+ ├── 5K preference pairs
+ ├── 149 steps, reward_accuracy=1.0
+ └── Reward margin: 26.76
+
+ Stage 4: Real-World Data Collection
+ ├── Wikipedia (150 docs), UK Legislation (692)
+ ├── HuggingFace datasets (31,060), RSS (50), Education (88)
+ ├── Converted to 3,685 SFT + 776 DPO pairs
+ └── Quality filtered (English-only, no echo responses)
+
+ Stage 5: Real-World Retraining
+ ├── 876 steps on real-world SFT data
+ └── Claims process score improved 0.40 → 0.60
+
+ Stage 6: Specialized Models (parallel)
+ ├── FraudNet: XGBoost + Isolation Forest → AUC-ROC 1.0
+ ├── PricingGLM: Tweedie GLM + EBM → MAE £11,132
+ ├── DocClassifier: ModernBERT → F1 1.0
+ └── InsureNER: ModernBERT → F1 1.0
+ ```
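+
+ The Stage 2 adapter settings map directly onto a PEFT config. A minimal sketch, not the exact contents of `training/qlora_finetune.py` (rank, alpha, and targets come from the summary above; the dropout value is an assumption):
+
+ ```python
+ from peft import LoraConfig
+
+ lora_config = LoraConfig(
+     r=64,                         # rank, as listed in Stage 2
+     lora_alpha=128,               # alpha, as listed in Stage 2
+     target_modules="all-linear",  # attach adapters to every linear layer
+     lora_dropout=0.05,            # assumption; not stated in the summary
+     task_type="CAUSAL_LM",
+ )
+ ```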
+
+ ## Tech Stack
+
+ - **LLM:** Qwen3-4B + QLoRA + DPO (PyTorch, Transformers, TRL, PEFT, bitsandbytes)
+ - **Classification & NER:** ModernBERT-base (Transformers)
+ - **Fraud Detection:** XGBoost + Isolation Forest (scikit-learn)
+ - **Pricing:** Tweedie GLM (scikit-learn) + EBM (InterpretML), sketched after this list
+ - **Search:** BGE-small-en-v1.5 + Qdrant + BM25 + cross-encoder
+ - **Training GPU:** NVIDIA Tesla T4 16GB
210
+
211
+ ## Citation
212
+
213
+ ```bibtex
214
+ @misc{bytical2026insureos,
215
+ title={INSUREOS: A Complete AI/ML Suite for UK Insurance Operations},
216
+ author={Bytical AI},
217
+ year={2026},
218
+ url={https://huggingface.co/piyushptiwari/insureos-models}
219
+ }
220
+ ```
221
+
222
+ ## About Bytical AI
223
+
224
+ [Bytical](https://bytical.ai) builds AI agents that run insurance operations β€” claims automation, underwriting intelligence, digital sales, and core system modernization for insurers across the UK and Europe. Microsoft AI Partner | NVIDIA | Salesforce.
__init__.py ADDED
@@ -0,0 +1 @@
+ # Training package
api.py ADDED
@@ -0,0 +1,367 @@
+ """
+ InsureOS — FastAPI Model Serving Endpoint
+ Serves all InsureOS models via a unified REST API.
+ """
+
+ import os
+ import json
+ import time
+ import pickle
+ import logging
+ from pathlib import Path
+ from contextlib import asynccontextmanager
+
+ import torch
+ import numpy as np
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel, Field
+
+ # ── Logging ──
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger("insureos-api")
+
+
+ # ── Request/Response models ──
+
+ class ChatRequest(BaseModel):
+     messages: list[dict] = Field(..., description="Chat messages in OpenAI format")
+     max_tokens: int = Field(512, ge=1, le=2048)
+     temperature: float = Field(0.7, ge=0.0, le=2.0)
+
+ class ChatResponse(BaseModel):
+     response: str
+     model: str
+     latency_ms: float
+
+ class FraudRequest(BaseModel):
+     lob: str = Field(..., description="Line of business: motor, property, or liability")
+     features: dict = Field(..., description="Claim features as key-value pairs")
+
+ class FraudResponse(BaseModel):
+     fraud_probability: float
+     fraud_label: bool
+     anomaly_score: float | None = None
+     model: str
+     latency_ms: float
+
+ class PricingRequest(BaseModel):
+     features: dict = Field(..., description="Rating factor values")
+     model_type: str = Field("ebm", description="Model type: glm or ebm")
+
+ class PricingResponse(BaseModel):
+     predicted_premium: float
+     model: str
+     latency_ms: float
+
+ class DocClassifyRequest(BaseModel):
+     text: str = Field(..., description="Document text to classify")
+
+ class DocClassifyResponse(BaseModel):
+     label: str
+     confidence: float
+     all_scores: dict[str, float]
+     model: str
+     latency_ms: float
+
+ class NERRequest(BaseModel):
+     text: str = Field(..., description="Text for entity extraction")
+
+ class NERResponse(BaseModel):
+     entities: list[dict]
+     model: str
+     latency_ms: float
+
+ class HealthResponse(BaseModel):
+     status: str
+     models_loaded: dict[str, bool]
+
+
+ # ── Model store ──
+ models = {}
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     """Load models on startup."""
+     logger.info("Loading models...")
+
+     # InsureLLM
+     insurellm_path = os.getenv("INSURELLM_MODEL", "models/insurellm-8b-dpo-merged")
+     if Path(insurellm_path).exists():
+         from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+         logger.info(f"Loading InsureLLM from {insurellm_path}...")
+         tokenizer = AutoTokenizer.from_pretrained(insurellm_path, trust_remote_code=True)
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+
+         bnb_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_compute_dtype=torch.bfloat16,
+             bnb_4bit_use_double_quant=True,
+         )
+         model = AutoModelForCausalLM.from_pretrained(
+             insurellm_path,
+             quantization_config=bnb_config,
+             device_map="auto",
+             trust_remote_code=True,
+             attn_implementation="sdpa",
+             torch_dtype=torch.bfloat16,
+         )
+         model.eval()
+         models["insurellm"] = {"model": model, "tokenizer": tokenizer}
+         logger.info("InsureLLM loaded ✓")
+
+     # FraudNet
+     fraud_dir = Path(os.getenv("FRAUD_MODEL", "models/fraudnet"))
+     for lob in ["motor", "property", "liability"]:
+         xgb_path = fraud_dir / f"xgb_{lob}.json"
+         if xgb_path.exists():
+             import xgboost as xgb_lib
+             xgb_model = xgb_lib.XGBClassifier()
+             xgb_model.load_model(str(xgb_path))
+             models[f"fraud_{lob}"] = xgb_model
+             logger.info(f"FraudNet {lob} loaded ✓")
+
+         iforest_path = fraud_dir / f"iforest_{lob}.pkl"
+         if iforest_path.exists():
+             with open(iforest_path, "rb") as f:
+                 models[f"iforest_{lob}"] = pickle.load(f)
+             logger.info(f"IsolationForest {lob} loaded ✓")
+
+     # Pricing
+     pricing_dir = Path(os.getenv("PRICING_MODEL", "models/pricing-glm"))
+     for name in ["tweedie_glm", "pricing_ebm"]:
+         pkl_path = pricing_dir / f"{name}.pkl"
+         if pkl_path.exists():
+             with open(pkl_path, "rb") as f:
+                 models[name] = pickle.load(f)
+             logger.info(f"Pricing {name} loaded ✓")
+
+     # Doc Classifier
+     doc_path = os.getenv("DOC_MODEL", "models/doc-classifier")
+     if Path(doc_path).exists():
+         from transformers import AutoModelForSequenceClassification, AutoTokenizer as ATok
+         models["doc_classifier"] = {
+             "model": AutoModelForSequenceClassification.from_pretrained(doc_path),
+             "tokenizer": ATok.from_pretrained(doc_path),
+         }
+         models["doc_classifier"]["model"].eval()
+         meta_path = Path(doc_path) / "training_meta.json"
+         if meta_path.exists():
+             with open(meta_path) as f:
+                 models["doc_classifier"]["meta"] = json.load(f)
+         logger.info("DocClassifier loaded ✓")
+
+     # NER
+     ner_path = os.getenv("NER_MODEL", "models/ner-model")
+     if Path(ner_path).exists():
+         from transformers import AutoModelForTokenClassification, AutoTokenizer as ATok2
+         models["ner"] = {
+             "model": AutoModelForTokenClassification.from_pretrained(ner_path),
+             "tokenizer": ATok2.from_pretrained(ner_path),
+         }
+         models["ner"]["model"].eval()
+         meta_path = Path(ner_path) / "training_meta.json"
+         if meta_path.exists():
+             with open(meta_path) as f:
+                 models["ner"]["meta"] = json.load(f)
+         logger.info("NER model loaded ✓")
+
+     logger.info(f"Models loaded: {list(models.keys())}")
+     yield
+     models.clear()
+
+
+ app = FastAPI(
+     title="InsureOS Model API",
+     description="UK Insurance AI Model Serving — InsureLLM, FraudNet, PricingGLM, DocClassifier, NER",
+     version="0.1.0",
+     lifespan=lifespan,
+ )
+
+
+ # ── Endpoints ──
+
+ @app.get("/health", response_model=HealthResponse)
+ async def health():
+     return HealthResponse(
+         status="healthy",
+         models_loaded={
+             "insurellm": "insurellm" in models,
+             "fraud_motor": "fraud_motor" in models,
+             "fraud_property": "fraud_property" in models,
+             "fraud_liability": "fraud_liability" in models,
+             "pricing_glm": "tweedie_glm" in models,
+             "pricing_ebm": "pricing_ebm" in models,
+             "doc_classifier": "doc_classifier" in models,
+             "ner": "ner" in models,
+         },
+     )
+
+
+ @app.post("/v1/chat", response_model=ChatResponse)
+ async def chat(request: ChatRequest):
+     if "insurellm" not in models:
+         raise HTTPException(status_code=503, detail="InsureLLM not loaded")
+
+     tokenizer = models["insurellm"]["tokenizer"]
+     model = models["insurellm"]["model"]
+
+     text = tokenizer.apply_chat_template(request.messages, tokenize=False, add_generation_prompt=True)
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
+
+     start = time.time()
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=request.max_tokens,
+             temperature=max(request.temperature, 0.01),
+             top_p=0.9,
+             do_sample=request.temperature > 0,
+         )
+     latency = (time.time() - start) * 1000
+
+     response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+     return ChatResponse(response=response, model="insurellm-8b-dpo", latency_ms=latency)
+
+
+ @app.post("/v1/fraud", response_model=FraudResponse)
+ async def fraud_detect(request: FraudRequest):
+     lob = request.lob.lower()
+     model_key = f"fraud_{lob}"
+     if model_key not in models:
+         raise HTTPException(status_code=503, detail=f"FraudNet {lob} not loaded")
+
+     xgb_model = models[model_key]
+     import pandas as pd
+     features_df = pd.DataFrame([request.features]).fillna(0)
+
+     start = time.time()
+     prob = float(xgb_model.predict_proba(features_df)[:, 1][0])
+     label = prob >= 0.5
+     latency = (time.time() - start) * 1000
+
+     # Anomaly score from isolation forest
+     anomaly = None
+     iforest_key = f"iforest_{lob}"
+     if iforest_key in models:
+         anomaly = float(models[iforest_key].score_samples(features_df)[0])
+
+     return FraudResponse(
+         fraud_probability=prob,
+         fraud_label=label,
+         anomaly_score=anomaly,
+         model=f"fraudnet-{lob}",
+         latency_ms=latency,
+     )
+
+
+ @app.post("/v1/pricing", response_model=PricingResponse)
+ async def predict_price(request: PricingRequest):
+     model_key = "tweedie_glm" if request.model_type == "glm" else "pricing_ebm"
+     if model_key not in models:
+         raise HTTPException(status_code=503, detail=f"Pricing model {request.model_type} not loaded")
+
+     pricing_model = models[model_key]
+     import pandas as pd
+     features_df = pd.DataFrame([request.features]).fillna(0)
+
+     start = time.time()
+     prediction = float(max(0, pricing_model.predict(features_df)[0]))
+     latency = (time.time() - start) * 1000
+
+     return PricingResponse(
+         predicted_premium=prediction,
+         model=request.model_type,
+         latency_ms=latency,
+     )
+
+
+ @app.post("/v1/classify", response_model=DocClassifyResponse)
+ async def classify_document(request: DocClassifyRequest):
+     if "doc_classifier" not in models:
+         raise HTTPException(status_code=503, detail="DocClassifier not loaded")
+
+     tokenizer = models["doc_classifier"]["tokenizer"]
+     model = models["doc_classifier"]["model"]
+     meta = models["doc_classifier"].get("meta", {})
+     id2label = meta.get("id2label", {})
+
+     inputs = tokenizer(request.text, return_tensors="pt", truncation=True, max_length=512)
+
+     start = time.time()
+     with torch.no_grad():
+         outputs = model(**inputs)
+     latency = (time.time() - start) * 1000
+
+     probs = torch.softmax(outputs.logits, dim=-1)[0]
+     pred_id = probs.argmax().item()
+     confidence = probs[pred_id].item()
+
+     scores = {id2label.get(str(i), f"class_{i}"): float(p) for i, p in enumerate(probs)}
+
+     return DocClassifyResponse(
+         label=id2label.get(str(pred_id), f"class_{pred_id}"),
+         confidence=confidence,
+         all_scores=scores,
+         model="doc-classifier",
+         latency_ms=latency,
+     )
+
+
+ @app.post("/v1/ner", response_model=NERResponse)
+ async def extract_entities(request: NERRequest):
+     if "ner" not in models:
+         raise HTTPException(status_code=503, detail="NER model not loaded")
+
+     tokenizer = models["ner"]["tokenizer"]
+     model = models["ner"]["model"]
+     meta = models["ner"].get("meta", {})
+     id2label = meta.get("id2label", {})
+
+     tokens = request.text.split()
+     inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=256)
+
+     start = time.time()
+     with torch.no_grad():
+         outputs = model(**inputs)
+     latency = (time.time() - start) * 1000
+
+     preds = outputs.logits.argmax(dim=-1)[0].tolist()
+     word_ids = inputs.word_ids(batch_index=0)
+
+     entities = []
+     current_entity = None
+     for word_id, pred_id in zip(word_ids, preds):
+         if word_id is None:
+             continue
+         label = id2label.get(str(pred_id), "O")
+
+         if label.startswith("B-"):
+             if current_entity:
+                 entities.append(current_entity)
+             current_entity = {
+                 "entity_type": label[2:],
+                 "text": tokens[word_id],
+                 "start_token": word_id,
+                 "end_token": word_id,
+             }
+         elif label.startswith("I-") and current_entity and label[2:] == current_entity["entity_type"]:
+             if word_id != current_entity["end_token"]:  # skip repeated sub-word pieces of the same word
+                 current_entity["text"] += " " + tokens[word_id]
+                 current_entity["end_token"] = word_id
+         else:
+             if current_entity:
+                 entities.append(current_entity)
+             current_entity = None
+
+     if current_entity:
+         entities.append(current_entity)
+
+     return NERResponse(entities=entities, model="ner-model", latency_ms=latency)
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     port = int(os.getenv("PORT", "8000"))
+     uvicorn.run("serve.api:app", host="0.0.0.0", port=port, reload=False)
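
The request models above make the serving API straightforward to exercise from Python. A minimal client sketch (assumes the default `PORT=8000`; the feature keys in the fraud payload are illustrative, not the model's actual schema):

```python
import requests

BASE = "http://localhost:8000"

# Chat with InsureLLM (ChatRequest: messages, max_tokens, temperature).
r = requests.post(f"{BASE}/v1/chat", json={
    "messages": [{"role": "user", "content": "Explain subrogation in UK motor claims."}],
    "max_tokens": 256,
    "temperature": 0.7,
})
print(r.json()["response"])

# Score a motor claim for fraud (FraudRequest: lob, features).
r = requests.post(f"{BASE}/v1/fraud", json={
    "lob": "motor",
    "features": {"claim_amount": 12500, "days_to_report": 45},  # illustrative keys
})
print(r.json()["fraud_probability"])
```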
collect/__init__.py ADDED
@@ -0,0 +1 @@
+ # Insurance data collection pipeline
collect/config.py ADDED
@@ -0,0 +1,202 @@
+ """Configuration for data collection sources."""
+
+ import os
+ from pathlib import Path
+
+ # ── Paths ──────────────────────────────────────────────────────────
+ BASE_DIR = Path(__file__).resolve().parent.parent
+ RAW_DIR = BASE_DIR / "collect" / "raw"
+ PROCESSED_DIR = BASE_DIR / "collect" / "processed"
+ SFT_OUTPUT = BASE_DIR / "collect" / "sft_real_world.jsonl"
+ DPO_OUTPUT = BASE_DIR / "collect" / "dpo_real_world.jsonl"
+
+ RAW_DIR.mkdir(parents=True, exist_ok=True)
+ PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
+
+ # ── Rate limiting ──────────────────────────────────────────────────
+ REQUEST_DELAY = 1.5  # seconds between requests (be polite)
+ MAX_RETRIES = 3
+ TIMEOUT = 30
+
+ # ── User agent ─────────────────────────────────────────────────────
+ USER_AGENT = (
+     "InsureOS-DataCollector/1.0 "
+     "(Research; insurance-domain-model-training; "
+     "contact: piyush@bytical.com)"
+ )
+
+ HEADERS = {
+     "User-Agent": USER_AGENT,
+     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+     "Accept-Language": "en-GB,en;q=0.9",
+ }
+
+ # ── Wikipedia insurance articles ───────────────────────────────────
+ WIKIPEDIA_SEED_ARTICLES = [
+     "Insurance", "Reinsurance", "Underwriting", "Actuarial_science",
+     "Insurance_policy", "Lloyd%27s_of_London", "Property_insurance",
+     "Casualty_insurance", "Life_insurance", "Health_insurance",
+     "Motor_insurance", "Marine_insurance", "Liability_insurance",
+     "Professional_indemnity_insurance", "Directors_and_officers_liability_insurance",
+     "Cyber_insurance", "Product_liability", "Public_liability",
+     "Employers%27_liability_insurance", "Business_interruption_insurance",
+     "Catastrophe_bond", "Insurance-linked_securities",
+     "Solvency_II", "IFRS_17", "Risk_management",
+     "Claims_adjusting", "Loss_adjustment", "Salvage_(insurance)",
+     "Subrogation", "Indemnity", "Utmost_good_faith",
+     "Proximate_cause_(insurance)", "Insurance_fraud",
+     "Parametric_insurance", "Microinsurance", "Takaful",
+     "Financial_Conduct_Authority", "Prudential_Regulation_Authority_(United_Kingdom)",
+     "General_insurance", "Insurance_broker", "Managing_general_agent",
+     "Coverholder", "Bordereaux", "Treaty_reinsurance",
+     "Facultative_reinsurance", "Excess_of_loss", "Quota_share",
+     "Stop-loss_insurance", "Aggregate_stop-loss_insurance",
+     "Deductible", "Co-insurance", "Self-insurance",
+     "Captive_insurance", "Risk_retention_group",
+     "Insurance_in_the_United_Kingdom", "Association_of_British_Insurers",
+     "Chartered_Insurance_Institute", "Insurance_premium_tax",
+     "Motor_Insurers%27_Bureau", "Pool_Reinsurance_Company",
+     "Flood_Re", "Terrorism_reinsurance",
+     "Insurance_contract", "Warranty_(insurance)",
+     "Condition_(insurance)", "Exclusion_(insurance)",
+     "Endorsement_(insurance)", "Schedule_(insurance)",
+     "Inception_(insurance)", "Renewal_(insurance)",
+     "Cancellation_(insurance)", "Claims-made_policy",
+     "Occurrence_policy", "Claims_reserve",
+     "Incurred_but_not_reported", "Loss_ratio",
+     "Combined_ratio", "Expense_ratio",
+     "Generalized_linear_model", "Tweedie_distribution",
+     "Poisson_regression", "Gamma_distribution",
+     "Chain_ladder_method", "Bornhuetter–Ferguson_method",
+     "Credibility_theory", "Experience_rating",
+     "Risk_classification", "Adverse_selection",
+     "Moral_hazard", "Insurance_scoring",
+     "Telematics", "Usage-based_insurance",
+     "Insurtech", "Peer-to-peer_insurance",
+     "Embedded_insurance", "Open_insurance",
+     "ACORD", "ISO_ClaimSearch",
+     "National_Flood_Insurance_Program",
+     "Earthquake_insurance", "Windstorm_insurance",
+     "Hail_insurance", "Crop_insurance",
+     "Title_insurance", "Surety_bond",
+     "Fidelity_bond", "Warranty",
+     "Extended_warranty", "Home_warranty",
+     "Pet_insurance", "Travel_insurance",
+     "Wedding_insurance", "Event_insurance",
+     "Key_person_insurance", "Trade_credit_insurance",
+     "Political_risk_insurance", "Environmental_liability",
+     "Pollution_insurance",
+ ]
+
+ # ── FCA Handbook sections ──────────────────────────────────────────
+ FCA_HANDBOOK_SECTIONS = [
+     "ICOBS",  # Insurance: Conduct of Business Sourcebook
+     "SYSC",   # Senior Management Arrangements
+     "PRIN",   # Principles for Businesses
+     "COBS",   # Conduct of Business Sourcebook
+     "DISP",   # Dispute Resolution: Complaints
+     "SUP",    # Supervision
+     "CONC",   # Consumer Credit
+     "MCOB",   # Mortgages and Home Finance
+ ]
+
+ FCA_BASE_URL = "https://www.handbook.fca.org.uk"
+
+ # ── UK Legislation ─────────────────────────────────────────────────
+ UK_LEGISLATION_URLS = [
+     # Insurance Act 2015
+     "https://www.legislation.gov.uk/ukpga/2015/4/contents",
+     # Enterprise Act 2016 (insurance damages for late payment)
+     "https://www.legislation.gov.uk/ukpga/2016/12/contents",
+     # Financial Services and Markets Act 2000
+     "https://www.legislation.gov.uk/ukpga/2000/8/contents",
+     # Third Parties (Rights against Insurers) Act 2010
+     "https://www.legislation.gov.uk/ukpga/2010/10/contents",
+     # Road Traffic Act 1988 (compulsory motor insurance)
+     "https://www.legislation.gov.uk/ukpga/1988/52/contents",
+     # Employers' Liability (Compulsory Insurance) Act 1969
+     "https://www.legislation.gov.uk/ukpga/1969/57/contents",
+     # Marine Insurance Act 1906
+     "https://www.legislation.gov.uk/ukpga/Edw7/6/41/contents",
+     # Consumer Insurance (Disclosure and Representations) Act 2012
+     "https://www.legislation.gov.uk/ukpga/2012/6/contents",
+     # Data Protection Act 2018
+     "https://www.legislation.gov.uk/ukpga/2018/12/contents",
+ ]
+
+ # ── Investopedia insurance glossary terms ──────────────────────────
+ INVESTOPEDIA_TERMS = [
+     "insurance", "reinsurance", "underwriting", "premium",
+     "deductible", "copayment", "coinsurance", "policy-limit",
+     "exclusion", "endorsement", "rider", "binder",
+     "actuary", "actuarial-science", "loss-ratio",
+     "combined-ratio", "expense-ratio", "claims-reserve",
+     "ibnr", "incurred-but-not-reported",
+     "lloyd-s-of-london", "surplus-lines",
+     "managing-general-agent", "captive-insurance-company",
+     "risk-retention-group", "self-insurance",
+     "occurrence-policy", "claims-made-policy",
+     "general-liability-insurance", "professional-liability-insurance",
+     "errors-and-omissions-insurance", "directors-and-officers-liability-insurance",
+     "cyber-insurance", "key-person-insurance",
+     "business-interruption-insurance", "commercial-property-insurance",
+     "workers-compensation", "employers-liability-insurance",
+     "public-liability-insurance", "product-liability-insurance",
+     "environmental-liability-insurance", "marine-insurance",
+     "hull-insurance", "cargo-insurance",
+     "protection-and-indemnity-insurance", "aviation-insurance",
+     "crop-insurance", "title-insurance",
+     "surety-bond", "fidelity-bond",
+     "catastrophe-bond", "insurance-linked-securities",
+     "parametric-insurance", "microinsurance",
+     "property-insurance", "casualty-insurance",
+     "fire-insurance", "flood-insurance",
+     "earthquake-insurance", "windstorm-insurance",
+     "homeowners-insurance", "renters-insurance",
+     "auto-insurance", "uninsured-motorist-coverage",
+     "comprehensive-auto-insurance", "collision-insurance",
+     "gap-insurance", "umbrella-insurance",
+     "life-insurance", "term-life-insurance",
+     "whole-life-insurance", "universal-life-insurance",
+     "variable-life-insurance", "endowment-policy",
+     "annuity", "health-insurance",
+     "disability-insurance", "long-term-care-insurance",
+     "pet-insurance", "travel-insurance",
+     "wedding-insurance", "event-insurance",
+     "trade-credit-insurance", "political-risk-insurance",
+     "warranty", "extended-warranty",
+     "solvency", "moral-hazard",
+     "adverse-selection", "risk-management",
+     "risk-assessment", "risk-transfer",
+     "risk-pooling", "law-of-large-numbers",
+     "subrogation", "indemnity", "utmost-good-faith",
+     "proximate-cause", "insurable-interest",
+     "insurance-fraud", "total-loss",
+     "actual-cash-value", "replacement-cost",
+     "agreed-value", "reinstatement-value",
+ ]
+
+ # ── HuggingFace datasets ──────────────────────────────────────────
+ HF_DATASETS = [
+     ("rvpierre/insurance-qa-en", None),
+     ("ebrigham/NL_insurance_reviews_sentiment", None),
+     ("snorkelai/Multi-Turn-Insurance-Underwriting-Code-Gen", None),
+     ("Ddream-ai/InsuranceCorpus", None),
+ ]
+
+ # ── Insurance subreddits ──────────────────────────────────────────
+ REDDIT_SUBREDDITS = [
+     "insurance",
+     "InsuranceProfessional",
+     "HealthInsurance",
+     "ActuaryUK",
+     "actuary",
+ ]
+
+ # ── RSS feeds for insurance news ───────────────────────────────────
+ RSS_FEEDS = [
+     "https://www.insurancetimes.co.uk/rss",
+     "https://www.insurancejournal.com/rss/news/",
+     "https://www.reinsurancene.ws/feed/",
+     "https://www.artemis.bm/feed/",
+ ]
collect/convert_sft.py ADDED
@@ -0,0 +1,416 @@
+ """Convert collected real-world insurance data into SFT and DPO training format.
+
+ Strategies:
+ 1. Knowledge Q&A — generate question-answer pairs from article text
+ 2. Summarisation — "Summarise this insurance concept"
+ 3. Regulation interpretation — "What does FCA say about X?"
+ 4. Definition — "Define {term} in insurance context"
+ 5. Scenario analysis — "Given {scenario}, what insurance considerations apply?"
+ 6. Comparison — "Compare {A} and {B} in insurance"
+ """
+
+ import json
+ import logging
+ import random
+ import re
+ import textwrap
+ from pathlib import Path
+
+ from collect.config import PROCESSED_DIR, SFT_OUTPUT, DPO_OUTPUT, RAW_DIR
+
+ logger = logging.getLogger(__name__)
+
+ # ── Templates ──────────────────────────────────────────────────────
+
+ QA_TEMPLATES = [
+     "What is {concept}?",
+     "Explain {concept} in the context of UK insurance.",
+     "How does {concept} work in insurance?",
+     "Define {concept} for an insurance professional.",
+     "What role does {concept} play in the insurance industry?",
+     "Describe {concept} and its importance in insurance.",
+     "As an insurance underwriter, explain {concept}.",
+     "What should a claims handler know about {concept}?",
+     "How is {concept} relevant to insurance regulation in the UK?",
+     "Explain {concept} as it applies to general insurance.",
+ ]
+
+ REGULATION_TEMPLATES = [
+     "What does the FCA require regarding {topic}?",
+     "Explain the regulatory requirements for {topic} in UK insurance.",
+     "How does {topic} affect insurance companies under UK regulation?",
+     "What compliance obligations exist for {topic}?",
+     "Summarise the key regulatory points about {topic}.",
+ ]
+
+ SCENARIO_TEMPLATES = [
+     "A policyholder has filed a claim for {scenario}. What are the key considerations?",
+     "An underwriter is assessing a risk involving {scenario}. What factors should they evaluate?",
+     "A broker needs to advise their client about {scenario}. What guidance should they give?",
+     "An insurance company is developing a product for {scenario}. What are the main considerations?",
+ ]
+
+ COMPARISON_TEMPLATES = [
+     "Compare and contrast {a} and {b} in insurance.",
+     "What are the differences between {a} and {b}?",
+     "When would an insurer choose {a} over {b}?",
+ ]
+
+
+ def _extract_first_paragraph(text: str, max_len: int = 800) -> str:
+     """Extract a clean first paragraph as a concise answer."""
+     paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
+     if not paragraphs:
+         return text[:max_len]
+     # Take first substantive paragraph
+     for p in paragraphs:
+         if len(p) > 50:
+             return p[:max_len]
+     return paragraphs[0][:max_len]
+
+
+ def _extract_key_concepts(text: str) -> list[str]:
+     """Extract key insurance concepts/terms from text."""
+     # Look for bold/capitalized terms, section headers, etc.
+     concepts = set()
+
+     # Find section headers (lines that look like headers)
+     for line in text.split("\n"):
+         line = line.strip()
+         if 3 < len(line) < 80 and not line.endswith("."):
+             if line[0].isupper() and not line.startswith("The "):
+                 # Could be a concept
+                 concepts.add(line.strip("=").strip("#").strip())
+
+     # Find insurance-specific noun phrases (simple heuristic)
+     insurance_terms = re.findall(
+         r'\b([A-Z][a-z]+(?: [A-Z][a-z]+){0,3})\b', text[:5000]
+     )
+     for term in insurance_terms:
+         if len(term) > 3 and any(kw in term.lower() for kw in [
+             "insurance", "reinsur", "claim", "underw", "polic",
+             "premium", "loss", "risk", "cover", "liabil",
+             "indemnit", "act", "regulation", "fca", "lloyd",
+         ]):
+             concepts.add(term)
+
+     return list(concepts)[:10]
+
+
+ def _make_sft_from_knowledge(doc: dict) -> list[dict]:
+     """Create SFT pairs from a knowledge article."""
+     pairs = []
+     title = doc.get("title", "")
+     text = doc.get("text", "")
+
+     if not text or len(text) < 100:
+         return pairs
+
+     # 1. Knowledge Q&A from title concept
+     if title and len(title) > 3:
+         concept = title.replace("_", " ")
+         question = random.choice(QA_TEMPLATES).format(concept=concept)
+
+         # Use the first paragraph (up to 1200 chars) as the answer
+         answer = _extract_first_paragraph(text, max_len=1200)
+         if len(answer) > 50:
+             pairs.append({
+                 "instruction": question,
+                 "response": answer,
+                 "source": doc.get("source", "unknown"),
+                 "category": "knowledge_qa",
+             })
+
+     # 2. Summarisation task
+     if len(text) > 500:
+         chunk = text[:3000]
+         pairs.append({
+             "instruction": f"Summarise the following insurance content:\n\n{chunk}",
+             "response": _extract_first_paragraph(text, max_len=600),
+             "source": doc.get("source", "unknown"),
+             "category": "summarisation",
+         })
+
+     # 3. Extract concepts and create Q&A for each
+     concepts = _extract_key_concepts(text)
+     for concept in concepts[:3]:
+         question = random.choice(QA_TEMPLATES).format(concept=concept)
+         # Find the paragraph that mentions this concept
+         for para in text.split("\n\n"):
+             if concept.lower() in para.lower() and len(para) > 50:
+                 pairs.append({
+                     "instruction": question,
+                     "response": para[:1200],
+                     "source": doc.get("source", "unknown"),
+                     "category": "concept_qa",
+                 })
+                 break
+
+     return pairs
+
+
+ def _make_sft_from_regulation(doc: dict) -> list[dict]:
+     """Create SFT pairs from regulatory documents."""
+     pairs = []
+     text = doc.get("text", "")
+     title = doc.get("title", "")
+     section = doc.get("section", "")
+
+     if not text or len(text) < 100:
+         return pairs
+
+     # Regulatory Q&A
+     topic = title or section
+     if topic:
+         question = random.choice(REGULATION_TEMPLATES).format(topic=topic)
+         answer = _extract_first_paragraph(text, max_len=1500)
+         if len(answer) > 50:
+             pairs.append({
+                 "instruction": question,
+                 "response": answer,
+                 "source": "regulation",
+                 "category": "regulation_qa",
+             })
+
+     # Section-by-section Q&A
+     sections = text.split("\n\n")
+     for i, section_text in enumerate(sections[:5]):
+         if len(section_text) > 100:
+             pairs.append({
+                 "instruction": f"Explain this insurance regulation provision:\n\n{section_text[:500]}",
+                 "response": section_text[:1500],
+                 "source": "regulation",
+                 "category": "regulation_explain",
+             })
+
+     return pairs
+
+
+ def _make_sft_from_legislation(doc: dict) -> list[dict]:
+     """Create SFT pairs from UK insurance legislation."""
+     pairs = []
+     text = doc.get("text", "")
+     act = doc.get("act", "")
+
+     if not text or len(text) < 100:
+         return pairs
+
+     pairs.append({
+         "instruction": (
+             f"Explain the following provision from UK insurance legislation "
+             f"({act}):\n\n{text[:1000]}"
+         ),
+         "response": text[:2000],
+         "source": "uk_legislation",
+         "category": "legislation_qa",
+     })
+
+     return pairs
+
+
+ def _make_sft_from_news(doc: dict) -> list[dict]:
+     """Create SFT pairs from insurance news articles."""
+     pairs = []
+     text = doc.get("text", "")
+     title = doc.get("title", "")
+
+     if not text or len(text) < 200:
+         return pairs
+
+     # Summarisation task
+     pairs.append({
+         "instruction": f"Summarise this insurance industry news article:\n\n{text[:2000]}",
+         "response": _extract_first_paragraph(text, max_len=800),
+         "source": "insurance_news",
+         "category": "news_summary",
+     })
+
+     # Analysis task
+     if title:
+         pairs.append({
+             "instruction": (
+                 f"As an insurance industry analyst, what are the key takeaways "
+                 f"from this article titled '{title}'?\n\n{text[:1500]}"
+             ),
+             "response": _extract_first_paragraph(text, max_len=1000),
+             "source": "insurance_news",
+             "category": "news_analysis",
+         })
+
+     return pairs
+
+
+ def _make_sft_from_hf(doc: dict) -> list[dict]:
+     """Create SFT pairs from HuggingFace dataset rows."""
+     pairs = []
+     row = doc.get("row", {})
+     text = doc.get("text", "")
+
+     if not text or len(text) < 50:
+         return pairs
+
+     # If it has question/answer fields, use directly
+     q = row.get("question", "")
+     a = row.get("answer", row.get("response", ""))
+     if q and a:
+         pairs.append({
+             "instruction": q,
+             "response": a[:2000],
+             "source": doc.get("dataset", "huggingface"),
+             "category": "hf_qa",
+         })
+         return pairs
+
+     # If it has instruction/output fields
+     inst = row.get("instruction", row.get("input", ""))
+     out = row.get("output", row.get("response", ""))
+     if inst and out:
+         pairs.append({
+             "instruction": inst,
+             "response": out[:2000],
+             "source": doc.get("dataset", "huggingface"),
+             "category": "hf_instruction",
+         })
+         return pairs
+
+     # If it has review/sentiment fields
+     review = row.get("review", row.get("text", ""))
+     sentiment = row.get("sentiment", row.get("label", ""))
+     if review and sentiment:
+         pairs.append({
+             "instruction": (
+                 f"Classify the sentiment of this insurance review "
+                 f"as positive, negative, or neutral:\n\n{review[:1000]}"
+             ),
+             "response": f"Sentiment: {sentiment}",
+             "source": doc.get("dataset", "huggingface"),
+             "category": "hf_sentiment",
+         })
+         return pairs
+
+     # Generic: use as knowledge
+     pairs.append({
+         "instruction": f"Explain the following insurance information:\n\n{text[:1000]}",
+         "response": text[:2000],
+         "source": doc.get("dataset", "huggingface"),
+         "category": "hf_knowledge",
+     })
+
+     return pairs
+
+
+ def _make_dpo_pair(sft_pair: dict) -> dict | None:
+     """Create a DPO preference pair from an SFT pair.
+
+     chosen = the good response (real data)
+     rejected = a degraded version (shorter, less specific, generic)
+     """
+     instruction = sft_pair["instruction"]
+     good_response = sft_pair["response"]
+
+     if len(good_response) < 100:
+         return None
+
+     # Create a degraded response (shorter, more generic)
+     bad_strategies = [
+         # Strategy 1: Truncate to first sentence
+         lambda r: r.split(".")[0] + "." if "." in r else r[:50],
+         # Strategy 2: Generic non-answer
+         lambda r: "This is a complex insurance topic that requires careful consideration of many factors.",
+         # Strategy 3: Partial answer (first 20%)
+         lambda r: r[:max(50, len(r) // 5)],
+         # Strategy 4: Wrong focus
+         lambda r: f"While this is an important topic, the key thing to remember is that insurance is about managing risk. {r[:100]}",
+     ]
+
+     bad_response = random.choice(bad_strategies)(good_response)
+
+     return {
+         "instruction": instruction,
+         "chosen": good_response,
+         "rejected": bad_response,
+         "source": sft_pair.get("source", "unknown"),
+     }
+
+
+ def convert_all_to_sft(raw_dir: Path = RAW_DIR) -> tuple[int, int]:
+     """Convert all collected raw documents to SFT and DPO format."""
+     all_sft = []
+     all_dpo = []
+
+     # Processing map: source_type -> converter function
+     converters = {
+         "wikipedia": _make_sft_from_knowledge,
+         "fca_handbook": _make_sft_from_regulation,
+         "uk_legislation": _make_sft_from_legislation,
+         "investopedia": _make_sft_from_knowledge,
+         "insurance_news": _make_sft_from_news,
+         "insurance_news_summary": _make_sft_from_news,
+         "huggingface": _make_sft_from_hf,
+         "exam_syllabus": _make_sft_from_knowledge,
+         "insurance_education": _make_sft_from_knowledge,
+         "insurance_data": _make_sft_from_hf,
+     }
+
+     # Scan all JSONL files in raw subdirectories
+     for source_dir in raw_dir.iterdir():
+         if not source_dir.is_dir():
+             continue
+         for jsonl_file in source_dir.glob("*.jsonl"):
+             logger.info(f"Converting {jsonl_file}...")
+             with open(jsonl_file) as f:
+                 for line in f:
+                     try:
+                         doc = json.loads(line)
+                     except json.JSONDecodeError:
+                         continue
+
+                     source = doc.get("source", "")
+                     converter = converters.get(source, _make_sft_from_knowledge)
+                     sft_pairs = converter(doc)
+
+                     for pair in sft_pairs:
+                         all_sft.append(pair)
+                         # 30% chance of creating DPO pair
+                         if random.random() < 0.3:
+                             dpo = _make_dpo_pair(pair)
+                             if dpo:
+                                 all_dpo.append(dpo)
+
+     # Shuffle
+     random.shuffle(all_sft)
+     random.shuffle(all_dpo)
+
+     # Write SFT
+     with open(SFT_OUTPUT, "w") as f:
+         for pair in all_sft:
+             # Format as chat for Qwen3
+             chat = {
+                 "messages": [
+                     {"role": "system", "content": "You are InsureLLM, an expert UK insurance AI assistant. You provide accurate, detailed, and regulation-aware answers about insurance, underwriting, claims, actuarial science, and UK/EU insurance regulation."},
+                     {"role": "user", "content": pair["instruction"]},
+                     {"role": "assistant", "content": pair["response"]},
+                 ]
+             }
+             f.write(json.dumps(chat, ensure_ascii=False) + "\n")
+
+     # Write DPO
+     with open(DPO_OUTPUT, "w") as f:
+         for pair in all_dpo:
+             dpo_row = {
+                 "prompt": pair["instruction"],
+                 "chosen": pair["chosen"],
+                 "rejected": pair["rejected"],
+             }
+             f.write(json.dumps(dpo_row, ensure_ascii=False) + "\n")
+
+     logger.info(f"SFT: {len(all_sft)} pairs → {SFT_OUTPUT}")
+     logger.info(f"DPO: {len(all_dpo)} pairs → {DPO_OUTPUT}")
+
+     return len(all_sft), len(all_dpo)
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO)
+     sft_count, dpo_count = convert_all_to_sft()
+     print(f"Created {sft_count} SFT pairs and {dpo_count} DPO pairs")
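
For orientation, each line that `convert_all_to_sft` writes to `SFT_OUTPUT` is a chat-format record shaped like the following Python literal (values here are illustrative; the structure matches the writer loop above):

```python
example_row = {
    "messages": [
        {"role": "system", "content": "You are InsureLLM, an expert UK insurance AI assistant. ..."},
        {"role": "user", "content": "Explain subrogation in the context of UK insurance."},
        {"role": "assistant", "content": "Subrogation is the insurer's right to pursue a third party ..."},
    ]
}
```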
collect/run_collection.py ADDED
@@ -0,0 +1,128 @@
+ """Master orchestrator for all data collection sources."""
+
+ import json
+ import logging
+ import sys
+ import time
+ from pathlib import Path
+
+ # Setup logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(message)s",
+     datefmt="%H:%M:%S",
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def run_collection():
+     """Run all data collection sources."""
+     start = time.time()
+     total_docs = 0
+
+     # ── 1. Wikipedia ───────────────────────────────────────────────
+     logger.info("=" * 60)
+     logger.info("1/7 WIKIPEDIA — Insurance articles")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.wikipedia import collect_wikipedia
+         docs = collect_wikipedia(max_articles=400)
+         total_docs += len(docs)
+         logger.info(f"  ✓ Wikipedia: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  ✗ Wikipedia failed: {e}")
+
+     # ── 2. FCA Handbook ────────────────────────────────────────────
+     logger.info("=" * 60)
+     logger.info("2/7 FCA HANDBOOK — UK insurance regulation")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.fca import collect_fca
+         docs = collect_fca()
+         total_docs += len(docs)
+         logger.info(f"  ✓ FCA: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  ✗ FCA failed: {e}")
+
+     # ── 3. UK Legislation ──────────────────────────────────────────
+     logger.info("=" * 60)
+     logger.info("3/7 UK LEGISLATION — Insurance Act 2015 etc.")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.legislation import collect_legislation
+         docs = collect_legislation()
+         total_docs += len(docs)
+         logger.info(f"  ✓ Legislation: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  ✗ Legislation failed: {e}")
+
+     # ── 4. Investopedia ────────────────────────────────────────────
+     logger.info("=" * 60)
+     logger.info("4/7 INVESTOPEDIA — Insurance glossary")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.investopedia import collect_investopedia
+         docs = collect_investopedia()
+         total_docs += len(docs)
+         logger.info(f"  ✓ Investopedia: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  ✗ Investopedia failed: {e}")
+
+     # ── 5. HuggingFace ─────────────────────────────────────────────
+     logger.info("=" * 60)
+     logger.info("5/7 HUGGINGFACE — Insurance datasets")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.hf_datasets import collect_huggingface
+         docs = collect_huggingface()
+         total_docs += len(docs)
+         logger.info(f"  ✓ HuggingFace: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  ✗ HuggingFace failed: {e}")
+
+     # ── 6. RSS / News ──────────────────────────────────────────────
+     logger.info("=" * 60)
+     logger.info("6/7 RSS NEWS — Insurance industry news")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.rss_news import collect_rss
+         docs = collect_rss()
+         total_docs += len(docs)
+         logger.info(f"  ✓ RSS: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  ✗ RSS failed: {e}")
+
+     # ── 7. Education ───────────────────────────────────────────────
+     logger.info("=" * 60)
+     logger.info("7/7 EDUCATION — Open textbooks & exam content")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.education import collect_education
+         docs = collect_education()
+         total_docs += len(docs)
+         logger.info(f"  ✓ Education: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  ✗ Education failed: {e}")
+
+     # ── Convert to SFT ─────────────────────────────────────────────
+     logger.info("=" * 60)
+     logger.info("CONVERTING collected data → SFT + DPO training format")
+     logger.info("=" * 60)
+     try:
+         from collect.convert_sft import convert_all_to_sft
+         sft_count, dpo_count = convert_all_to_sft()
+         logger.info(f"  ✓ SFT pairs: {sft_count}")
+         logger.info(f"  ✓ DPO pairs: {dpo_count}")
+     except Exception as e:
+         logger.error(f"  ✗ SFT conversion failed: {e}")
+
+     elapsed = time.time() - start
+     logger.info("=" * 60)
+     logger.info("COLLECTION COMPLETE")
+     logger.info(f"  Total documents: {total_docs:,}")
+     logger.info(f"  Time elapsed: {elapsed / 60:.1f} minutes")
+     logger.info("=" * 60)
+
+
+ if __name__ == "__main__":
+     run_collection()
collect/run_fast.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fast data collection -- reduced Wikipedia cap, lower API delay."""

import logging
import time

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# Override delay for API sources (the Wikipedia API is generous).
# BaseScraper reads collect.config.REQUEST_DELAY at call time, so setting
# the module attribute here takes effect for every subsequent request.
import collect.config as cfg
cfg.REQUEST_DELAY = 0.5


def run_fast():
    start = time.time()
    total_docs = 0

    # 1. Wikipedia (cap at 150 -- still 2M+ chars of insurance knowledge)
    logger.info("=" * 60)
    logger.info("1/7 WIKIPEDIA -- Insurance articles (max 150)")
    logger.info("=" * 60)
    try:
        from collect.sources.wikipedia import collect_wikipedia
        docs = collect_wikipedia(max_articles=150)
        total_docs += len(docs)
        logger.info(f" => Wikipedia: {len(docs)} documents")
    except Exception as e:
        logger.error(f" Wikipedia failed: {e}", exc_info=True)

    # 2. FCA Handbook
    cfg.REQUEST_DELAY = 1.5  # Web scraping -- be polite
    logger.info("=" * 60)
    logger.info("2/7 FCA HANDBOOK")
    logger.info("=" * 60)
    try:
        from collect.sources.fca import collect_fca
        docs = collect_fca()
        total_docs += len(docs)
        logger.info(f" => FCA: {len(docs)} documents")
    except Exception as e:
        logger.error(f" FCA failed: {e}", exc_info=True)

    # 3. UK Legislation
    logger.info("=" * 60)
    logger.info("3/7 UK LEGISLATION")
    logger.info("=" * 60)
    try:
        from collect.sources.legislation import collect_legislation
        docs = collect_legislation()
        total_docs += len(docs)
        logger.info(f" => Legislation: {len(docs)} documents")
    except Exception as e:
        logger.error(f" Legislation failed: {e}", exc_info=True)

    # 4. Investopedia
    logger.info("=" * 60)
    logger.info("4/7 INVESTOPEDIA")
    logger.info("=" * 60)
    try:
        from collect.sources.investopedia import collect_investopedia
        docs = collect_investopedia()
        total_docs += len(docs)
        logger.info(f" => Investopedia: {len(docs)} documents")
    except Exception as e:
        logger.error(f" Investopedia failed: {e}", exc_info=True)

    # 5. HuggingFace
    cfg.REQUEST_DELAY = 0.3
    logger.info("=" * 60)
    logger.info("5/7 HUGGINGFACE DATASETS")
    logger.info("=" * 60)
    try:
        from collect.sources.hf_datasets import collect_huggingface
        docs = collect_huggingface()
        total_docs += len(docs)
        logger.info(f" => HuggingFace: {len(docs)} documents")
    except Exception as e:
        logger.error(f" HuggingFace failed: {e}", exc_info=True)

    # 6. RSS News
    cfg.REQUEST_DELAY = 1.0
    logger.info("=" * 60)
    logger.info("6/7 RSS NEWS")
    logger.info("=" * 60)
    try:
        from collect.sources.rss_news import collect_rss
        docs = collect_rss()
        total_docs += len(docs)
        logger.info(f" => RSS: {len(docs)} documents")
    except Exception as e:
        logger.error(f" RSS failed: {e}", exc_info=True)

    # 7. Education
    logger.info("=" * 60)
    logger.info("7/7 EDUCATION")
    logger.info("=" * 60)
    try:
        from collect.sources.education import collect_education
        docs = collect_education()
        total_docs += len(docs)
        logger.info(f" => Education: {len(docs)} documents")
    except Exception as e:
        logger.error(f" Education failed: {e}", exc_info=True)

    # Convert to SFT + DPO
    logger.info("=" * 60)
    logger.info("CONVERTING -> SFT + DPO format")
    logger.info("=" * 60)
    try:
        from collect.convert_sft import convert_all_to_sft
        sft_count, dpo_count = convert_all_to_sft()
        logger.info(f" => SFT pairs: {sft_count}")
        logger.info(f" => DPO pairs: {dpo_count}")
    except Exception as e:
        logger.error(f" SFT conversion failed: {e}", exc_info=True)

    elapsed = time.time() - start
    logger.info("=" * 60)
    logger.info(f"DONE -- {total_docs:,} documents in {elapsed / 60:.1f} min")
    logger.info("=" * 60)


if __name__ == "__main__":
    run_fast()
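For reference, the fast pipeline can also be driven programmatically; a minimal sketch, assuming the repository root is on `sys.path` so the `collect` package imports resolve:

```python
# Minimal sketch: invoke the fast collector from Python instead of the CLI.
from collect.run_fast import run_fast

run_fast()  # logs per-source progress; raw documents land under collect/raw/
```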
collect/scraper_base.py ADDED
@@ -0,0 +1,102 @@
"""Base scraper with rate limiting, retries, and polite crawling."""

import time
import json
import hashlib
import logging
from pathlib import Path
from typing import Optional

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import collect.config as config
from collect.config import HEADERS, MAX_RETRIES, TIMEOUT, RAW_DIR

logger = logging.getLogger(__name__)


class BaseScraper:
    """Polite web scraper with rate limiting and caching."""

    def __init__(self, source_name: str):
        self.source_name = source_name
        self.output_dir = RAW_DIR / source_name
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.session = self._build_session()
        self._last_request_time = 0.0
        self.stats = {"fetched": 0, "cached": 0, "failed": 0, "total_chars": 0}

    def _build_session(self) -> requests.Session:
        session = requests.Session()
        session.headers.update(HEADERS)
        retry = Retry(
            total=MAX_RETRIES,
            backoff_factor=1.0,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        return session

    def _rate_limit(self):
        # Read the delay from the config module at call time so callers
        # (e.g. run_fast.py) can retune collect.config.REQUEST_DELAY per source.
        elapsed = time.time() - self._last_request_time
        if elapsed < config.REQUEST_DELAY:
            time.sleep(config.REQUEST_DELAY - elapsed)
        self._last_request_time = time.time()

    def _cache_key(self, url: str) -> str:
        return hashlib.sha256(url.encode()).hexdigest()[:16]

    def _cache_path(self, url: str) -> Path:
        return self.output_dir / f"{self._cache_key(url)}.json"

    def fetch(self, url: str, force: bool = False) -> Optional[str]:
        """Fetch URL content with caching and rate limiting."""
        cache = self._cache_path(url)
        if not force and cache.exists():
            data = json.loads(cache.read_text())
            self.stats["cached"] += 1
            return data.get("content")

        self._rate_limit()
        try:
            resp = self.session.get(url, timeout=TIMEOUT)
            resp.raise_for_status()
            content = resp.text
            # Cache the result (UTC timestamp, as the "Z" suffix claims)
            cache.write_text(json.dumps({
                "url": url,
                "status": resp.status_code,
                "content": content,
                "fetched_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            }))
            self.stats["fetched"] += 1
            self.stats["total_chars"] += len(content)
            return content
        except Exception as e:
            logger.warning(f"[{self.source_name}] Failed to fetch {url}: {e}")
            self.stats["failed"] += 1
            return None

    def save_documents(self, documents: list[dict], filename: str = "documents.jsonl"):
        """Save collected documents as JSONL."""
        out = self.output_dir / filename
        with open(out, "w", encoding="utf-8") as f:
            for doc in documents:
                f.write(json.dumps(doc, ensure_ascii=False) + "\n")
        logger.info(f"[{self.source_name}] Saved {len(documents)} docs -> {out}")
        return out

    def print_stats(self):
        logger.info(
            f"[{self.source_name}] Stats: "
            f"fetched={self.stats['fetched']}, "
            f"cached={self.stats['cached']}, "
            f"failed={self.stats['failed']}, "
            f"chars={self.stats['total_chars']:,}"
        )
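A new source only needs to subclass `BaseScraper` and call `fetch()`; a minimal sketch (the class, source name, and target URL are illustrative, not part of the repo):

```python
from collect.scraper_base import BaseScraper


class GlossaryScraper(BaseScraper):
    """Hypothetical example source built on the base class."""

    def __init__(self):
        # Creates collect/raw/example_glossary/ and a stats bucket
        super().__init__("example_glossary")

    def collect(self) -> list[dict]:
        # fetch() is rate-limited, retried, and cached on disk
        html = self.fetch("https://example.com/insurance-glossary")
        docs = []
        if html:
            docs.append({"title": "Example glossary", "text": html,
                         "source": "example_glossary",
                         "category": "insurance_education"})
        self.save_documents(docs)
        self.print_stats()
        return docs
```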
collect/sources/__init__.py ADDED
@@ -0,0 +1,108 @@
"""Scrape insurance articles from Wikipedia via the public MediaWiki API."""

import json
import logging
from typing import Optional

from collect.scraper_base import BaseScraper
from collect.config import WIKIPEDIA_SEED_ARTICLES

logger = logging.getLogger(__name__)

API = "https://en.wikipedia.org/w/api.php"


class WikipediaScraper(BaseScraper):
    def __init__(self):
        super().__init__("wikipedia")

    def _get_article_text(self, title: str) -> Optional[dict]:
        """Get plain-text extract of a Wikipedia article via API."""
        url = (
            f"{API}?action=query&titles={title}"
            f"&prop=extracts&explaintext=1&exsectionformat=plain"
            f"&format=json&redirects=1"
        )
        raw = self.fetch(url)
        if not raw:
            return None
        data = json.loads(raw)
        pages = data.get("query", {}).get("pages", {})
        for pid, page in pages.items():
            if pid == "-1":
                return None
            text = page.get("extract", "")
            if len(text) < 200:
                return None
            return {
                "title": page.get("title", title),
                "text": text,
                "source": "wikipedia",
                "url": f"https://en.wikipedia.org/wiki/{title}",
                "category": "insurance_knowledge",
            }
        return None

    def _get_linked_articles(self, title: str, limit: int = 20) -> list[str]:
        """Get insurance-related links from an article."""
        url = (
            f"{API}?action=query&titles={title}"
            f"&prop=links&pllimit={limit}&plnamespace=0"
            f"&format=json&redirects=1"
        )
        raw = self.fetch(url)
        if not raw:
            return []
        data = json.loads(raw)
        pages = data.get("query", {}).get("pages", {})
        links = []
        insurance_keywords = {
            "insurance", "insur", "underw", "claim", "polic",
            "premium", "actuar", "reinsur", "liabil", "indemnit",
            "risk", "loss", "peril", "cover", "broker",
            "lloyd", "solvency", "fca", "pra", "regul",
        }
        for page in pages.values():
            for link in page.get("links", []):
                link_title = link.get("title", "")
                lower = link_title.lower()
                if any(kw in lower for kw in insurance_keywords):
                    links.append(link_title.replace(" ", "_"))
        return links

    def collect(self, max_articles: int = 500) -> list[dict]:
        """Collect Wikipedia insurance articles with link expansion."""
        documents = []
        visited = set()
        queue = list(WIKIPEDIA_SEED_ARTICLES)

        while queue and len(documents) < max_articles:
            title = queue.pop(0)
            if title in visited:
                continue
            visited.add(title)

            doc = self._get_article_text(title)
            if doc:
                documents.append(doc)
                logger.info(
                    f" [{len(documents)}/{max_articles}] {doc['title']} "
                    f"({len(doc['text']):,} chars)"
                )
                # Expand links from this article
                if len(documents) < max_articles:
                    new_links = self._get_linked_articles(title)
                    for link in new_links:
                        if link not in visited:
                            queue.append(link)

        self.save_documents(documents)
        self.print_stats()
        return documents


def collect_wikipedia(max_articles: int = 500) -> list[dict]:
    scraper = WikipediaScraper()
    return scraper.collect(max_articles)
collect/sources/education.py ADDED
@@ -0,0 +1,167 @@
"""Collect open insurance educational content and textbook excerpts."""

import logging
import re

from bs4 import BeautifulSoup

from collect.scraper_base import BaseScraper

logger = logging.getLogger(__name__)

# Public insurance educational resources
EDUCATIONAL_URLS = [
    # IRMI (International Risk Management Institute) - public glossary
    ("https://www.irmi.com/term/insurance-definitions", "irmi_glossary"),
    # III (Insurance Information Institute) - public fact sheets
    ("https://www.iii.org/insurance-topics", "iii_topics"),
    # CAS (Casualty Actuarial Society) - public resources
    ("https://www.casact.org/publications-research", "cas_research"),
    # SOA (Society of Actuaries) - public resources
    ("https://www.soa.org/resources/research-reports/", "soa_research"),
    # UK ABI (Association of British Insurers) - public resources
    ("https://www.abi.org.uk/data-and-resources/", "abi_data"),
    # Lloyd's - public market info
    ("https://www.lloyds.com/about-lloyds", "lloyds_about"),
    # Swiss Re - sigma reports (public abstracts)
    ("https://www.swissre.com/institute/research/sigma-research.html", "swissre_sigma"),
]

# Open actuarial textbook content (Loss Data Analytics - open source)
OPEN_TEXTBOOK_CHAPTERS = [
    "https://openacttexts.github.io/Loss-Data-Analytics/",
]

# Insurance exam prep - public syllabus material
EXAM_TOPICS = {
    "CII_IF1": [
        "Principles of insurance", "Insurance contract law",
        "Types of insurance", "The insurance market",
        "Insurance regulation in the UK", "Claims handling",
        "Underwriting principles", "Reinsurance basics",
        "Risk management fundamentals", "Insurance intermediaries",
    ],
    "CII_IF2": [
        "General insurance business", "Property insurance",
        "Liability insurance", "Motor insurance",
        "Marine aviation and transport insurance",
        "Financial lines insurance", "Specialty insurance",
    ],
    "IFoA_CP1": [
        "Actuarial risk management", "Insurance pricing models",
        "Generalized linear models in insurance",
        "Loss reserving methods", "Chain ladder technique",
        "Bornhuetter-Ferguson method", "Solvency II capital modelling",
        "Risk measures and capital requirements",
    ],
}


class EducationCollector(BaseScraper):
    def __init__(self):
        super().__init__("education")

    def _scrape_page(self, url: str, source_name: str) -> list[dict]:
        """Scrape educational content from a page and its links."""
        html = self.fetch(url)
        if not html:
            return []

        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(["nav", "footer", "script", "style", "aside"]):
            tag.decompose()

        documents = []

        # Get main page content
        main = soup.find("main") or soup.find("article") or soup.find("body")
        if main:
            text = main.get_text(separator="\n", strip=True)
            text = re.sub(r"\n{3,}", "\n\n", text)
            if len(text) > 200:
                title = ""
                h1 = soup.find("h1")
                if h1:
                    title = h1.get_text(strip=True)
                documents.append({
                    "title": title or source_name,
                    "text": text[:30000],
                    "source": source_name,
                    "url": url,
                    "category": "insurance_education",
                })

        # Follow internal links
        base_domain = "/".join(url.split("/")[:3])
        for a in soup.find_all("a", href=True)[:20]:
            href = a["href"]
            if href.startswith("/"):
                href = base_domain + href
            if href.startswith(base_domain) and href != url:
                sub_html = self.fetch(href)
                if sub_html:
                    sub_soup = BeautifulSoup(sub_html, "html.parser")
                    for tag in sub_soup.find_all(["nav", "footer", "script", "style"]):
                        tag.decompose()
                    sub_main = sub_soup.find("main") or sub_soup.find("article")
                    if sub_main:
                        sub_text = sub_main.get_text(separator="\n", strip=True)
                        if len(sub_text) > 200:
                            sub_title = ""
                            h1 = sub_soup.find("h1")
                            if h1:
                                sub_title = h1.get_text(strip=True)
                            documents.append({
                                "title": sub_title or href.split("/")[-1],
                                "text": sub_text[:20000],
                                "source": source_name,
                                "url": href,
                                "category": "insurance_education",
                            })

        return documents

    def _generate_exam_knowledge(self) -> list[dict]:
        """Generate knowledge documents from insurance exam topics."""
        documents = []
        for exam, topics in EXAM_TOPICS.items():
            for topic in topics:
                # Create a structured knowledge entry
                documents.append({
                    "title": f"{exam}: {topic}",
                    "text": f"Insurance Exam Topic: {topic}\n"
                            f"Exam: {exam}\n"
                            f"This topic covers the key concepts, principles, "
                            f"and practical applications of {topic.lower()} "
                            f"in the context of UK insurance practice.",
                    "source": "exam_syllabus",
                    "category": "insurance_education",
                    "exam": exam,
                })
        return documents

    def collect(self) -> list[dict]:
        """Collect educational insurance content."""
        documents = []

        for url, source_name in EDUCATIONAL_URLS:
            logger.info(f" Scraping education source: {source_name}")
            docs = self._scrape_page(url, source_name)
            documents.extend(docs)
            logger.info(f" Got {len(docs)} documents")

        # Exam knowledge
        exam_docs = self._generate_exam_knowledge()
        documents.extend(exam_docs)
        logger.info(f" Generated {len(exam_docs)} exam topic entries")

        self.save_documents(documents)
        self.print_stats()
        return documents


def collect_education() -> list[dict]:
    collector = EducationCollector()
    return collector.collect()
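For reference, each syllabus topic above becomes one synthetic knowledge entry; the shape of a generated document, with values abridged from the code above:

```python
# Shape of one entry produced by _generate_exam_knowledge():
entry = {
    "title": "CII_IF1: Principles of insurance",
    "text": "Insurance Exam Topic: Principles of insurance\nExam: CII_IF1\n...",
    "source": "exam_syllabus",
    "category": "insurance_education",
    "exam": "CII_IF1",
}
```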
collect/sources/fca.py ADDED
@@ -0,0 +1,100 @@
"""Scrape FCA Handbook sections relevant to insurance."""

import logging
import re
from typing import Optional

from bs4 import BeautifulSoup

from collect.scraper_base import BaseScraper
from collect.config import FCA_HANDBOOK_SECTIONS, FCA_BASE_URL

logger = logging.getLogger(__name__)


class FCAHandbookScraper(BaseScraper):
    def __init__(self):
        super().__init__("fca_handbook")

    def _scrape_section_index(self, section: str, cap: int = 50) -> list[str]:
        """Get chapter URLs from a handbook section index page."""
        url = f"{FCA_BASE_URL}/{section}"
        html = self.fetch(url)
        if not html:
            return []
        soup = BeautifulSoup(html, "html.parser")
        links = []
        for a in soup.find_all("a", href=True):
            href = a["href"]
            # Match chapter links like /ICOBS/1 or /ICOBS/1/1
            if re.match(rf"/{section}/\d+", href):
                full_url = f"{FCA_BASE_URL}{href}"
                if full_url not in links:
                    links.append(full_url)
        return links[:cap]  # Cap per section

    def _scrape_chapter(self, url: str, section: str) -> Optional[dict]:
        """Scrape text content from a handbook chapter page."""
        html = self.fetch(url)
        if not html:
            return None
        soup = BeautifulSoup(html, "html.parser")

        # Remove nav, footer, scripts
        for tag in soup.find_all(["nav", "footer", "script", "style", "aside"]):
            tag.decompose()

        # Get main content area
        content = soup.find("main") or soup.find("div", class_="handbook-content")
        if not content:
            content = soup.find("body")
        if not content:
            return None

        text = content.get_text(separator="\n", strip=True)
        # Clean up excessive whitespace
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r"[ \t]+", " ", text)

        if len(text) < 100:
            return None

        title_tag = soup.find("h1") or soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else url.split("/")[-1]

        return {
            "title": f"FCA Handbook - {section} - {title}",
            "text": text[:50000],  # Cap at 50k chars per page
            "source": "fca_handbook",
            "url": url,
            "section": section,
            "category": "regulation",
        }

    def collect(self, max_per_section: int = 50) -> list[dict]:
        """Collect FCA handbook content for insurance-related sections."""
        documents = []

        for section in FCA_HANDBOOK_SECTIONS:
            logger.info(f" Scraping FCA section: {section}")
            chapter_urls = self._scrape_section_index(section, cap=max_per_section)
            logger.info(f" Found {len(chapter_urls)} chapters")

            for url in chapter_urls:
                doc = self._scrape_chapter(url, section)
                if doc:
                    documents.append(doc)
                    logger.info(
                        f" [{len(documents)}] {doc['title'][:60]} "
                        f"({len(doc['text']):,} chars)"
                    )

        self.save_documents(documents)
        self.print_stats()
        return documents


def collect_fca(max_per_section: int = 50) -> list[dict]:
    scraper = FCAHandbookScraper()
    return scraper.collect(max_per_section)
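The chapter filter hinges on the `re.match` pattern above; a quick check of what it keeps and drops:

```python
import re

section = "ICOBS"
# Kept: chapter paths such as /ICOBS/1 or /ICOBS/1/1
assert re.match(rf"/{section}/\d+", "/ICOBS/1/1")
# Dropped: non-chapter links under the same section
assert re.match(rf"/{section}/\d+", "/ICOBS/about") is None
```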
collect/sources/hf_datasets.py ADDED
@@ -0,0 +1,133 @@
"""Download and process insurance datasets from HuggingFace."""

import json
import logging
from typing import Optional

from collect.scraper_base import BaseScraper
from collect.config import HF_DATASETS

logger = logging.getLogger(__name__)


class HuggingFaceCollector(BaseScraper):
    def __init__(self):
        super().__init__("huggingface")

    def _download_dataset(self, name: str, config: Optional[str]) -> list[dict]:
        """Download a HuggingFace dataset and convert to our format."""
        try:
            from datasets import load_dataset
        except ImportError:
            logger.error("Install `datasets`: pip install datasets")
            return []

        documents = []
        try:
            logger.info(f" Downloading HF dataset: {name}")
            kwargs = {"trust_remote_code": True}
            if config:
                kwargs["name"] = config
            ds = load_dataset(name, **kwargs)

            # Process each split
            for split_name, split_data in ds.items():
                logger.info(f" Split '{split_name}': {len(split_data)} rows")
                for i, row in enumerate(split_data):
                    doc = self._row_to_document(row, name, split_name, i)
                    if doc:
                        documents.append(doc)

        except Exception as e:
            logger.warning(f" Failed to load {name}: {e}")

        return documents

    def _row_to_document(self, row: dict, dataset_name: str,
                         split: str, idx: int) -> Optional[dict]:
        """Convert a dataset row to a document dict."""
        # Try common text field names
        text_fields = ["text", "content", "question", "answer", "document",
                       "input", "output", "instruction", "response",
                       "review", "comment", "body", "description"]

        texts = []
        for field in text_fields:
            if field in row and row[field] and isinstance(row[field], str):
                texts.append(f"{field}: {row[field]}")

        # Also grab any other string fields
        for k, v in row.items():
            if isinstance(v, str) and k not in text_fields and len(v) > 20:
                texts.append(f"{k}: {v}")

        if not texts:
            return None

        combined = "\n".join(texts)
        if len(combined) < 50:
            return None

        return {
            "title": f"{dataset_name}/{split}/{idx}",
            "text": combined[:30000],
            "source": "huggingface",
            "dataset": dataset_name,
            "split": split,
            "category": "insurance_data",
            "row": {k: str(v)[:500] for k, v in row.items()
                    if isinstance(v, (str, int, float))},
        }

    def collect(self) -> list[dict]:
        """Download all configured HuggingFace insurance datasets."""
        all_documents = []

        for name, config in HF_DATASETS:
            docs = self._download_dataset(name, config)
            all_documents.extend(docs)
            logger.info(f" Collected {len(docs)} docs from {name}")

        self.save_documents(all_documents)
        self.print_stats()
        return all_documents


# Also search HuggingFace for more insurance datasets
class HuggingFaceSearcher(BaseScraper):
    """Search HuggingFace Hub API for insurance-tagged datasets."""

    def __init__(self):
        super().__init__("hf_search")

    def search_datasets(self, query: str = "insurance", limit: int = 50) -> list[str]:
        """Search HuggingFace Hub for insurance datasets."""
        url = (
            f"https://huggingface.co/api/datasets"
            f"?search={query}&limit={limit}&sort=downloads&direction=-1"
        )
        raw = self.fetch(url)
        if not raw:
            return []
        try:
            results = json.loads(raw)
            names = [r["id"] for r in results if isinstance(r, dict) and "id" in r]
            logger.info(f" Found {len(names)} HF datasets for '{query}'")
            return names
        except Exception as e:
            logger.warning(f" HF search failed: {e}")
            return []


def collect_huggingface() -> list[dict]:
    collector = HuggingFaceCollector()
    return collector.collect()


def search_hf_datasets() -> list[str]:
    searcher = HuggingFaceSearcher()
    found = []
    for q in ["insurance", "insurance claims", "actuarial",
              "insurance underwriting", "insurance fraud"]:
        found.extend(searcher.search_datasets(q))
    return list(set(found))
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scrape Investopedia insurance glossary terms."""
2
+
3
+ import json
4
+ import logging
5
+ import re
6
+ from typing import Optional
7
+
8
+ from bs4 import BeautifulSoup
9
+
10
+ from collect.scraper_base import BaseScraper
11
+ from collect.config import INVESTOPEDIA_TERMS
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ BASE = "https://www.investopedia.com/terms"
16
+
17
+
18
+ class InvestopediaScraper(BaseScraper):
19
+ def __init__(self):
20
+ super().__init__("investopedia")
21
+
22
+ def _scrape_term(self, term: str) -> Optional[dict]:
23
+ """Scrape a single Investopedia insurance term."""
24
+ # Investopedia URL pattern: /terms/{first_letter}/{term}.asp
25
+ # OR newer: /terms/{term}-{number}
26
+ # Try the common patterns
27
+ first_letter = term[0].lower()
28
+ urls_to_try = [
29
+ f"https://www.investopedia.com/{term}-5075091",
30
+ f"https://www.investopedia.com/{term}-definition-5075091",
31
+ f"https://www.investopedia.com/terms/{first_letter}/{term}.asp",
32
+ f"https://www.investopedia.com/terms/{first_letter}/{term}",
33
+ ]
34
+
35
+ for url in urls_to_try:
36
+ html = self.fetch(url)
37
+ if not html or "404" in html[:500]:
38
+ continue
39
+
40
+ soup = BeautifulSoup(html, "html.parser")
41
+
42
+ # Remove ads, nav, etc
43
+ for tag in soup.find_all(["nav", "footer", "script", "style",
44
+ "aside", "header", "figure"]):
45
+ tag.decompose()
46
+
47
+ # Get article content
48
+ article = soup.find("article") or soup.find("div", class_="article-body")
49
+ if not article:
50
+ article = soup.find("main")
51
+ if not article:
52
+ continue
53
+
54
+ text = article.get_text(separator="\n", strip=True)
55
+ text = re.sub(r"\n{3,}", "\n\n", text)
56
+
57
+ if len(text) < 200:
58
+ continue
59
+
60
+ title = ""
61
+ h1 = soup.find("h1")
62
+ if h1:
63
+ title = h1.get_text(strip=True)
64
+ if not title:
65
+ title = term.replace("-", " ").title()
66
+
67
+ return {
68
+ "title": title,
69
+ "text": text[:20000],
70
+ "source": "investopedia",
71
+ "url": url,
72
+ "term": term,
73
+ "category": "insurance_education",
74
+ }
75
+ return None
76
+
77
+ def collect(self) -> list[dict]:
78
+ """Collect Investopedia insurance term definitions."""
79
+ documents = []
80
+
81
+ for i, term in enumerate(INVESTOPEDIA_TERMS):
82
+ doc = self._scrape_term(term)
83
+ if doc:
84
+ documents.append(doc)
85
+ logger.info(
86
+ f" [{len(documents)}/{len(INVESTOPEDIA_TERMS)}] {doc['title'][:50]} "
87
+ f"({len(doc['text']):,} chars)"
88
+ )
89
+ else:
90
+ logger.debug(f" Skipped: {term}")
91
+
92
+ if (i + 1) % 20 == 0:
93
+ logger.info(f" Progress: {i + 1}/{len(INVESTOPEDIA_TERMS)} terms checked")
94
+
95
+ self.save_documents(documents)
96
+ self.print_stats()
97
+ return documents
98
+
99
+
100
+ def collect_investopedia() -> list[dict]:
101
+ scraper = InvestopediaScraper()
102
+ return scraper.collect()
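Of the candidate URL patterns above, the classic `/terms/{letter}/{term}.asp` form is the one most likely to resolve; the fixed numeric slug is a single article id and will generally 404 for other terms:

```python
term = "coinsurance"
print(f"https://www.investopedia.com/terms/{term[0].lower()}/{term}.asp")
# -> https://www.investopedia.com/terms/c/coinsurance.asp
```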
collect/sources/legislation.py ADDED
@@ -0,0 +1,118 @@
"""Scrape UK insurance legislation from legislation.gov.uk."""

import logging
import re
from typing import Optional

from bs4 import BeautifulSoup

from collect.scraper_base import BaseScraper
from collect.config import UK_LEGISLATION_URLS

logger = logging.getLogger(__name__)


class LegislationScraper(BaseScraper):
    def __init__(self):
        super().__init__("uk_legislation")

    def _get_section_urls(self, contents_url: str) -> list[str]:
        """Parse a legislation contents page to get individual section URLs."""
        html = self.fetch(contents_url)
        if not html:
            return []
        soup = BeautifulSoup(html, "html.parser")
        urls = []
        for a in soup.find_all("a", href=True):
            href = a["href"]
            # Match section links
            if "/section/" in href or "/part/" in href or "/schedule/" in href:
                if href.startswith("/"):
                    href = f"https://www.legislation.gov.uk{href}"
                if href not in urls:
                    urls.append(href)
        return urls[:100]  # Cap per act

    def _scrape_section(self, url: str, act_name: str) -> Optional[dict]:
        """Scrape text from a legislation section."""
        html = self.fetch(url)
        if not html:
            return None
        soup = BeautifulSoup(html, "html.parser")

        # Remove navigation and chrome
        for tag in soup.find_all(["nav", "footer", "script", "style", "aside", "header"]):
            tag.decompose()

        # legislation.gov.uk uses class="LegP1" etc for legislation paragraphs
        content_div = (
            soup.find("div", id="viewLegSnippet")
            or soup.find("div", class_="LegClearFix")
            or soup.find("article")
            or soup.find("main")
        )
        if not content_div:
            content_div = soup.find("body")
        if not content_div:
            return None

        text = content_div.get_text(separator="\n", strip=True)
        text = re.sub(r"\n{3,}", "\n\n", text)

        if len(text) < 50:
            return None

        title_tag = soup.find("h1") or soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else ""

        return {
            "title": f"UK Law - {act_name} - {title}".strip(" - "),
            "text": text[:30000],
            "source": "uk_legislation",
            "url": url,
            "act": act_name,
            "category": "legislation",
        }

    def _extract_act_name(self, url: str) -> str:
        """Extract the act name from a URL."""
        parts = url.rstrip("/").split("/")
        # e.g. https://www.legislation.gov.uk/ukpga/2015/4/contents -> "2015/4"
        if "contents" in parts:
            idx = parts.index("contents")
            return "/".join(parts[max(0, idx - 2):idx])
        return parts[-1]

    def collect(self) -> list[dict]:
        """Collect UK insurance legislation."""
        documents = []

        for contents_url in UK_LEGISLATION_URLS:
            act_name = self._extract_act_name(contents_url)
            logger.info(f" Scraping legislation: {act_name}")

            # First, scrape the contents page itself for an overview
            overview = self._scrape_section(contents_url, act_name)
            if overview:
                documents.append(overview)

            # Then get individual sections
            section_urls = self._get_section_urls(contents_url)
            logger.info(f" Found {len(section_urls)} sections")

            for url in section_urls:
                doc = self._scrape_section(url, act_name)
                if doc:
                    documents.append(doc)

            logger.info(f" Collected {len(documents)} total so far")

        self.save_documents(documents)
        self.print_stats()
        return documents


def collect_legislation() -> list[dict]:
    scraper = LegislationScraper()
    return scraper.collect()
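A worked example of the act-name extraction: the two path segments before `contents` (year and chapter number) become the label:

```python
from collect.sources.legislation import LegislationScraper

s = LegislationScraper()
print(s._extract_act_name("https://www.legislation.gov.uk/ukpga/2015/4/contents"))
# -> 2015/4  (the Insurance Act 2015)
```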
collect/sources/rss_news.py ADDED
@@ -0,0 +1,122 @@
"""Collect insurance content from RSS feeds and news sources."""

import logging
import re
from typing import Optional

from bs4 import BeautifulSoup

from collect.scraper_base import BaseScraper
from collect.config import RSS_FEEDS

logger = logging.getLogger(__name__)


class RSSCollector(BaseScraper):
    def __init__(self):
        super().__init__("rss_news")

    def _parse_feed(self, feed_url: str) -> list[dict]:
        """Parse an RSS/Atom feed and extract article URLs."""
        xml = self.fetch(feed_url)
        if not xml:
            return []

        # The "xml" parser requires lxml to be installed
        soup = BeautifulSoup(xml, "xml")
        items = soup.find_all("item") or soup.find_all("entry")
        articles = []

        for item in items[:30]:  # Cap per feed
            title = item.find("title")
            link = item.find("link")
            desc = item.find("description") or item.find("summary") or item.find("content")

            title_text = title.get_text(strip=True) if title else ""
            link_text = ""
            if link:
                link_text = link.get("href", "") or link.get_text(strip=True)
            desc_text = desc.get_text(strip=True) if desc else ""

            # Clean HTML from the description
            if desc_text:
                desc_soup = BeautifulSoup(desc_text, "html.parser")
                desc_text = desc_soup.get_text(separator=" ", strip=True)

            if title_text and (desc_text or link_text):
                articles.append({
                    "title": title_text,
                    "url": link_text,
                    "summary": desc_text[:5000],
                })

        return articles

    def _scrape_article(self, url: str, title: str) -> Optional[dict]:
        """Try to scrape the full article text from a URL."""
        if not url or not url.startswith("http"):
            return None

        html = self.fetch(url)
        if not html:
            return None

        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(["nav", "footer", "script", "style",
                                  "aside", "header", "figure", "iframe"]):
            tag.decompose()

        article = (
            soup.find("article")
            or soup.find("div", class_=re.compile(r"article|content|post|entry"))
            or soup.find("main")
        )
        if not article:
            return None

        text = article.get_text(separator="\n", strip=True)
        text = re.sub(r"\n{3,}", "\n\n", text)

        if len(text) < 200:
            return None

        return {
            "title": title,
            "text": text[:20000],
            "source": "insurance_news",
            "url": url,
            "category": "insurance_news",
        }

    def collect(self) -> list[dict]:
        """Collect articles from insurance RSS feeds."""
        documents = []

        for feed_url in RSS_FEEDS:
            logger.info(f" Parsing feed: {feed_url}")
            articles = self._parse_feed(feed_url)
            logger.info(f" Found {len(articles)} articles")

            for article in articles:
                # Try to get the full article
                doc = self._scrape_article(article["url"], article["title"])
                if doc:
                    documents.append(doc)
                elif article["summary"] and len(article["summary"]) > 100:
                    # Fall back to the RSS summary
                    documents.append({
                        "title": article["title"],
                        "text": article["summary"],
                        "source": "insurance_news_summary",
                        "url": article["url"],
                        "category": "insurance_news",
                    })

        self.save_documents(documents)
        self.print_stats()
        return documents


def collect_rss() -> list[dict]:
    collector = RSSCollector()
    return collector.collect()
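Note that BeautifulSoup's `"xml"` parser depends on lxml; a hedged fallback sketch if lxml may be absent:

```python
from bs4 import BeautifulSoup, FeatureNotFound

def parse_feed_xml(xml: str) -> BeautifulSoup:
    # Prefer the strict XML parser (needs lxml); fall back to the lenient
    # built-in HTML parser, which still finds <item>/<entry> tags.
    try:
        return BeautifulSoup(xml, "xml")
    except FeatureNotFound:
        return BeautifulSoup(xml, "html.parser")
```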
collect/sources/wikipedia.py ADDED
@@ -0,0 +1,106 @@
"""Scrape insurance articles from Wikipedia via the public MediaWiki API."""

import json
import logging
from typing import Optional

from collect.scraper_base import BaseScraper
from collect.config import WIKIPEDIA_SEED_ARTICLES

logger = logging.getLogger(__name__)

API = "https://en.wikipedia.org/w/api.php"


class WikipediaScraper(BaseScraper):
    def __init__(self):
        super().__init__("wikipedia")

    def _get_article_text(self, title: str) -> Optional[dict]:
        """Get plain-text extract of a Wikipedia article via API."""
        url = (
            f"{API}?action=query&titles={title}"
            f"&prop=extracts&explaintext=1&exsectionformat=plain"
            f"&format=json&redirects=1"
        )
        raw = self.fetch(url)
        if not raw:
            return None
        data = json.loads(raw)
        pages = data.get("query", {}).get("pages", {})
        for pid, page in pages.items():
            if pid == "-1":
                return None
            text = page.get("extract", "")
            if len(text) < 200:
                return None
            return {
                "title": page.get("title", title),
                "text": text,
                "source": "wikipedia",
                "url": f"https://en.wikipedia.org/wiki/{title}",
                "category": "insurance_knowledge",
            }
        return None

    def _get_linked_articles(self, title: str, limit: int = 20) -> list[str]:
        """Get insurance-related links from an article."""
        url = (
            f"{API}?action=query&titles={title}"
            f"&prop=links&pllimit={limit}&plnamespace=0"
            f"&format=json&redirects=1"
        )
        raw = self.fetch(url)
        if not raw:
            return []
        data = json.loads(raw)
        pages = data.get("query", {}).get("pages", {})
        links = []
        insurance_keywords = {
            "insurance", "insur", "underw", "claim", "polic",
            "premium", "actuar", "reinsur", "liabil", "indemnit",
            "risk", "loss", "peril", "cover", "broker",
            "lloyd", "solvency", "fca", "pra", "regul",
        }
        for page in pages.values():
            for link in page.get("links", []):
                link_title = link.get("title", "")
                lower = link_title.lower()
                if any(kw in lower for kw in insurance_keywords):
                    links.append(link_title.replace(" ", "_"))
        return links

    def collect(self, max_articles: int = 500) -> list[dict]:
        """Collect Wikipedia insurance articles with link expansion."""
        documents = []
        visited: set[str] = set()
        queue = list(WIKIPEDIA_SEED_ARTICLES)

        while queue and len(documents) < max_articles:
            title = queue.pop(0)
            if title in visited:
                continue
            visited.add(title)

            doc = self._get_article_text(title)
            if doc:
                documents.append(doc)
                logger.info(
                    f" [{len(documents)}/{max_articles}] {doc['title']} "
                    f"({len(doc['text']):,} chars)"
                )
                # Expand: get linked insurance articles
                if len(documents) < max_articles:
                    new_links = self._get_linked_articles(title)
                    for link in new_links:
                        if link not in visited:
                            queue.append(link)

        self.save_documents(documents)
        self.print_stats()
        return documents


def collect_wikipedia(max_articles: int = 500) -> list[dict]:
    scraper = WikipediaScraper()
    return scraper.collect(max_articles)
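Re-runs are cheap because every API response is cached under `collect/raw/wikipedia/`; a minimal usage sketch:

```python
from collect.sources.wikipedia import collect_wikipedia

docs = collect_wikipedia(max_articles=25)  # BFS expansion from the seed list
print(len(docs), docs[0]["title"] if docs else "no documents")
```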
collect/config.py ADDED
@@ -0,0 +1,202 @@
"""Configuration for data collection sources."""

from pathlib import Path

# ── Paths ──────────────────────────────────────────────────────────
BASE_DIR = Path(__file__).resolve().parent.parent
RAW_DIR = BASE_DIR / "collect" / "raw"
PROCESSED_DIR = BASE_DIR / "collect" / "processed"
SFT_OUTPUT = BASE_DIR / "collect" / "sft_real_world.jsonl"
DPO_OUTPUT = BASE_DIR / "collect" / "dpo_real_world.jsonl"

RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# ── Rate limiting ──────────────────────────────────────────────────
REQUEST_DELAY = 1.5  # seconds between requests (be polite)
MAX_RETRIES = 3
TIMEOUT = 30

# ── User agent ─────────────────────────────────────────────────────
USER_AGENT = (
    "InsureOS-DataCollector/1.0 "
    "(Research; insurance-domain-model-training; "
    "contact: piyush@bytical.com)"
)

HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.9",
}

# ── Wikipedia insurance articles ───────────────────────────────────
WIKIPEDIA_SEED_ARTICLES = [
    "Insurance", "Reinsurance", "Underwriting", "Actuarial_science",
    "Insurance_policy", "Lloyd%27s_of_London", "Property_insurance",
    "Casualty_insurance", "Life_insurance", "Health_insurance",
    "Motor_insurance", "Marine_insurance", "Liability_insurance",
    "Professional_indemnity_insurance", "Directors_and_officers_liability_insurance",
    "Cyber_insurance", "Product_liability", "Public_liability",
    "Employers%27_liability_insurance", "Business_interruption_insurance",
    "Catastrophe_bond", "Insurance-linked_securities",
    "Solvency_II", "IFRS_17", "Risk_management",
    "Claims_adjusting", "Loss_adjustment", "Salvage_(insurance)",
    "Subrogation", "Indemnity", "Utmost_good_faith",
    "Proximate_cause_(insurance)", "Insurance_fraud",
    "Parametric_insurance", "Microinsurance", "Takaful",
    "Financial_Conduct_Authority", "Prudential_Regulation_Authority_(United_Kingdom)",
    "General_insurance", "Insurance_broker", "Managing_general_agent",
    "Coverholder", "Bordereaux", "Treaty_reinsurance",
    "Facultative_reinsurance", "Excess_of_loss", "Quota_share",
    "Stop-loss_insurance", "Aggregate_stop-loss_insurance",
    "Deductible", "Co-insurance", "Self-insurance",
    "Captive_insurance", "Risk_retention_group",
    "Insurance_in_the_United_Kingdom", "Association_of_British_Insurers",
    "Chartered_Insurance_Institute", "Insurance_premium_tax",
    "Motor_Insurers%27_Bureau", "Pool_Reinsurance_Company",
    "Flood_Re", "Terrorism_reinsurance",
    "Insurance_contract", "Warranty_(insurance)",
    "Condition_(insurance)", "Exclusion_(insurance)",
    "Endorsement_(insurance)", "Schedule_(insurance)",
    "Inception_(insurance)", "Renewal_(insurance)",
    "Cancellation_(insurance)", "Claims-made_policy",
    "Occurrence_policy", "Claims_reserve",
    "Incurred_but_not_reported", "Loss_ratio",
    "Combined_ratio", "Expense_ratio",
    "Generalized_linear_model", "Tweedie_distribution",
    "Poisson_regression", "Gamma_distribution",
    "Chain_ladder_method", "Bornhuetter–Ferguson_method",
    "Credibility_theory", "Experience_rating",
    "Risk_classification", "Adverse_selection",
    "Moral_hazard", "Insurance_scoring",
    "Telematics", "Usage-based_insurance",
    "Insurtech", "Peer-to-peer_insurance",
    "Embedded_insurance", "Open_insurance",
    "ACORD", "ISO_ClaimSearch",
    "National_Flood_Insurance_Program",
    "Earthquake_insurance", "Windstorm_insurance",
    "Hail_insurance", "Crop_insurance",
    "Title_insurance", "Surety_bond",
    "Fidelity_bond", "Warranty",
    "Extended_warranty", "Home_warranty",
    "Pet_insurance", "Travel_insurance",
    "Wedding_insurance", "Event_insurance",
    "Key_person_insurance", "Trade_credit_insurance",
    "Political_risk_insurance", "Environmental_liability",
    "Pollution_insurance",
]

# ── FCA Handbook sections ──────────────────────────────────────────
FCA_HANDBOOK_SECTIONS = [
    "ICOBS",  # Insurance: Conduct of Business Sourcebook
    "SYSC",   # Senior Management Arrangements
    "PRIN",   # Principles for Businesses
    "COBS",   # Conduct of Business Sourcebook
    "DISP",   # Dispute Resolution: Complaints
    "SUP",    # Supervision
    "CONC",   # Consumer Credit
    "MCOB",   # Mortgages and Home Finance
]

FCA_BASE_URL = "https://www.handbook.fca.org.uk"

# ── UK Legislation ─────────────────────────────────────────────────
UK_LEGISLATION_URLS = [
    # Insurance Act 2015
    "https://www.legislation.gov.uk/ukpga/2015/4/contents",
    # Enterprise Act 2016 (insurance damages for late payment)
    "https://www.legislation.gov.uk/ukpga/2016/12/contents",
    # Financial Services and Markets Act 2000
    "https://www.legislation.gov.uk/ukpga/2000/8/contents",
    # Third Parties (Rights against Insurers) Act 2010
    "https://www.legislation.gov.uk/ukpga/2010/10/contents",
    # Road Traffic Act 1988 (compulsory motor insurance)
    "https://www.legislation.gov.uk/ukpga/1988/52/contents",
    # Employers' Liability (Compulsory Insurance) Act 1969
    "https://www.legislation.gov.uk/ukpga/1969/57/contents",
    # Marine Insurance Act 1906
    "https://www.legislation.gov.uk/ukpga/Edw7/6/41/contents",
    # Consumer Insurance (Disclosure and Representations) Act 2012
    "https://www.legislation.gov.uk/ukpga/2012/6/contents",
    # Data Protection Act 2018
    "https://www.legislation.gov.uk/ukpga/2018/12/contents",
]

# ── Investopedia insurance glossary terms ──────────────────────────
INVESTOPEDIA_TERMS = [
    "insurance", "reinsurance", "underwriting", "premium",
    "deductible", "copayment", "coinsurance", "policy-limit",
    "exclusion", "endorsement", "rider", "binder",
    "actuary", "actuarial-science", "loss-ratio",
    "combined-ratio", "expense-ratio", "claims-reserve",
    "ibnr", "incurred-but-not-reported",
    "lloyd-s-of-london", "surplus-lines",
    "managing-general-agent", "captive-insurance-company",
    "risk-retention-group", "self-insurance",
    "occurrence-policy", "claims-made-policy",
    "general-liability-insurance", "professional-liability-insurance",
    "errors-and-omissions-insurance", "directors-and-officers-liability-insurance",
    "cyber-insurance", "key-person-insurance",
    "business-interruption-insurance", "commercial-property-insurance",
    "workers-compensation", "employers-liability-insurance",
    "public-liability-insurance", "product-liability-insurance",
    "environmental-liability-insurance", "marine-insurance",
    "hull-insurance", "cargo-insurance",
    "protection-and-indemnity-insurance", "aviation-insurance",
    "crop-insurance", "title-insurance",
    "surety-bond", "fidelity-bond",
    "catastrophe-bond", "insurance-linked-securities",
    "parametric-insurance", "microinsurance",
    "property-insurance", "casualty-insurance",
    "fire-insurance", "flood-insurance",
    "earthquake-insurance", "windstorm-insurance",
    "homeowners-insurance", "renters-insurance",
    "auto-insurance", "uninsured-motorist-coverage",
    "comprehensive-auto-insurance", "collision-insurance",
    "gap-insurance", "umbrella-insurance",
    "life-insurance", "term-life-insurance",
    "whole-life-insurance", "universal-life-insurance",
    "variable-life-insurance", "endowment-policy",
    "annuity", "health-insurance",
    "disability-insurance", "long-term-care-insurance",
    "pet-insurance", "travel-insurance",
    "wedding-insurance", "event-insurance",
    "trade-credit-insurance", "political-risk-insurance",
    "warranty", "extended-warranty",
    "solvency", "moral-hazard",
    "adverse-selection", "risk-management",
    "risk-assessment", "risk-transfer",
    "risk-pooling", "law-of-large-numbers",
    "subrogation", "indemnity", "utmost-good-faith",
    "proximate-cause", "insurable-interest",
    "insurance-fraud", "total-loss",
    "actual-cash-value", "replacement-cost",
    "agreed-value", "reinstatement-value",
]

# ── HuggingFace datasets ──────────────────────────────────────────
HF_DATASETS = [
    ("rvpierre/insurance-qa-en", None),
    ("ebrigham/NL_insurance_reviews_sentiment", None),
    ("snorkelai/Multi-Turn-Insurance-Underwriting-Code-Gen", None),
    ("Ddream-ai/InsuranceCorpus", None),
]

# ── Insurance subreddits ──────────────────────────────────────────
REDDIT_SUBREDDITS = [
    "insurance",
    "InsuranceProfessional",
    "HealthInsurance",
    "ActuaryUK",
    "actuary",
]

# ── RSS feeds for insurance news ───────────────────────────────────
RSS_FEEDS = [
    "https://www.insurancetimes.co.uk/rss",
    "https://www.insurancejournal.com/rss/news/",
    "https://www.reinsurancene.ws/feed/",
    "https://www.artemis.bm/feed/",
]
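Because scrapers read `REQUEST_DELAY` from this module at call time, orchestration code can retune politeness per phase, as `run_fast.py` does:

```python
import collect.config as cfg

cfg.REQUEST_DELAY = 0.5  # generous APIs (Wikipedia, HF Hub)
# ... run API-backed collectors ...
cfg.REQUEST_DELAY = 1.5  # plain web scraping -- be polite
```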
collect/convert_sft.py ADDED
@@ -0,0 +1,494 @@
"""Convert collected real-world insurance data into SFT and DPO training format.

Strategies:
1. Knowledge Q&A -- generate question-answer pairs from article text
2. Summarisation -- "Summarise this insurance concept"
3. Regulation interpretation -- "What does the FCA say about X?"
4. Legislation interpretation -- UK insurance law sections
5. Underwriting tasks -- from the snorkelai dataset
6. News analysis
"""

import json
import logging
import random
import re
from pathlib import Path

from collect.config import PROCESSED_DIR, SFT_OUTPUT, DPO_OUTPUT, RAW_DIR

logger = logging.getLogger(__name__)

# ── Language filter ────────────────────────────────────────────────

# Datasets to skip entirely (no answers, wrong language)
SKIP_DATASETS = {
    "rvpierre/insurance-qa-en",                 # Questions only, no answers
    "ebrigham/NL_insurance_reviews_sentiment",  # Dutch
    "Ddream-ai/InsuranceCorpus",                # Chinese
}


def _is_english(text: str) -> bool:
    """Quick heuristic: reject CJK or predominantly non-ASCII text."""
    if not text:
        return False
    sample = text[:500]
    # Count CJK characters (CJK Unified Ideographs plus Japanese kana)
    cjk = sum(1 for c in sample if "\u4e00" <= c <= "\u9fff" or "\u3040" <= c <= "\u30ff")
    if cjk > 5:
        return False
    # Require the sample to be predominantly ASCII
    ascii_count = sum(1 for c in sample if c.isascii())
    if ascii_count / len(sample) < 0.7:
        return False
    return True
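
# Quick sanity check of the heuristic above (illustrative):
#   _is_english("Reinsurance spreads risk between insurers.")  -> True
#   a sample with >5 CJK characters, or under 70% ASCII        -> False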
45
+
46
+
47
+ # ── Templates ──────────────────────────────────────────────────────
48
+
49
+ QA_TEMPLATES = [
50
+ "What is {concept}?",
51
+ "Explain {concept} in the context of UK insurance.",
52
+ "How does {concept} work in insurance?",
53
+ "Define {concept} for an insurance professional.",
54
+ "What role does {concept} play in the insurance industry?",
55
+ "Describe {concept} and its importance in insurance.",
56
+ "As an insurance underwriter, explain {concept}.",
57
+ "What should a claims handler know about {concept}?",
58
+ "How is {concept} relevant to insurance regulation in the UK?",
59
+ "Explain {concept} as it applies to general insurance.",
60
+ ]
61
+
62
+ REGULATION_TEMPLATES = [
63
+ "What does the FCA require regarding {topic}?",
64
+ "Explain the regulatory requirements for {topic} in UK insurance.",
65
+ "How does {topic} affect insurance companies under UK regulation?",
66
+ "What compliance obligations exist for {topic}?",
67
+ "Summarise the key regulatory points about {topic}.",
68
+ ]
69
+
70
+
71
+ def _extract_first_paragraph(text: str, max_len: int = 800) -> str:
72
+ """Extract a clean first paragraph as a concise answer."""
73
+ paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
74
+ if not paragraphs:
75
+ return text[:max_len]
76
+ for p in paragraphs:
77
+ if len(p) > 50:
78
+ return p[:max_len]
79
+ return paragraphs[0][:max_len]
80
+
81
+
82
+ def _extract_key_concepts(text: str) -> list[str]:
83
+ """Extract key insurance concepts/terms from text."""
84
+ concepts = set()
85
+ for line in text.split("\n"):
86
+ line = line.strip()
87
+ if 3 < len(line) < 80 and not line.endswith("."):
88
+ if line[0].isupper() and not line.startswith("The "):
89
+ concepts.add(line.strip("=").strip("#").strip())
90
+
91
+ insurance_terms = re.findall(
92
+ r'\b([A-Z][a-z]+(?: [A-Z][a-z]+){0,3})\b', text[:5000]
93
+ )
94
+ for term in insurance_terms:
95
+ if len(term) > 3 and any(kw in term.lower() for kw in [
96
+ "insurance", "reinsur", "claim", "underw", "polic",
97
+ "premium", "loss", "risk", "cover", "liabil",
98
+ "indemnit", "act", "regulation", "fca", "lloyd",
99
+ ]):
100
+ concepts.add(term)
101
+
102
+ return list(concepts)[:10]
103
+
104
+
105
+ def _make_sft_from_knowledge(doc: dict) -> list[dict]:
106
+ """Create SFT pairs from a knowledge article."""
107
+ pairs = []
108
+ title = doc.get("title", "")
109
+ text = doc.get("text", "")
110
+
111
+ if not text or len(text) < 100 or not _is_english(text):
112
+ return pairs
113
+
114
+ if title and len(title) > 3:
115
+ concept = title.replace("_", " ")
116
+ question = random.choice(QA_TEMPLATES).format(concept=concept)
117
+ answer = _extract_first_paragraph(text, max_len=1200)
118
+ if len(answer) > 50:
119
+ pairs.append({
120
+ "instruction": question,
121
+ "response": answer,
122
+ "source": doc.get("source", "unknown"),
123
+ "category": "knowledge_qa",
124
+ })
125
+
126
+ if len(text) > 500:
127
+ chunk = text[:3000]
128
+ pairs.append({
129
+ "instruction": f"Summarise the following insurance content:\n\n{chunk}",
130
+ "response": _extract_first_paragraph(text, max_len=600),
131
+ "source": doc.get("source", "unknown"),
132
+ "category": "summarisation",
133
+ })
134
+
135
+ concepts = _extract_key_concepts(text)
136
+ for concept in concepts[:3]:
137
+ question = random.choice(QA_TEMPLATES).format(concept=concept)
138
+ for para in text.split("\n\n"):
139
+ if concept.lower() in para.lower() and len(para) > 50:
140
+ pairs.append({
141
+ "instruction": question,
142
+ "response": para[:1200],
143
+ "source": doc.get("source", "unknown"),
144
+ "category": "concept_qa",
145
+ })
146
+ break
147
+
148
+ return pairs
149
+
150
+
151
+ def _make_sft_from_regulation(doc: dict) -> list[dict]:
152
+ """Create SFT pairs from regulatory documents."""
153
+ pairs = []
154
+ text = doc.get("text", "")
155
+ title = doc.get("title", "")
156
+ section = doc.get("section", "")
157
+
158
+ if not text or len(text) < 100 or not _is_english(text):
159
+ return pairs
160
+
161
+ topic = title or section
162
+ if topic:
163
+ question = random.choice(REGULATION_TEMPLATES).format(topic=topic)
164
+ answer = _extract_first_paragraph(text, max_len=1500)
165
+ if len(answer) > 50:
166
+ pairs.append({
167
+ "instruction": question,
168
+ "response": answer,
169
+ "source": "regulation",
170
+ "category": "regulation_qa",
171
+ })
172
+
173
+ sections = text.split("\n\n")
174
+ for section_text in sections[:5]:
175
+ if len(section_text) > 100:
176
+ pairs.append({
177
+ "instruction": f"Explain this insurance regulation provision:\n\n{section_text[:500]}",
178
+ "response": section_text[:1500],
179
+ "source": "regulation",
180
+ "category": "regulation_explain",
181
+ })
182
+
183
+ return pairs
184
+
185
+
186
+ def _make_sft_from_legislation(doc: dict) -> list[dict]:
187
+ """Create SFT pairs from UK insurance legislation."""
188
+ pairs = []
189
+ text = doc.get("text", "")
190
+ act = doc.get("act", "")
191
+
192
+ if not text or len(text) < 100 or not _is_english(text):
193
+ return pairs
194
+
195
+ pairs.append({
196
+ "instruction": (
197
+ f"Explain the following provision from UK insurance legislation "
198
+ f"({act}):\n\n{text[:1000]}"
199
+ ),
200
+ "response": text[:2000],
201
+ "source": "uk_legislation",
202
+ "category": "legislation_qa",
203
+ })
204
+
205
+ return pairs
206
+
207
+
208
+ def _make_sft_from_news(doc: dict) -> list[dict]:
209
+ """Create SFT pairs from insurance news articles."""
210
+ pairs = []
211
+ text = doc.get("text", "")
212
+ title = doc.get("title", "")
213
+
214
+ if not text or len(text) < 200 or not _is_english(text):
215
+ return pairs
216
+
217
+ pairs.append({
218
+ "instruction": f"Summarise this insurance industry news article:\n\n{text[:2000]}",
219
+ "response": _extract_first_paragraph(text, max_len=800),
220
+ "source": "insurance_news",
221
+ "category": "news_summary",
222
+ })
223
+
224
+ if title:
225
+ pairs.append({
226
+ "instruction": (
227
+ f"As an insurance industry analyst, what are the key takeaways "
228
+ f"from this article titled '{title}'?\n\n{text[:1500]}"
229
+ ),
230
+ "response": _extract_first_paragraph(text, max_len=1000),
231
+ "source": "insurance_news",
232
+ "category": "news_analysis",
233
+ })
234
+
235
+ return pairs
236
+
237
+
238
+ def _make_sft_from_underwriting(doc: dict) -> list[dict]:
239
+ """Create SFT pairs from snorkelai underwriting dataset."""
240
+ pairs = []
241
+ row = doc.get("row", {})
242
+ task = row.get("task", "")
243
+ ref_answer = row.get("reference answer", "")
244
+ company = row.get("company name", "Unknown Company")
245
+ desc = row.get("company description", "")
246
+ revenue = row.get("annual revenue", "")
247
+ employees = row.get("number of employees", "")
248
+ payroll = row.get("total payroll", "")
249
+ vehicles = row.get("number of vehicles", "")
250
+ construction = row.get("building construction", "")
251
+ state = row.get("state", "")
252
+ lob = row.get("lob", "")
253
+
254
+ if not task or not ref_answer:
255
+ return pairs
256
+
257
+ # Build company profile for context
258
+ profile_parts = [f"Company: {company}"]
259
+ if desc:
260
+ profile_parts.append(f"Description: {desc[:300]}")
261
+ if revenue:
262
+ profile_parts.append(f"Annual Revenue: ${int(revenue):,}" if revenue.isdigit() else f"Annual Revenue: {revenue}")
263
+ if employees:
264
+ profile_parts.append(f"Employees: {employees}")
265
+ if payroll:
266
+ profile_parts.append(f"Total Payroll: ${int(payroll):,}" if payroll.isdigit() else f"Total Payroll: {payroll}")
267
+ if vehicles:
268
+ profile_parts.append(f"Vehicles: {vehicles}")
269
+ if construction:
270
+ profile_parts.append(f"Building Construction: {construction}")
271
+ if state:
272
+ profile_parts.append(f"State: {state}")
273
+ if lob:
274
+ profile_parts.append(f"Line of Business: {lob}")
275
+ profile = "\n".join(profile_parts)
276
+
277
+ # Task-specific prompts
278
+ if task == "Small Business Elibility Check":
279
+ instruction = (
280
+ f"As an insurance underwriter, determine if the following company qualifies "
281
+ f"as a small business for insurance purposes:\n\n{profile}"
282
+ )
283
+ elif task == "Business Classification":
284
+ instruction = (
285
+ f"As an insurance underwriter, classify the following business and determine "
286
+ f"its NAICS code:\n\n{profile}"
287
+ )
288
+ elif task == "Appetite Check":
289
+ instruction = (
290
+ f"As an insurance underwriter, determine whether the following company is "
291
+ f"within appetite for the specified line of business:\n\n{profile}"
292
+ )
293
+ elif task == "Product Recommendations":
294
+ instruction = (
295
+ f"As an insurance underwriter, recommend appropriate insurance products "
296
+ f"for the following company:\n\n{profile}"
297
+ )
298
+ elif task == "Policy Limits":
299
+ instruction = (
300
+ f"As an insurance underwriter, recommend appropriate policy limits "
301
+ f"for the following company:\n\n{profile}"
302
+ )
303
+ elif task == "Deductibles":
304
+ instruction = (
305
+ f"As an insurance underwriter, recommend appropriate deductible levels "
306
+ f"for the following company:\n\n{profile}"
307
+ )
308
+ else:
309
+ instruction = (
310
+ f"As an insurance underwriter, perform the following task: {task}\n\n{profile}"
311
+ )
312
+
313
+ pairs.append({
314
+ "instruction": instruction,
315
+ "response": ref_answer,
316
+ "source": "snorkelai/underwriting",
317
+ "category": "underwriting",
318
+ })
319
+
320
+ return pairs
321
+
322
+
323
+ def _make_sft_from_hf(doc: dict) -> list[dict]:
324
+ """Create SFT pairs from HuggingFace dataset rows.
325
+
326
+ Skips datasets in SKIP_DATASETS and non-English text.
327
+ Routes snorkelai data to specialised underwriting converter.
328
+ """
329
+ dataset = doc.get("dataset", "")
330
+
331
+ # Skip blacklisted datasets
332
+ if dataset in SKIP_DATASETS:
333
+ return []
334
+
335
+ # Route snorkelai to dedicated underwriting handler
336
+ if "snorkelai" in dataset.lower() or "underwriting" in dataset.lower():
337
+ return _make_sft_from_underwriting(doc)
338
+
339
+ # Language filter
340
+ text = doc.get("text", "")
341
+ if not text or len(text) < 50 or not _is_english(text):
342
+ return []
343
+
344
+ row = doc.get("row", {})
345
+ pairs = []
346
+
347
+ # Check for question/answer fields
348
+ q = row.get("question", row.get("question_en", ""))
349
+ a = row.get("answer", row.get("answer_en", row.get("response", "")))
350
+ if q and a and len(a) > 20:
351
+ pairs.append({
352
+ "instruction": q,
353
+ "response": a[:2000],
354
+ "source": dataset,
355
+ "category": "hf_qa",
356
+ })
357
+ return pairs
358
+
359
+ # Check for instruction/output fields
360
+ inst = row.get("instruction", row.get("input", ""))
361
+ out = row.get("output", row.get("response", ""))
362
+ if inst and out and len(out) > 20:
363
+ pairs.append({
364
+ "instruction": inst,
365
+ "response": out[:2000],
366
+ "source": dataset,
367
+ "category": "hf_instruction",
368
+ })
369
+ return pairs
370
+
371
+ # Generic: only if text is substantial and looks like insurance content
372
+ if len(text) > 200:
373
+ pairs.append({
374
+ "instruction": f"Explain the following insurance information:\n\n{text[:1000]}",
375
+ "response": text[:2000],
376
+ "source": dataset,
377
+ "category": "hf_knowledge",
378
+ })
379
+
380
+ return pairs
381
+
382
+
383
+ def _make_dpo_pair(sft_pair: dict) -> dict | None:
384
+ """Create a DPO preference pair from an SFT pair."""
385
+ instruction = sft_pair["instruction"]
386
+ good_response = sft_pair["response"]
387
+
388
+ if len(good_response) < 100:
389
+ return None
390
+
391
+ bad_strategies = [
392
+ lambda r: r.split(".")[0] + "." if "." in r else r[:50],
393
+ lambda r: "This is a complex insurance topic that requires careful consideration of many factors.",
394
+ lambda r: r[:max(50, len(r) // 5)],
395
+ lambda r: f"While insurance is about managing risk, {r[:100]}",
396
+ ]
397
+
398
+ bad_response = random.choice(bad_strategies)(good_response)
399
+
400
+ return {
401
+ "instruction": instruction,
402
+ "chosen": good_response,
403
+ "rejected": bad_response,
404
+ "source": sft_pair.get("source", "unknown"),
405
+ }
406
+
407
+
408
+ def convert_all_to_sft(raw_dir: Path = RAW_DIR) -> tuple[int, int]:
409
+ """Convert all collected raw documents to SFT and DPO format."""
410
+ all_sft = []
411
+ all_dpo = []
412
+
413
+ converters = {
414
+ "wikipedia": _make_sft_from_knowledge,
415
+ "fca_handbook": _make_sft_from_regulation,
416
+ "uk_legislation": _make_sft_from_legislation,
417
+ "investopedia": _make_sft_from_knowledge,
418
+ "insurance_news": _make_sft_from_news,
419
+ "insurance_news_summary": _make_sft_from_news,
420
+ "huggingface": _make_sft_from_hf,
421
+ "exam_syllabus": _make_sft_from_knowledge,
422
+ "insurance_education": _make_sft_from_knowledge,
423
+ "insurance_data": _make_sft_from_hf,
424
+ }
425
+
426
+ skipped = 0
427
+ for source_dir in raw_dir.iterdir():
428
+ if not source_dir.is_dir():
429
+ continue
430
+ for jsonl_file in source_dir.glob("*.jsonl"):
431
+ logger.info(f"Converting {jsonl_file}...")
432
+ with open(jsonl_file) as f:
433
+ for line in f:
434
+ try:
435
+ doc = json.loads(line)
436
+ except json.JSONDecodeError:
437
+ continue
438
+
439
+ source = doc.get("source", "")
440
+ converter = converters.get(source, _make_sft_from_knowledge)
441
+ sft_pairs = converter(doc)
442
+
443
+ if not sft_pairs:
444
+ skipped += 1
445
+ continue
446
+
447
+ for pair in sft_pairs:
448
+ # Quality gate: response must not be a near-echo of instruction
449
+ resp = pair["response"].strip().lower()
450
+ inst_text = pair["instruction"].strip().lower()
451
+ if resp and resp != inst_text and len(resp) > 20:
452
+ all_sft.append(pair)
453
+ if random.random() < 0.3:
454
+ dpo = _make_dpo_pair(pair)
455
+ if dpo:
456
+ all_dpo.append(dpo)
457
+
458
+ logger.info(f"Skipped {skipped} documents (non-English, no answers, blacklisted)")
459
+
460
+ random.shuffle(all_sft)
461
+ random.shuffle(all_dpo)
462
+
463
+ # Write SFT
464
+ with open(SFT_OUTPUT, "w") as f:
465
+ for pair in all_sft:
466
+ chat = {
467
+ "messages": [
468
+ {"role": "system", "content": "You are InsureLLM, an expert UK insurance AI assistant. You provide accurate, detailed, and regulation-aware answers about insurance, underwriting, claims, actuarial science, and UK/EU insurance regulation."},
469
+ {"role": "user", "content": pair["instruction"]},
470
+ {"role": "assistant", "content": pair["response"]},
471
+ ]
472
+ }
473
+ f.write(json.dumps(chat, ensure_ascii=False) + "\n")
474
+
475
+ # Write DPO
476
+ with open(DPO_OUTPUT, "w") as f:
477
+ for pair in all_dpo:
478
+ dpo_row = {
479
+ "prompt": pair["instruction"],
480
+ "chosen": pair["chosen"],
481
+ "rejected": pair["rejected"],
482
+ }
483
+ f.write(json.dumps(dpo_row, ensure_ascii=False) + "\n")
484
+
485
+ logger.info(f"SFT: {len(all_sft)} pairs β†’ {SFT_OUTPUT}")
486
+ logger.info(f"DPO: {len(all_dpo)} pairs β†’ {DPO_OUTPUT}")
487
+
488
+ return len(all_sft), len(all_dpo)
489
+
490
+
491
+ if __name__ == "__main__":
492
+ logging.basicConfig(level=logging.INFO)
493
+ sft_count, dpo_count = convert_all_to_sft()
494
+ print(f"Created {sft_count} SFT pairs and {dpo_count} DPO pairs")
data/__init__.py ADDED
@@ -0,0 +1 @@
+ # InsureOS Models — Python package markers
data/constants.py ADDED
@@ -0,0 +1,219 @@
+ """
+ InsureOS — UK Insurance Synthetic Data: Constants & Templates
+ All UK-specific: GBP, postcodes, FCA references, Lloyd's market terms
+ """
+ 
+ # ── UK Insurance Lines of Business ──
+ LINES_OF_BUSINESS = [
+     "Motor Private Car", "Motor Commercial Vehicle", "Motor Fleet",
+     "Home Buildings", "Home Contents", "Home Combined",
+     "Commercial Property", "Commercial Combined",
+     "Employers' Liability", "Public Liability", "Professional Indemnity",
+     "Directors & Officers", "Cyber Liability",
+     "Travel Single Trip", "Travel Annual Multi-Trip",
+     "Pet Insurance", "Life Term Assurance", "Income Protection",
+     "Marine Cargo", "Marine Hull",
+     "Aviation", "Engineering Inspection", "Legal Expenses",
+     "Fidelity Guarantee", "Business Interruption",
+ ]
+ 
+ # ── UK Regions & Postcodes ──
+ UK_REGIONS = {
+     "London": ["EC1A", "EC2A", "WC1A", "SW1A", "SE1", "E1", "N1", "W1"],
+     "South East": ["GU", "RH", "TN", "CT", "ME", "BN", "PO", "SO"],
+     "South West": ["BS", "BA", "EX", "PL", "TR", "TA", "DT", "GL"],
+     "East Anglia": ["CB", "IP", "NR", "CO", "CM", "PE"],
+     "Midlands": ["B", "CV", "WV", "WS", "DY", "NG", "DE", "LE"],
+     "North West": ["M", "L", "WA", "CH", "PR", "BL", "OL", "SK"],
+     "North East": ["NE", "SR", "DH", "TS", "DL", "HU"],
+     "Yorkshire": ["LS", "BD", "HX", "HD", "WF", "S", "DN", "YO"],
+     "Scotland": ["EH", "G", "AB", "DD", "KY", "FK", "PA", "IV"],
+     "Wales": ["CF", "SA", "NP", "LL", "SY", "LD"],
+     "Northern Ireland": ["BT"],
+ }
+ 
+ # ── UK Insurers (for realistic data) ──
+ UK_INSURERS = [
+     "Aviva", "AXA UK", "RSA Insurance", "Zurich UK",
+     "Allianz UK", "QBE European", "Hiscox",
+     "Beazley", "Brit Insurance", "MS Amlin",
+     "Ecclesiastical", "LV= General Insurance",
+     "NFU Mutual", "Direct Line Group", "Admiral",
+     "Ageas UK", "Covéa Insurance", "Tokio Marine Kiln",
+     "Canopius", "Chaucer", "Argenta Syndicate",
+ ]
+ 
+ # ── Lloyd's Syndicates ──
+ LLOYDS_SYNDICATES = [
+     "Syndicate 2623 (Beazley)", "Syndicate 2987 (Brit)",
+     "Syndicate 2001 (MS Amlin)", "Syndicate 1084 (Chaucer)",
+     "Syndicate 4444 (Canopius)", "Syndicate 1861 (Argo)",
+     "Syndicate 1200 (Argo)", "Syndicate 5623 (Aon)",
+     "Syndicate 1729 (Dale)", "Syndicate 1969 (Apollo)",
+     "Syndicate 2525 (Asta)", "Syndicate 2121 (Argenta)",
+ ]
+ 
+ # ── MGA Names ──
+ MGA_NAMES = [
+     "Bravo Networks MGA", "Accelerant Holdings",
+     "Volante Global", "Three Sixty Underwriting",
+     "Pen Underwriting", "Plexus MGA",
+     "Manchester Underwriting", "Alchemy Underwriting",
+     "Rokstone Underwriting", "Arista Insurance",
+     "Ennismore MGA", "Flow Underwriting",
+ ]
+ 
+ # ── Claim Types by Line ──
+ CLAIM_TYPES = {
+     "Motor Private Car": [
+         "Accidental damage — collision with another vehicle",
+         "Accidental damage — single vehicle (e.g., hit kerb, post)",
+         "Theft of vehicle", "Theft from vehicle",
+         "Windscreen damage", "Fire damage",
+         "Third party bodily injury", "Third party property damage",
+         "Personal injury — whiplash", "Flood damage to vehicle",
+     ],
+     "Home Buildings": [
+         "Escape of water — burst pipe", "Escape of water — leaking roof",
+         "Storm damage — roof tiles", "Storm damage — fallen tree",
+         "Flood damage", "Subsidence", "Heave",
+         "Fire damage", "Malicious damage",
+         "Impact damage — vehicle into property",
+     ],
+     "Home Contents": [
+         "Theft — burglary", "Accidental damage — spillage on carpet",
+         "Accidental damage — broken TV/laptop",
+         "Fire damage to contents", "Flood damage to contents",
+         "Loss of jewellery", "Freezer contents (power failure)",
+     ],
+     "Employers' Liability": [
+         "Slip/trip/fall at workplace", "Manual handling injury",
+         "Repetitive strain injury", "Exposure to hazardous substances",
+         "Workplace violence", "Work-related stress claim",
+         "Fall from height", "Machinery accident",
+     ],
+     "Public Liability": [
+         "Slip/trip on premises", "Product liability — defective goods",
+         "Property damage during work", "Food poisoning claim",
+         "Professional negligence", "Advertising injury",
+     ],
+     "Professional Indemnity": [
+         "Negligent advice or design", "Breach of duty of care",
+         "Failure to meet professional standards", "Data breach liability",
+         "Omission in professional service", "Loss of client documents",
+     ],
+     "Cyber Liability": [
+         "Ransomware attack", "Data breach — customer PII",
+         "Business email compromise", "DDoS attack — business interruption",
+         "Social engineering fraud", "Third party data breach claim",
+     ],
+     "Travel Single Trip": [
+         "Medical emergency abroad", "Trip cancellation",
+         "Lost baggage", "Flight delay", "Passport loss",
+         "Personal belongings theft",
+     ],
+ }
+ 
+ # ── FCA/Regulatory References ──
+ FCA_REFERENCES = {
+     "consumer_duty": "FCA Consumer Duty (PS22/9, effective July 2023)",
+     "icobs": "ICOBS (Insurance: Conduct of Business sourcebook)",
+     "fair_treatment": "TCF — Treating Customers Fairly principles",
+     "complaints": "DISP (Dispute Resolution: Complaints sourcebook)",
+     "claims_handling": "ICOBS 8 — Claims handling requirements",
+     "value_assessment": "FCA PS21/5 General Insurance Value Measures",
+     "pricing_practices": "FCA PS21/14 General Insurance Pricing Practices",
+     "solvency_ii": "Solvency II Directive (2009/138/EC) as retained UK law",
+     "gdpr": "UK GDPR (Data Protection Act 2018)",
+     "equality_act": "Equality Act 2010 — protected characteristics",
+     "fos": "Financial Ombudsman Service (FOS) referral rights",
+ }
+ 
+ # ── Policy Wording Sections ──
+ POLICY_SECTIONS = [
+     "Definitions", "Operative Clause", "Insuring Clause",
+     "General Exclusions", "General Conditions", "Claims Conditions",
+     "Endorsements", "Schedule of Insurance",
+     "Section 1 — Buildings", "Section 2 — Contents",
+     "Section 3 — Personal Possessions", "Section 4 — Liability",
+     "Cancellation Clause", "Subrogation Rights",
+     "Arbitration Clause", "Fraud Clause",
+     "Sanctions Limitation & Exclusion Clause",
+     "Several Liability Clause (Lloyd's)",
+     "Third Party Rights (Contracts) Act 1999 Exclusion",
+ ]
+ 
+ # ── Document Types ──
+ DOCUMENT_TYPES = [
+     "Policy Schedule", "Certificate of Insurance",
+     "Claim Form", "Loss Adjuster Report",
+     "Bordereaux — Premium", "Bordereaux — Claims",
+     "Endorsement", "Renewal Notice",
+     "Statement of Fact", "FNOL Report",
+     "Subrogation Notice", "Policy Wording",
+ ]
+ 
+ # ── NER Entity Types ──
+ NER_ENTITY_TYPES = [
+     "POLICY_NUMBER", "CLAIM_NUMBER", "INSURED_NAME",
+     "INSURER_NAME", "BROKER_NAME", "SYNDICATE",
+     "COVERAGE_TYPE", "CURRENCY_AMOUNT", "DATE",
+     "POSTCODE", "VEHICLE_REG", "PERIL",
+     "EXCLUSION", "EXCESS_AMOUNT", "LIMIT_AMOUNT",
+ ]
+ 
+ # ── Insurance Jargon for Training ──
+ INSURANCE_JARGON = {
+     "utmost good faith": "Both parties must disclose all material facts honestly.",
+     "subrogation": "The insurer's right to recover costs from a third party at fault.",
+     "indemnity": "Restoring the insured to the same financial position as before the loss.",
+     "proximate cause": "The dominant or effective cause of the loss.",
+     "excess": "The first amount of any claim that the policyholder must pay themselves.",
+     "deductible": "Another term for excess — the uninsured portion of a claim.",
+     "aggregate limit": "The maximum total amount an insurer will pay in a policy period.",
+     "bordereaux": "A detailed listing of premiums or claims, typically sent monthly by an MGA to their capacity provider.",
+ "coverholder": "A firm authorized by Lloyd's to enter into contracts of insurance on behalf of a syndicate.",
176
+ "binding authority": "An agreement allowing a coverholder to underwrite risks on behalf of a syndicate.",
177
+ "slip": "The document used in the London Market to place a risk, showing lead and following underwriters.",
178
+ "following market": "Underwriters who accept a share of the risk after the lead underwriter has set terms.",
179
+ "burning cost": "The ratio of actual claims incurred to premiums received, used to set reinsurance rates.",
180
+ "loss ratio": "Claims paid (or incurred) divided by premiums earned, expressed as a percentage.",
181
+ "combined ratio": "Loss ratio plus expense ratio β€” below 100% means underwriting profit.",
182
+ "IBNR": "Incurred But Not Reported β€” reserves for claims that have occurred but not yet been filed.",
183
+ "case reserve": "The estimated cost set aside for a specific known claim.",
184
+ "frequency": "The number of claims per unit of exposure.",
185
+ "severity": "The average cost per claim.",
186
+ "attritional loss": "Expected losses from many small, frequent claims.",
187
+ "catastrophe loss": "Large losses from a single event (e.g., storm, flood).",
188
+ "FCA Consumer Duty": "FCA regulation requiring firms to act to deliver good outcomes for retail customers.",
189
+ "TCF": "Treating Customers Fairly β€” FCA principle predating Consumer Duty.",
190
+ "FOS": "Financial Ombudsman Service β€” free dispute resolution for consumers.",
191
+ "FSCS": "Financial Services Compensation Scheme β€” protects consumers if an insurer fails.",
192
+ }
193
+
194
+ # ── SFT Task Categories ──
195
+ SFT_TASK_CATEGORIES = [
196
+ "claims_handling", # Process a claim, assess coverage, set reserves
197
+ "policy_analysis", # Explain policy wordings, coverage, exclusions
198
+ "fnol", # First Notification of Loss processing
199
+ "compliance_check", # FCA Consumer Duty, GDPR, fair pricing checks
200
+ "bordereaux_processing", # Parse and validate bordereaux data
201
+ "fraud_assessment", # Evaluate fraud indicators
202
+ "underwriting_triage", # Assess a submission, recommend terms
203
+ "customer_communication", # Draft policyholder letters/emails
204
+ "reserve_setting", # Set/review claim reserves with rationale
205
+ "renewal_review", # Assess a renewal, flag changes
206
+ "jargon_explanation", # Explain insurance terms in plain English
207
+ "regulatory_query", # Answer regulatory questions (FCA, PRA, Lloyd's)
208
+ ]
209
+
210
+ # ── DPO Preference Dimensions ──
211
+ DPO_PREFERENCE_DIMENSIONS = [
212
+ "fca_consumer_duty", # Chosen: consumer-fair; Rejected: unfair/opaque
213
+ "accuracy", # Chosen: factually correct; Rejected: hallucinated
214
+ "regulatory_compliance", # Chosen: compliant; Rejected: non-compliant
215
+ "plain_english", # Chosen: clear; Rejected: jargon-heavy
216
+ "data_protection", # Chosen: GDPR-safe; Rejected: leaks PII
217
+ "fair_pricing", # Chosen: non-discriminatory; Rejected: uses protected characteristics
218
+ ]
219
+ MGAS = MGA_NAMES # alias for backward compat
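+ 
+ 
+ # ── Illustrative sketch (not used by the generators): resolving a postcode area
+ # back to its region via UK_REGIONS. Longest-prefix matching matters here because
+ # the one-letter Midlands prefix "B" would otherwise shadow two-letter prefixes
+ # such as "BS" (South West) or "BN" (South East).
+ def region_for_postcode(postcode: str) -> str | None:
+     area = postcode.upper().split()[0]
+     best: tuple[str, int] | None = None
+     for region, prefixes in UK_REGIONS.items():
+         for prefix in prefixes:
+             if area.startswith(prefix) and (best is None or len(prefix) > best[1]):
+                 best = (region, len(prefix))
+     return best[0] if best else None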
data/gen_documents.py ADDED
@@ -0,0 +1,324 @@
+ """
+ InsureOS — Synthetic Document Classification Data Generator
+ Generates 10K labelled insurance document texts for fine-tuning a ModernBERT classifier.
+ """
+ 
+ import json
+ import os
+ import random
+ 
+ from faker import Faker
+ from tqdm import tqdm
+ 
+ from data.constants import (
+     DOCUMENT_TYPES, UK_INSURERS, LLOYDS_SYNDICATES, MGAS, UK_REGIONS,
+ )
+ 
+ fake = Faker("en_GB")
+ Faker.seed(45)
+ random.seed(45)
+ 
+ # ── Document templates per type ──
+ 
+ def _gen_policy_schedule() -> str:
+     insurer = random.choice(UK_INSURERS)
+     customer = fake.name()
+     addr = fake.address().replace("\n", ", ")
+     ref = f"POL-{random.randint(100000,999999)}"
+     inception = fake.date_between(start_date="-2y", end_date="today")
+     # .replace() raises ValueError if inception falls on 29 Feb; clamp to 28 Feb
+     try:
+         expiry = inception.replace(year=inception.year + 1)
+     except ValueError:
+         expiry = inception.replace(year=inception.year + 1, day=28)
+     premium = random.randint(200, 5000)
+     return (
+         f"POLICY SCHEDULE\n"
+         f"Insurer: {insurer}\n"
+         f"Policy Number: {ref}\n"
+         f"Policyholder: {customer}\n"
+         f"Address: {addr}\n"
+         f"Period of Insurance: {inception.isoformat()} to {expiry.isoformat()}\n"
+         f"Total Premium: £{premium:,}\n"
+         f"Insurance Premium Tax at 12%: £{int(premium*0.12):,}\n"
+         f"Lines of Business: {random.choice(['Motor','Home','Commercial Combined','Landlord'])}\n"
+         f"Voluntary Excess: £{random.choice([0,100,250,500])}\n"
+         f"Compulsory Excess: £{random.choice([100,250,350])}\n"
+         f"No Claims Discount: {random.randint(0,9)} years\n"
+         f"Special Conditions: {random.choice(['None','Alarm condition','Unoccupancy clause','Named driver only'])}\n"
+     )
+ 
+ 
+ def _gen_claim_form() -> str:
+     customer = fake.name()
+     ref = f"CLM-{random.randint(200000,999999)}"
+     loss_date = fake.date_between(start_date="-1y", end_date="today")
+     claim_type = random.choice(["Escape of water", "Accidental damage", "Theft", "Storm", "Collision", "Fire"])
+     amount = random.randint(500, 50000)
+     return (
+         f"FIRST NOTIFICATION OF LOSS / CLAIM FORM\n"
+         f"Claim Reference: {ref}\n"
+         f"Policyholder: {customer}\n"
+         f"Date of Loss: {loss_date.isoformat()}\n"
+         f"Type of Loss: {claim_type}\n"
+         f"Description: {fake.paragraph(nb_sentences=4)}\n"
+         f"Estimated Value: £{amount:,}\n"
+         f"Police Notified: {random.choice(['Yes - crime ref provided','No','Not applicable'])}\n"
+         f"Witnesses: {random.choice(['Yes','No'])}\n"
+         f"Supporting Evidence: {random.choice(['Photos attached','Receipts attached','Awaiting','None'])}\n"
+     )
+ 
+ 
+ def _gen_endorsement() -> str:
+     ref = f"POL-{random.randint(100000,999999)}"
+     endo_num = random.randint(1, 12)
+     effective = fake.date_between(start_date="-1y", end_date="today")
+     return (
+         f"ENDORSEMENT NO. {endo_num}\n"
+         f"Policy Reference: {ref}\n"
+         f"Effective Date: {effective.isoformat()}\n"
+         f"Amendment: {random.choice(['Addition of named driver','Change of address','Increase in sum insured','Vehicle change','Occupation update','Additional cover added'])}\n"
+         f"Previous: {fake.sentence()}\n"
+         f"New: {fake.sentence()}\n"
+         f"Additional Premium: £{random.randint(0,350)}\n"
+         f"All other terms and conditions remain unchanged.\n"
+         f"Authorised by: {fake.name()}, Underwriter\n"
+     )
+ 
+ 
+ def _gen_loss_adjuster_report() -> str:
+     ref = f"CLM-{random.randint(200000,999999)}"
+     adjuster = fake.name()
+     company = random.choice(["Crawford & Company", "McLarens", "Sedgwick", "Davies Group", "Cunningham Lindsey"])
+     return (
+         f"LOSS ADJUSTER'S REPORT\n"
+         f"Claim Reference: {ref}\n"
+         f"Loss Adjuster: {adjuster} — {company}\n"
+         f"Visit Date: {fake.date_between(start_date='-6m', end_date='today').isoformat()}\n"
+         f"Property Inspected: {fake.address().replace(chr(10), ', ')}\n"
+         f"Findings: {fake.paragraph(nb_sentences=6)}\n"
+         f"Cause of Loss: {random.choice(['Burst pipe — wear and tear','Storm damage — wind speed confirmed >55mph','Malicious damage — forced entry confirmed','Subsidence — monitoring recommended','Accidental damage — consistent with account given'])}\n"
+         f"Recommended Settlement: £{random.randint(1000,40000):,}\n"
+         f"Recommendation: {random.choice(['Pay in full','Pay subject to betterment deduction','Decline — maintenance exclusion','Further investigation required','Refer to fraud team'])}\n"
+     )
+ 
+ 
+ def _gen_bordereaux() -> str:
+     mga = random.choice(MGAS)
+     syndicate = random.choice(LLOYDS_SYNDICATES)
+     period = f"{random.choice(['Q1','Q2','Q3','Q4'])} {random.randint(2023,2025)}"
+     rows = random.randint(5, 15)
+     header = "PolicyRef | Inception | GWP | Claims Paid | Outstanding | Status"
+     data_rows = ""
+     for _ in range(rows):
+         data_rows += (
+             f"\n{random.randint(100000,999999)} | "
+             f"{fake.date_between(start_date='-2y',end_date='today').isoformat()} | "
+             f"£{random.randint(500,25000):,} | "
+             f"£{random.randint(0,15000):,} | "
+             f"£{random.randint(0,8000):,} | "
+             f"{random.choice(['Active','Lapsed','Cancelled','Renewed'])}"
+         )
+     return (
+         f"BORDEREAUX REPORT\n"
+         f"MGA: {mga}\n"
+         f"Capacity Provider: {syndicate}\n"
+         f"Reporting Period: {period}\n"
+         f"Currency: GBP\n\n"
+         f"{header}{data_rows}\n\n"
+         f"Total GWP: £{random.randint(50000,500000):,}\n"
+         f"Total Claims Paid: £{random.randint(10000,200000):,}\n"
+         f"Loss Ratio: {random.randint(35,95)}%\n"
+     )
+ 
+ 
+ def _gen_renewal_invite() -> str:
+     insurer = random.choice(UK_INSURERS)
+     customer = fake.name()
+     ref = f"POL-{random.randint(100000,999999)}"
+     current_premium = random.randint(300, 4000)
+     renewal_premium = int(current_premium * random.uniform(0.85, 1.35))
+     renewal_date = fake.date_between(start_date="today", end_date="+60d")
+     return (
+         f"INSURANCE RENEWAL INVITATION\n"
+         f"Dear {customer},\n\n"
+         f"Your {random.choice(['motor','home','landlord','commercial'])} insurance policy "
+         f"({ref}) with {insurer} is due for renewal on {renewal_date.isoformat()}.\n\n"
+         f"Current premium: £{current_premium:,}\n"
+         f"Renewal premium: £{renewal_premium:,}\n"
+         f"{'Your premium has increased' if renewal_premium > current_premium else 'Your premium has decreased'} "
+         f"by £{abs(renewal_premium - current_premium):,}.\n\n"
+         f"Key changes this year: {random.choice(['No changes to cover','Excess increased by £50','New market rate applied','Discount for claims-free year applied'])}.\n\n"
+         f"Under FCA Consumer Duty, we're required to ensure you're getting fair value. "
+         f"If you'd like to discuss your renewal, please call us.\n"
+     )
+ 
+ 
+ def _gen_subrogation_letter() -> str:
+     ref = f"CLM-{random.randint(200000,999999)}"
+     insurer = random.choice(UK_INSURERS)
+     third_party_insurer = random.choice(UK_INSURERS)
+     amount = random.randint(1000, 25000)
+     return (
+         f"WITHOUT PREJUDICE\n"
+         f"SUBROGATION RECOVERY DEMAND\n\n"
+         f"From: {insurer} — Claims Recovery Unit\n"
+         f"To: {third_party_insurer} — Third Party Claims\n"
+         f"Our Reference: {ref}\n"
+         f"Date of Loss: {fake.date_between(start_date='-1y', end_date='-30d').isoformat()}\n\n"
+         f"We write in connection with the above claim in which our policyholder's "
+         f"vehicle/property was damaged by your insured.\n\n"
+         f"We have indemnified our policyholder in the sum of £{amount:,} and hereby seek "
+         f"recovery pursuant to our rights of subrogation.\n\n"
+         f"We enclose: loss adjuster report, repair invoices, photographic evidence.\n\n"
+         f"Please respond within 21 days with your admission or otherwise.\n"
+     )
+ 
+ 
+ def _gen_complaint_letter() -> str:
+     customer = fake.name()
+     insurer = random.choice(UK_INSURERS)
+     ref = f"CLM-{random.randint(200000,999999)}"
+     return (
+         f"FORMAL COMPLAINT\n\n"
+         f"From: {customer}\n"
+         f"To: Complaints Department, {insurer}\n"
+         f"Date: {fake.date_between(start_date='-3m', end_date='today').isoformat()}\n"
+         f"Claim Reference: {ref}\n\n"
+         f"Dear Complaints Team,\n\n"
+         f"I wish to make a formal complaint about the handling of my claim.\n\n"
+         f"Issue: {random.choice(['Unreasonable delay — no update in 8 weeks','Settlement offer is too low and does not reflect actual costs','You declined my claim without proper investigation','Your staff were unhelpful and dismissive','You failed to appoint a loss adjuster as promised','My personal data was shared without consent'])}\n\n"
+         f"{fake.paragraph(nb_sentences=3)}\n\n"
+         f"I expect a response within 8 weeks in line with FCA requirements. "
+         f"If I am not satisfied, I understand I can refer this to the Financial Ombudsman Service.\n\n"
+         f"Yours faithfully,\n{customer}\n"
+     )
+ 
+ 
+ def _gen_medical_report() -> str:
+     claimant = fake.name()
+     ref = f"CLM-{random.randint(200000,999999)}"
+     doctor = f"Dr {fake.last_name()}"
+     return (
+         f"MEDICO-LEGAL REPORT\n"
+         f"Claim Reference: {ref}\n"
+         f"Claimant: {claimant}\n"
+         f"Examining Doctor: {doctor}, {random.choice(['GP','Orthopaedic Consultant','Neurologist','Psychiatrist'])}\n"
+         f"Date of Examination: {fake.date_between(start_date='-3m', end_date='today').isoformat()}\n"
+         f"Date of Accident: {fake.date_between(start_date='-1y', end_date='-3m').isoformat()}\n\n"
+         f"HISTORY: {fake.paragraph(nb_sentences=3)}\n\n"
+         f"EXAMINATION FINDINGS: {fake.paragraph(nb_sentences=3)}\n\n"
+         f"DIAGNOSIS: {random.choice(['Whiplash Associated Disorder Grade II','Lumbar disc protrusion','Fractured clavicle — healed','Adjustment disorder with anxiety','Soft tissue injury — resolving','Post-traumatic stress disorder — moderate'])}\n\n"
+         f"PROGNOSIS: Recovery expected within {random.choice(['3-6 months','6-12 months','12-18 months','Ongoing — chronic'])}.\n"
+         f"Employment Impact: {random.choice(['None','2 weeks off work','4 weeks reduced duties','Ongoing inability to work'])}\n"
+     )
+ 
+ 
+ def _gen_fca_letter() -> str:
+     insurer = random.choice(UK_INSURERS)
+     return (
+         f"FINANCIAL CONDUCT AUTHORITY\n"
+         f"25 The North Colonnade, London E14 5HS\n\n"
+         f"To: Chief Executive, {insurer}\n"
+         f"Date: {fake.date_between(start_date='-1y', end_date='today').isoformat()}\n\n"
+         f"Dear Sir/Madam,\n\n"
+         f"RE: {random.choice(['Section 166 Skilled Person Review','Dear CEO letter — General Insurance Pricing','Thematic Review — Claims Handling Practices','Consumer Duty Implementation Assessment','Complaints Handling Review'])}\n\n"
+         f"{fake.paragraph(nb_sentences=5)}\n\n"
+         f"We require your response by {fake.date_between(start_date='today', end_date='+60d').isoformat()}.\n\n"
+         f"Yours faithfully,\n"
+         f"Director of Insurance Supervision\n"
+         f"Financial Conduct Authority\n"
+     )
+ 
+ 
+ def _gen_risk_survey() -> str:
+     surveyor = fake.name()
+     region_name, region = random.choice(list(UK_REGIONS.items()))
+     return (
+         f"COMMERCIAL RISK SURVEY REPORT\n"
+         f"Surveyor: {surveyor} — {random.choice(['Zurich Risk Engineering','AXA Risk Consulting','RSA Risk Control','Aviva Risk Management'])}\n"
+         f"Property: {fake.company()} — {fake.address().replace(chr(10), ', ')}\n"
+         f"Date: {fake.date_between(start_date='-6m', end_date='today').isoformat()}\n\n"
+         f"OCCUPANCY: {random.choice(['Office','Warehouse','Retail','Manufacturing','Restaurant','Hotel'])}\n"
+         f"CONSTRUCTION: {random.choice(['Brick/tile','Steel frame/composite','Timber frame','Concrete'])}\n"
+         f"FIRE PROTECTION: {random.choice(['Sprinklers — full','Sprinklers — partial','Extinguishers only','None'])}\n"
+         f"SECURITY: {random.choice(['Intruder alarm — monitored','CCTV + alarm','Basic locks only','Security guard 24/7'])}\n"
+         f"FLOOD RISK: {random.choice(['Zone 1 — minimal','Zone 2 — low','Zone 3a — moderate','Zone 3b — high'])}\n\n"
+         f"RECOMMENDATIONS:\n"
+         f"1. {fake.sentence()}\n"
+         f"2. {fake.sentence()}\n"
+         f"3. {fake.sentence()}\n\n"
+         f"OVERALL RISK GRADE: {random.choice(['A — Excellent','B — Good','C — Average','D — Below Average','E — Poor'])}\n"
+     )
+ 
+ 
+ def _gen_slip() -> str:
+     syndicate = random.choice(LLOYDS_SYNDICATES)
+     broker = random.choice(["Aon", "Marsh", "WTW", "Howden", "Lockton", "Gallagher"])
+     return (
+         f"LLOYD'S MARKET PLACING SLIP\n"
+         f"UMR: B{random.randint(1000,9999)}{random.choice('ABCDEFGH')}{random.randint(10000,99999)}\n"
+         f"Broker: {broker}\n"
+         f"Lead Underwriter: {syndicate}\n\n"
+         f"ASSURED: {fake.company()}\n"
+         f"PERIOD: {fake.date_between(start_date='today', end_date='+30d').isoformat()} to "
+         f"{fake.date_between(start_date='+365d', end_date='+395d').isoformat()}\n"
+         f"TYPE: {random.choice(['Property All Risks','General Liability','Professional Indemnity','Cyber','Marine Cargo','D&O'])}\n"
+         f"LIMIT: {random.choice(['£1,000,000','£2,500,000','£5,000,000','£10,000,000'])} any one occurrence\n"
+         f"DEDUCTIBLE: {random.choice(['£10,000','£25,000','£50,000','£100,000'])}\n"
+         f"RATE: {random.uniform(0.1, 2.5):.3f}%\n"
+         f"PREMIUM: £{random.randint(10000, 500000):,}\n"
+         f"LEAD LINE: {random.randint(10, 40)}%\n"
+         f"FOLLOW CAPACITY: {random.choice(['Fully placed','85% placed — seeking balance','Open — marketing'])}\n"
+     )
+ 
+ 
+ # ── Generator map ──
+ 
+ DOC_GENERATORS = {
+     "Policy Schedule": _gen_policy_schedule,
+     "Claim Form / FNOL": _gen_claim_form,
+     "Endorsement": _gen_endorsement,
+     "Loss Adjuster Report": _gen_loss_adjuster_report,
+     "Bordereaux": _gen_bordereaux,
+     "Renewal Notice": _gen_renewal_invite,
+     "Subrogation Letter": _gen_subrogation_letter,
+     "Complaint": _gen_complaint_letter,
+     "Medical Report": _gen_medical_report,
+     "Regulatory Correspondence": _gen_fca_letter,
+     "Risk Survey": _gen_risk_survey,
+     "Lloyd's Slip": _gen_slip,
+ }
+ 
+ 
+ def generate_document_dataset(n: int = 10000, output_path: str = "data/output/insurance_docs_10k.jsonl"):
+     """Generate n labelled document classification examples."""
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+ 
+     labels = list(DOC_GENERATORS.keys())
+     per_label = n // len(labels)
+     remainder = n % len(labels)
+ 
+     records = []
+     for i, (label, gen_fn) in enumerate(DOC_GENERATORS.items()):
+         count = per_label + (1 if i < remainder else 0)
+         for _ in tqdm(range(count), desc=f"Docs — {label}"):
+             records.append({
+                 "text": gen_fn(),
+                 "label": label,
+                 "label_id": labels.index(label),
+             })
+ 
+     random.shuffle(records)
+ 
+     with open(output_path, "w") as f:
+         for rec in records:
+             f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+ 
+     print(f"\n✓ Generated {len(records)} document classification examples → {output_path}")
+     from collections import Counter
+     dist = Counter(r["label"] for r in records)
+     for lab, count in sorted(dist.items()):
+         print(f"  {lab}: {count}")
+ 
+     return output_path
+ 
+ 
+ if __name__ == "__main__":
+     generate_document_dataset()
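+ 
+ 
+ # ── Illustrative sketch (assumes the HuggingFace `datasets` library is
+ # installed): loading the generated JSONL and deriving the label maps a
+ # classifier head needs. The label_id values written above follow
+ # DOC_GENERATORS key order, so the mappings below agree with the file by
+ # construction.
+ def _example_label_maps(path: str = "data/output/insurance_docs_10k.jsonl"):
+     from datasets import load_dataset
+ 
+     ds = load_dataset("json", data_files=path, split="train")
+     labels = list(DOC_GENERATORS.keys())
+     label2id = {lab: i for i, lab in enumerate(labels)}
+     id2label = {i: lab for lab, i in label2id.items()}
+     assert ds[0]["label_id"] == label2id[ds[0]["label"]]
+     return label2id, id2label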
data/gen_dpo.py ADDED
@@ -0,0 +1,375 @@
+ """
+ InsureOS — Synthetic DPO Preference Data Generator
+ Generates chosen/rejected pairs for Direct Preference Optimization.
+ Chosen = FCA-compliant, accurate, plain English
+ Rejected = Non-compliant, hallucinated, jargon-heavy, or unfair
+ """
+ 
+ import json
+ import random
+ import os
+ from pathlib import Path
+ 
+ from faker import Faker
+ from tqdm import tqdm
+ 
+ from data.constants import (
+     UK_INSURERS, FCA_REFERENCES, CLAIM_TYPES, INSURANCE_JARGON,
+     DPO_PREFERENCE_DIMENSIONS,
+ )
+ 
+ fake = Faker("en_GB")
+ Faker.seed(43)
+ random.seed(43)
+ 
+ 
+ def _rand_gbp(low: int, high: int) -> str:
+     return f"£{random.randint(low, high):,}"
+ 
+ 
+ def _rand_claim_number() -> str:
+     return f"CLM-{random.randint(200000, 999999)}"
+ 
+ 
+ def gen_fca_consumer_duty_pair() -> dict:
+     """Chosen: consumer-fair response. Rejected: opaque, unhelpful response."""
+     customer = fake.name()
+     insurer = random.choice(UK_INSURERS)
+     claim_num = _rand_claim_number()
+     claim_value = random.randint(2000, 30000)
+ 
+     prompt = (
+         f"{customer} has submitted a home insurance claim ({claim_num}) for escape of water "
+         f"damage valued at £{claim_value:,}. The claim has been open for 6 weeks with no update. "
+         f"The customer calls asking for a progress update. Draft the response."
+     )
+ 
+     chosen = (
+         f"Thank you for calling, {customer}. I sincerely apologise for the delay in updating you "
+         f"on claim {claim_num} — you should have received regular updates, and I'm sorry that hasn't happened.\n\n"
+         f"Here's where we are:\n"
+         f"- Our loss adjuster visited on [date] and their report is now with our assessment team\n"
+         f"- We expect to have a decision within the next 5 working days\n"
+         f"- I'm assigning myself as your dedicated handler so you have one person to contact\n\n"
+         f"My direct number is [number] and email is [email]. I'll call you by [specific date] with an update, "
+         f"even if we're still working on it.\n\n"
+         f"I understand this is stressful, especially when you're living with the damage. Is there anything "
+         f"else I can help with right now? If the damage is causing any issues with living in the property, "
+         f"please let me know and we can look at interim measures.\n\n"
+         f"If you're unhappy with how we've handled this, you have the right to make a formal complaint, "
+         f"and if we can't resolve it, you can refer to the Financial Ombudsman Service free of charge."
+     )
+ 
+     rejected = (
+         f"As per your policy reference, claim {claim_num} is currently under assessment. "
+         f"The claims process can take up to 90 days as stipulated in Section 4, Clause 7(b) of your "
+         f"policy schedule. We will contact you in due course when a determination has been made "
+         f"regarding the indemnification of your peril-related loss under the escape of water cover "
+         f"provision. No further action is required from you at this time. Please do not contact us "
+         f"again as this may delay the process. Ref: FCA/ICOBS/8.1.1R. Claims Department."
+     )
+ 
+     return {
+         "prompt": prompt,
+         "chosen": chosen,
+         "rejected": rejected,
+         "dimension": "fca_consumer_duty",
+     }
+ 
+ 
+ def gen_accuracy_pair() -> dict:
+     """Chosen: factually accurate. Rejected: hallucinated facts."""
+     customer = fake.name()
+     line = "Motor Private Car"
+     excess = random.choice([250, 350, 500])
+     claim_amount = random.randint(3000, 15000)
+ 
+     prompt = (
+         f"{customer} asks: 'My car was hit in a car park while I was shopping. "
+         f"The other driver left a note with their details. My excess is £{excess}. "
+         f"How does the claims process work and will I get my excess back?'"
+     )
+ 
+     chosen = (
+         f"Here's how this works:\n\n"
+         f"1. **Report the claim** to your insurer with the other driver's details from the note\n"
+         f"2. **Your insurer assesses** the damage and arranges repairs\n"
+         f"3. **You pay your excess** (£{excess}) upfront when repairs are done\n"
+         f"4. **Subrogation** — because the other driver was at fault, your insurer will pursue them "
+         f"(or their insurer) to recover the claim cost\n"
+         f"5. **If recovery is successful**, you get your £{excess} excess back\n\n"
+         f"**Important points:**\n"
+         f"- This should be a non-fault claim, so it shouldn't affect your no-claims discount (NCD) "
+         f"once the other party's insurer admits liability\n"
+         f"- However, while the claim is open, your NCD may be temporarily affected at renewal\n"
+         f"- Keep the note safe — it's your key evidence\n"
+         f"- If the other driver disputes fault, your insurer handles the negotiation\n"
+         f"- Typical timeline: 6-12 weeks for a straightforward non-fault claim"
+     )
+ 
+     rejected = (
+         f"Since the other driver left a note, this is automatically a non-fault claim and you won't "
+         f"need to pay any excess at all. Your insurer is legally required to waive the excess under "
+         f"Section 152 of the Road Traffic Act 1998 when fault is admitted in writing. Your no-claims "
+         f"bonus is protected by law in this scenario and cannot be affected. The other driver's insurer "
+         f"must pay within 14 days under the Motor Insurers' Bureau agreement. You should receive "
+         f"£{claim_amount:,} plus £{excess} compensation directly to your bank account."
+     )
+ 
+     return {
+         "prompt": prompt,
+         "chosen": chosen,
+         "rejected": rejected,
+         "dimension": "accuracy",
+     }
+ 
+ 
+ def gen_regulatory_compliance_pair() -> dict:
+     """Chosen: compliant advice. Rejected: non-compliant recommendation."""
+     mga = random.choice(["Plexus MGA", "Pen Underwriting", "Three Sixty Underwriting"])
+     syndicate = random.choice(["Syndicate 2623", "Syndicate 2001", "Syndicate 4444"])
+ 
+     prompt = (
+         f"As an MGA ({mga}), we want to speed up our claims settlement process. "
+         f"Our current average is 45 days. Our capacity provider ({syndicate}) has asked us "
+         f"to settle faster. What can we do?"
+     )
+ 
+     chosen = (
+         f"Here are compliant approaches to reduce settlement time:\n\n"
+         f"**Process improvements:**\n"
+         f"1. Implement automated FNOL with structured data capture — reduces manual re-keying by 60%\n"
+         f"2. Use AI-assisted document extraction for supporting evidence (photos, invoices, reports)\n"
+         f"3. Create fast-track thresholds: auto-settle claims under £1,000 with documented approval rules\n"
+         f"4. Set up same-day payment processing for approved settlements\n\n"
+         f"**Important compliance guardrails:**\n"
+         f"- All fast-track settlements must still comply with {FCA_REFERENCES['claims_handling']}\n"
+         f"- Even automated decisions need a human review mechanism for customers to challenge\n"
+         f"- Under {FCA_REFERENCES['consumer_duty']}, faster isn't always better — ensure the settlement "
+         f"amount is fair, not just fast\n"
+         f"- Data sharing between you and {syndicate} must comply with {FCA_REFERENCES['gdpr']} — "
+         f"ensure your data processing agreement covers automated decision-making\n"
+         f"- Maintain full audit trail for {syndicate}'s regulatory reporting\n"
+         f"- Check your binding authority agreement for any settlement authority limits\n\n"
+         f"**Target:** 15-20 days average for straightforward claims is achievable with these changes."
+     )
+ 
+     rejected = (
+         f"The fastest way to settle claims is to just approve everything under £5,000 automatically "
+         f"without investigation. Most small claims are genuine anyway. You can also skip the loss adjuster "
+         f"for any claim under £10,000 — they just slow things down. For larger claims, just offer 70% of "
+         f"the claimed amount as a quick settlement — most people will accept rather than wait. "
+         f"Don't worry too much about documentation, {syndicate} mainly cares about the numbers. "
+         f"If customers complain, tell them the settlement is final and they signed the policy terms."
+     )
+ 
+     return {
+         "prompt": prompt,
+         "chosen": chosen,
+         "rejected": rejected,
+         "dimension": "regulatory_compliance",
+     }
+ 
+ 
+ def gen_plain_english_pair() -> dict:
+     """Chosen: plain English. Rejected: impenetrable jargon."""
+     customer = fake.name()
+     insurer = random.choice(UK_INSURERS)
+ 
+     prompt = (
+         f"{customer} has received a letter from {insurer} declining part of their "
+         f"home insurance claim. They don't understand why. Explain the partial decline."
+     )
+ 
+     chosen = (
+         f"Dear {customer},\n\n"
+         f"I understand the letter from {insurer} was confusing. Let me explain what happened "
+         f"in straightforward terms.\n\n"
+         f"**What was claimed:** Your claim was for water damage to your kitchen ceiling and "
+         f"the cost of fixing the leaking pipe that caused it.\n\n"
+         f"**What's being paid:** The damage to your ceiling — £2,800 for replastering and "
+         f"redecorating, minus your £250 excess = £2,550 being paid to you.\n\n"
+         f"**What's NOT being paid (and why):** The cost of repairing the pipe itself (£450). "
+         f"This is because your policy covers **damage caused by** the leak, but not the repair "
+         f"of the pipe that leaked. Think of it like this — if a tree falls on your roof, the "
+         f"insurance covers fixing the roof, but not removing the tree from the garden.\n\n"
+         f"This is a standard term in almost all home insurance policies. It's not unique to {insurer}.\n\n"
+         f"**What you can do:**\n"
+         f"- If you think this is wrong, call us and we'll review\n"
+         f"- If you're still unhappy, you can make a formal complaint\n"
+         f"- If we can't resolve it, the Financial Ombudsman Service is free to use\n\n"
+         f"Is there anything else I can clarify?"
+     )
+ 
+     rejected = (
+         f"Re: Policy {random.randint(100000, 999999)} — Partial Declinature Notification\n\n"
+         f"Dear Policyholder,\n\n"
+         f"Further to our loss adjuster's report dated [date] and pursuant to the terms, conditions, "
+         f"and exclusions set forth in your policy schedule (Section 2, Subsection 4(a)(iii)), we have "
+         f"determined that the proximate cause of the peril giving rise to the consequential damage falls "
+         f"within the indemnifiable perils as enumerated in the Operative Clause; however, the antecedent "
+         f"mechanical failure of the plumbing infrastructure constitutes a maintenance obligation under "
+         f"General Condition 7(b) — Maintenance and Reasonable Precautions, and is therefore excluded "
+         f"from indemnification under the aforegoing policy provisions. The quantum of the remaining "
+         f"indemnifiable loss, net of the contractual voluntary excess, has been calculated on an "
+         f"indemnity basis per the Basis of Settlement clause. No betterment has been applied. "
+         f"Please refer to your policy booklet, pages 47-62, for the complete exclusionary provisions. "
+         f"Yours faithfully, Claims Department."
+     )
+ 
+     return {
+         "prompt": prompt,
+         "chosen": chosen,
+         "rejected": rejected,
+         "dimension": "plain_english",
+     }
+ 
+ 
+ def gen_data_protection_pair() -> dict:
+     """Chosen: GDPR-safe. Rejected: casually leaks PII."""
+     customer_a = fake.name()
+     customer_b = fake.name()
+     insurer = random.choice(UK_INSURERS)
+ 
+     prompt = (
+         f"A third party ({customer_b}) calls {insurer} asking about the status of a "
+         f"motor insurance claim they've made against {customer_a}'s policy. "
+         f"{customer_b} says they were the other driver in the accident and wants to know "
+         f"if the claim has been accepted and when they'll be paid. How should you respond?"
+     )
+ 
+     chosen = (
+         f"I understand you're waiting for an update on your claim, and I appreciate your patience.\n\n"
+         f"However, I'm unable to share any details about the claim status or the other "
+         f"party's policy with you, as this is protected by data protection law "
+         f"({FCA_REFERENCES['gdpr']}).\n\n"
+         f"**What you should do:**\n"
+         f"1. Contact **your own insurer** — they can liaise with us on your behalf through "
+         f"the standard inter-insurer claims process\n"
+         f"2. If you don't have insurance (uninsured loss), you can pursue the claim through "
+         f"the other party's insurer, but we can only correspond with you about YOUR claim "
+         f"against our policyholder — we cannot discuss our policyholder's policy details\n"
+         f"3. We can confirm we've received notification of the incident — but nothing more\n\n"
+         f"If you provide your contact details and your own insurer's reference, we can ensure "
+         f"any correspondence is directed appropriately.\n\n"
+         f"I know this might feel frustrating, but these protections exist for everyone, "
+         f"including you if someone were to call about your policy."
+     )
+ 
+     rejected = (
+         f"Yes, {customer_a}'s claim has been accepted. Their policy number is [number] and "
+         f"they have fully comprehensive cover with us. The claim was valued at £4,500 and we've "
+         f"already approved it. {customer_a} lives at {fake.address()} and has been with us for "
+         f"3 years. We'll be settling the third party element with you directly. Can I take your "
+         f"bank details and I'll process the payment? The settlement should be in your account "
+         f"within 5 working days."
+     )
+ 
+     return {
+         "prompt": prompt,
+         "chosen": chosen,
+         "rejected": rejected,
+         "dimension": "data_protection",
+     }
+ 
+ 
+ def gen_fair_pricing_pair() -> dict:
+     """Chosen: non-discriminatory pricing. Rejected: uses protected characteristics."""
+     insurer = random.choice(UK_INSURERS)
+ 
+     prompt = (
+         f"As a pricing actuary at {insurer}, you're asked to review a new rating factor "
+         f"proposal for motor insurance. The data science team wants to add 'first name' as "
+         f"a predictive feature because it improves the model's loss ratio prediction by 3%. "
+         f"Should we include it?"
+     )
+ 
+     chosen = (
+         f"**Recommendation: DO NOT include 'first name' as a rating factor.**\n\n"
+         f"**Why it's statistically predictive:** First names correlate with age, gender, ethnicity, "
+         f"and socioeconomic background. The 3% improvement is almost certainly because the model is "
+         f"using first name as a proxy for one or more protected characteristics.\n\n"
+         f"**Why it's not allowed:**\n\n"
+         f"1. **Gender proxy** — under EU/UK law (Test-Achats ruling + {FCA_REFERENCES['equality_act']}), "
+         f"gender cannot be used as a rating factor. First names like 'Sarah' vs 'James' directly "
+         f"proxy gender.\n\n"
+         f"2. **Ethnicity proxy** — names like 'Mohammed' or 'Priya' correlate with ethnicity. "
+         f"Using these would constitute indirect race discrimination under the {FCA_REFERENCES['equality_act']}.\n\n"
+         f"3. **Age proxy** — naming trends change over decades (e.g., 'Ethel' suggests older, "
+         f"'Jayden' suggests younger). This could amount to indirect age discrimination.\n\n"
+         f"4. **FCA enforcement** — the FCA explicitly monitors for proxy discrimination in pricing "
+         f"under {FCA_REFERENCES['pricing_practices']}. Using first name would trigger supervisory action.\n\n"
+         f"**What to do instead:**\n"
+         f"- Review what legitimate risk factors the 3% improvement might be capturing\n"
+         f"- Can you add those factors directly (e.g., vehicle type, postcode, occupation)?\n"
+         f"- Document this decision for your fair pricing audit trail"
+     )
+ 
+     rejected = (
+         f"The 3% improvement is significant and would give us a competitive edge. First name is "
+         f"publicly available information, so there's no GDPR issue. We should include it. "
+         f"Technically, we're not using gender directly — we're using a name, which is different. "
+         f"If the regulator asks, we can explain it's a legitimate predictor based on claims data. "
+         f"Lots of insurers use similar features, they just don't talk about it. The model is only "
+         f"reflecting real-world risk patterns, and it's up to the market to price accurately."
+     )
+ 
+     return {
+         "prompt": prompt,
+         "chosen": chosen,
+         "rejected": rejected,
+         "dimension": "fair_pricing",
+     }
+ 
+ 
+ # ── Master Generator ──
+ 
+ DPO_GENERATORS = [
+     gen_fca_consumer_duty_pair,
+     gen_accuracy_pair,
+     gen_regulatory_compliance_pair,
+     gen_plain_english_pair,
+     gen_data_protection_pair,
+     gen_fair_pricing_pair,
+ ]
+ 
+ 
+ def generate_dpo_dataset(n: int = 5000, output_path: str = "data/output/insurance_dpo_5k.jsonl"):
+     """Generate n DPO preference pairs."""
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+ 
+     per_gen = n // len(DPO_GENERATORS)
+     remainder = n % len(DPO_GENERATORS)
+ 
+     records = []
+     for i, gen_fn in enumerate(DPO_GENERATORS):
+         count = per_gen + (1 if i < remainder else 0)
+         for _ in tqdm(range(count), desc=f"DPO — {gen_fn.__name__}"):
+             pair = gen_fn()
+             records.append({
+                 "prompt": [
+                     {"role": "system", "content": "You are InsureLLM, a specialist UK insurance AI assistant."},
+                     {"role": "user", "content": pair["prompt"]},
+                 ],
+                 "chosen": [{"role": "assistant", "content": pair["chosen"]}],
+                 "rejected": [{"role": "assistant", "content": pair["rejected"]}],
+                 "dimension": pair["dimension"],
+             })
+ 
+     random.shuffle(records)
+ 
+     with open(output_path, "w") as f:
+         for record in records:
+             f.write(json.dumps(record, ensure_ascii=False) + "\n")
+ 
+     print(f"\n✓ Generated {len(records)} DPO preference pairs → {output_path}")
+     from collections import Counter
+     dist = Counter(r["dimension"] for r in records)
+     for dim, count in sorted(dist.items()):
+         print(f"  {dim}: {count}")
+ 
+     return output_path
+ 
+ 
+ if __name__ == "__main__":
+     generate_dpo_dataset()
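+ 
+ 
+ # ── Illustrative sketch (assumes the `datasets` library is installed): the
+ # records written above already use a conversational preference layout, i.e.
+ # prompt is a list of chat messages and chosen/rejected are single assistant
+ # turns, which is the shape preference-tuning frameworks such as TRL's
+ # DPOTrainer expect.
+ def _example_load_dpo(path: str = "data/output/insurance_dpo_5k.jsonl"):
+     from datasets import load_dataset
+ 
+     ds = load_dataset("json", data_files=path, split="train")
+     assert {"prompt", "chosen", "rejected"} <= set(ds.column_names)
+     return ds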
data/gen_ner.py ADDED
@@ -0,0 +1,258 @@
+ """
+ InsureOS — Synthetic NER (Named Entity Recognition) Data Generator
+ Generates 8K token-labelled insurance text examples in IOB2 format for ModernBERT NER.
+ """
+ 
+ import json
+ import os
+ import random
+ from datetime import timedelta
+ 
+ from faker import Faker
+ from tqdm import tqdm
+ 
+ from data.constants import (
+     UK_INSURERS, LLOYDS_SYNDICATES, MGAS, UK_REGIONS,
+     NER_ENTITY_TYPES, FCA_REFERENCES,
+ )
+ 
+ fake = Faker("en_GB")
+ Faker.seed(46)
+ random.seed(46)
+ 
+ # Entity types with IOB2 labels:
+ # PERSON, ORG, INSURER, MGA, SYNDICATE, POLICY_NUMBER, CLAIM_NUMBER,
+ # MONEY, DATE, POSTCODE, LOB, REGULATION, PERIL, VEHICLE, ADDRESS
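+ #
+ # Worked IOB2 illustration (added for clarity; not generator output): for the
+ # sentence "Jane Smith reported a burst pipe claim to Aviva" the token labels
+ # would be:
+ #   Jane -> B-PERSON, Smith -> I-PERSON, reported -> O, a -> O,
+ #   burst -> B-PERIL, pipe -> I-PERIL, claim -> O, to -> O, Aviva -> B-INSURER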
+
+
+ def _postcode() -> str:
+     region_name, region = random.choice(list(UK_REGIONS.items()))
+     prefix = random.choice(region)
+     return f"{prefix}{random.randint(1,29)} {random.randint(1,9)}{random.choice('ABCDEFGHJKLMNPRSTUVWXY')}{random.choice('ABCDEFGHJKLMNPRSTUVWXY')}"
+
+
+ def _policy_ref() -> str:
+     return f"POL-{random.randint(100000, 999999)}"
+
+
+ def _claim_ref() -> str:
+     return f"CLM-{random.randint(200000, 999999)}"
+
+
+ def _amount() -> str:
+     val = random.choice([
+         random.randint(100, 999),
+         random.randint(1000, 9999),
+         random.randint(10000, 99999),
+         random.randint(100000, 999999),
+     ])
+     return f"Β£{val:,}"
+
+
+ def _date_str() -> str:
+     d = fake.date_between(start_date="-3y", end_date="+1y")
+     return d.strftime(random.choice(["%d/%m/%Y", "%d %B %Y", "%Y-%m-%d"]))
+
+
+ def _vehicle() -> str:
+     makes = ["Ford Fiesta", "VW Golf", "BMW 3 Series", "Toyota Yaris", "Kia Sportage",
+              "Vauxhall Corsa", "Mercedes A-Class", "Tesla Model 3", "Nissan Qashqai", "Audi A3"]
+     return random.choice(makes)
+
+
+ def _peril() -> str:
+     return random.choice([
+         "escape of water", "storm damage", "theft", "fire", "flood",
+         "accidental damage", "subsidence", "malicious damage", "collision",
+         "burst pipe", "lightning strike", "impact damage", "vandalism",
+     ])
+
+
+ def _regulation() -> str:
+     return random.choice(list(FCA_REFERENCES.values()) + [
+         "ICOBS 8.1.1R", "DISP 1.3", "PRIN 2A", "Consumer Duty",
+         "FCA PS21/5", "Equality Act 2010", "GDPR Article 6",
+     ])
+
+
+ def _lob() -> str:
+     return random.choice([
+         "motor insurance", "home insurance", "commercial combined",
+         "employers' liability", "public liability", "professional indemnity",
+         "property insurance", "cyber insurance", "D&O insurance",
+     ])
+
+
+ # ── Sentence templates with entity slots ──
+
+ TEMPLATES = [
+     # 0 β€” claim notification
+     lambda: _build(
+         "{PERSON} reported a {PERIL} claim ({CLAIM_NUMBER}) on {DATE}. "
+         "The loss occurred at {POSTCODE} and is covered under {LOB} policy {POLICY_NUMBER} "
+         "with {INSURER}. Estimated value: {MONEY}."
+     ),
+     # 1 β€” subrogation
+     lambda: _build(
+         "{INSURER} is pursuing subrogation recovery of {MONEY} against {ORG} "
+         "in respect of claim {CLAIM_NUMBER} dated {DATE}. "
+         "The policyholder {PERSON} resides at {POSTCODE}."
+     ),
+     # 2 β€” Lloyd's placement
+     lambda: _build(
+         "{SYNDICATE} has written a {MONEY} line on the {LOB} facility "
+         "brokered for {ORG} by {MGA}. Inception date {DATE}."
+     ),
+     # 3 β€” regulatory
+     lambda: _build(
+         "Under {REGULATION}, {INSURER} must provide {PERSON} with a final response "
+         "to their {PERIL} claim ({CLAIM_NUMBER}) by {DATE}. "
+         "The claim value is {MONEY}."
+     ),
+     # 4 β€” renewal
+     lambda: _build(
+         "{PERSON}'s {LOB} policy {POLICY_NUMBER} with {INSURER} is due for renewal on {DATE}. "
+         "Current premium: {MONEY}. Property at {POSTCODE}."
+     ),
+     # 5 β€” vehicle claim
+     lambda: _build(
+         "{PERSON} was driving a {VEHICLE} when the {PERIL} incident occurred on {DATE} "
+         "near {POSTCODE}. Claim {CLAIM_NUMBER} has been opened with {INSURER} for {MONEY}."
+     ),
+     # 6 β€” MGA bordereaux
+     lambda: _build(
+         "{MGA} submitted the {DATE} bordereaux to {SYNDICATE} showing {MONEY} GWP "
+         "across {LOB} business. Contact: {PERSON}."
+     ),
+     # 7 β€” complaint
+     lambda: _build(
+         "{PERSON} has filed a complaint against {INSURER} regarding claim {CLAIM_NUMBER}. "
+         "Per {REGULATION}, we must respond by {DATE}. Claim relates to {PERIL} at {POSTCODE}. "
+         "Amount disputed: {MONEY}."
+     ),
+     # 8 β€” loss adjuster
+     lambda: _build(
+         "Loss adjuster {PERSON} from {ORG} inspected the {PERIL} damage at {POSTCODE} on {DATE}. "
+         "They recommend a settlement of {MONEY} on claim {CLAIM_NUMBER} under {LOB} cover."
+     ),
+     # 9 β€” medical
+     lambda: _build(
+         "Dr {PERSON} examined the claimant in connection with claim {CLAIM_NUMBER} "
+         "dated {DATE}. The {PERIL} incident at {POSTCODE} resulted in injuries. "
+         "{INSURER} has reserved {MONEY} under the {LOB} policy."
+     ),
+     # 10 β€” endorsement
+     lambda: _build(
+         "Endorsement applied to {POLICY_NUMBER}: {PERSON} has changed vehicle to {VEHICLE}. "
+         "Effective {DATE}. Additional premium: {MONEY}. Insurer: {INSURER}."
+     ),
+     # 11 β€” fraud referral
+     lambda: _build(
+         "Claim {CLAIM_NUMBER} by {PERSON} for {PERIL} ({MONEY}) has been referred to the fraud team. "
+         "Policy {POLICY_NUMBER} with {INSURER} started on {DATE}. "
+         "Property postcode: {POSTCODE}. Cf. {REGULATION}."
+     ),
+ ]
+
+
+ def _build(template: str) -> tuple[list[str], list[str]]:
+     """Fill template slots and return (tokens, iob_tags)."""
+     # Generate entity values
+     entities = {
+         "PERSON": fake.name(),
+         "ORG": fake.company(),
+         "INSURER": random.choice(UK_INSURERS),
+         "MGA": random.choice(MGAS),
+         "SYNDICATE": random.choice(LLOYDS_SYNDICATES),
+         "POLICY_NUMBER": _policy_ref(),
+         "CLAIM_NUMBER": _claim_ref(),
+         "MONEY": _amount(),
+         "DATE": _date_str(),
+         "POSTCODE": _postcode(),
+         "LOB": _lob(),
+         "REGULATION": _regulation(),
+         "PERIL": _peril(),
+         "VEHICLE": _vehicle(),
+     }
+
+     # Parse template to get ordered (text_fragment, entity_type) pairs
+     tokens = []
+     tags = []
+
+     remaining = template
+     while remaining:
+         # Find next entity slot
+         best_pos = len(remaining)
+         best_key = None
+         for key in entities:
+             marker = "{" + key + "}"
+             pos = remaining.find(marker)
+             if pos != -1 and pos < best_pos:
+                 best_pos = pos
+                 best_key = key
+
+         if best_key is None:
+             # No more entities β€” tokenize remaining text
+             for tok in remaining.split():
+                 tokens.append(tok)
+                 tags.append("O")
+             break
+
+         marker = "{" + best_key + "}"
+
+         # Text before entity
+         before = remaining[:best_pos]
+         for tok in before.split():
+             if tok:
+                 tokens.append(tok)
+                 tags.append("O")
+
+         # Entity tokens
+         entity_value = entities[best_key]
+         entity_tokens = entity_value.split()
+         for j, etok in enumerate(entity_tokens):
+             tokens.append(etok)
+             tags.append(f"B-{best_key}" if j == 0 else f"I-{best_key}")
+
+         remaining = remaining[best_pos + len(marker):]
+
+     return tokens, tags
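+
+ # Illustrative trace (values are random at runtime): for the template
+ # "{PERSON} filed claim {CLAIM_NUMBER}." with PERSON="Jane Smith" and
+ # CLAIM_NUMBER="CLM-412345", _build returns
+ #   tokens = ["Jane", "Smith", "filed", "claim", "CLM-412345", "."]
+ #   tags   = ["B-PERSON", "I-PERSON", "O", "O", "B-CLAIM_NUMBER", "O"]
+ # (the leftover "." after the marker is emitted as its own O token).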
+
+
+ def generate_ner_dataset(n: int = 8000, output_path: str = "data/output/insurance_ner_8k.jsonl"):
+     """Generate n NER examples in token-level IOB2 format."""
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+     records = []
+     for _ in tqdm(range(n), desc="NER examples"):
+         gen_fn = random.choice(TEMPLATES)
+         tokens, tags = gen_fn()
+         records.append({
+             "tokens": tokens,
+             "ner_tags": tags,
+             "text": " ".join(tokens),
+         })
+
+     random.shuffle(records)
+
+     with open(output_path, "w") as f:
+         for rec in records:
+             f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+
+     # Stats
+     all_tags = set()
+     for rec in records:
+         all_tags.update(rec["ner_tags"])
+     entity_tags = sorted(t for t in all_tags if t != "O")
+
+     print(f"\nβœ“ Generated {len(records)} NER examples β†’ {output_path}")
+     print(f" Entity types found: {len(entity_tags)}")
+     for t in entity_tags:
+         count = sum(1 for rec in records for tag in rec["ner_tags"] if tag == t)
+         print(f" {t}: {count}")
+
+     return output_path
+
+
+ if __name__ == "__main__":
+     generate_ner_dataset()
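
A quick IOB2 sanity check that can be run over the generated file (a sketch using the default `output_path` above):

```python
import json

path = "data/output/insurance_ner_8k.jsonl"  # default output of generate_ner_dataset()

with open(path, encoding="utf-8") as f:
    for i, line in enumerate(f):
        rec = json.loads(line)
        tokens, tags = rec["tokens"], rec["ner_tags"]
        assert len(tokens) == len(tags), f"length mismatch in record {i}"
        prev = "O"
        for tag in tags:
            # an I-X tag must continue a B-X/I-X span of the same entity type
            if tag.startswith("I-") and prev[2:] != tag[2:]:
                raise ValueError(f"orphan {tag} in record {i}")
            prev = tag

print("IOB2 check passed")
```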
data/gen_sft.py ADDED
@@ -0,0 +1,1192 @@
+ """
+ InsureOS β€” Synthetic SFT Data Generator
+ Generates 10K instruction-response pairs for UK insurance fine-tuning.
+ 100% synthetic β€” no real PII, no real policy data.
+ """
+
+ import json
+ import random
+ import os
+ from datetime import datetime, timedelta
+ from pathlib import Path
+
+ from faker import Faker
+ from tqdm import tqdm
+
+ from data.constants import (
+     LINES_OF_BUSINESS, UK_REGIONS, UK_INSURERS, LLOYDS_SYNDICATES,
+     MGA_NAMES, CLAIM_TYPES, FCA_REFERENCES, POLICY_SECTIONS,
+     INSURANCE_JARGON, SFT_TASK_CATEGORIES,
+ )
+
+ fake = Faker("en_GB")
+ Faker.seed(42)
+ random.seed(42)
+
+
+ def _rand_gbp(low: int, high: int) -> str:
+     return f"Β£{random.randint(low, high):,}"
+
+
+ def _rand_policy_number() -> str:
+     prefix = random.choice(["POL", "UW", "BA", "PI", "CL", "MGA"])
+     return f"{prefix}-{random.randint(100000, 999999)}"
+
+
+ def _rand_claim_number() -> str:
+     return f"CLM-{random.randint(200000, 999999)}"
+
+
+ def _rand_date(start_year: int = 2022, end_year: int = 2026) -> str:
+     start = datetime(start_year, 1, 1)
+     end = datetime(end_year, 3, 31)
+     delta = (end - start).days
+     d = start + timedelta(days=random.randint(0, delta))
+     return d.strftime("%d/%m/%Y")
+
+
+ def _rand_postcode() -> str:
+     region = random.choice(list(UK_REGIONS.values()))
+     prefix = random.choice(region)
+     num = random.randint(1, 28)
+     suffix = f"{random.randint(1,9)}{random.choice('ABCDEFGHJKLMNPQRSTUVWXY')}{random.choice('ABCDEFGHJKLMNPQRSTUVWXY')}"
+     return f"{prefix}{num} {suffix}"
+
+
+ def _rand_vehicle_reg() -> str:
+     area = random.choice(["AB", "BA", "CA", "DA", "EA", "FA", "GA", "HA", "KA", "LA", "MA"])
+     age = random.choice(["21", "22", "23", "24", "25", "71", "72", "73", "74", "75"])
+     letters = "".join(random.choices("ABCDEFGHJKLMNPRSTUVWXY", k=3))
+     return f"{area}{age} {letters}"
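+
+ # Illustrative formats (actual values are random): _rand_policy_number() -> "UW-302914",
+ # _rand_claim_number() -> "CLM-574201", _rand_vehicle_reg() -> "BA73 KXT".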
+
+
+ # ────────────────────────────────────────────
+ # SFT Template Generators (one per category)
+ # ────────────────────────────────────────────
+
+ def gen_claims_handling() -> dict:
+     line = random.choice(list(CLAIM_TYPES.keys()))
+     claim_type = random.choice(CLAIM_TYPES[line])
+     insured = fake.name()
+     policy_num = _rand_policy_number()
+     claim_num = _rand_claim_number()
+     date_of_loss = _rand_date(2024, 2026)
+     claim_value = random.randint(500, 150000)
+     excess = random.choice([100, 150, 200, 250, 300, 500, 1000])
+     postcode = _rand_postcode()
+     insurer = random.choice(UK_INSURERS)
+
+     instruction = (
+         f"You are a claims handler for {insurer}. Process the following claim.\n\n"
+         f"Policy Number: {policy_num}\n"
+         f"Claim Number: {claim_num}\n"
+         f"Insured: {insured}\n"
+         f"Line of Business: {line}\n"
+         f"Date of Loss: {date_of_loss}\n"
+         f"Location: {postcode}\n"
+         f"Claim Type: {claim_type}\n"
+         f"Estimated Value: Β£{claim_value:,}\n"
+         f"Policy Excess: Β£{excess:,}\n\n"
+         f"Assess coverage, set an initial reserve, identify any red flags, "
+         f"and draft next steps. Ensure compliance with {FCA_REFERENCES['claims_handling']}."
+     )
+
+     net_claim = max(0, claim_value - excess)
+     red_flags = []
+     if claim_value > 50000:
+         red_flags.append("High-value claim β€” requires senior adjuster review")
+     if "theft" in claim_type.lower():
+         red_flags.append("Theft claim β€” verify police report reference")
+     if "fire" in claim_type.lower():
+         red_flags.append("Fire claim β€” fire brigade report required")
+     if "flood" in claim_type.lower():
+         red_flags.append("Flood claim β€” check Environment Agency flood map for postcode")
+     if "subsidence" in claim_type.lower():
+         red_flags.append("Subsidence claim β€” require arboricultural report and site investigation")
+     if not red_flags:
+         red_flags.append("No immediate red flags identified")
+
+     response = (
+         f"## Claims Assessment β€” {claim_num}\n\n"
+         f"**Coverage Check:** {line} policy {policy_num} for {insured} "
+         f"is active. The reported loss β€” {claim_type} β€” falls within the "
+         f"scope of standard {line} cover, subject to policy terms and exclusions.\n\n"
+         f"**Initial Reserve:** Β£{net_claim:,} (estimated value Β£{claim_value:,} "
+         f"less policy excess of Β£{excess:,})\n\n"
+         f"**Red Flags:**\n"
+     )
+     for flag in red_flags:
+         response += f"- {flag}\n"
+
+     response += (
+         f"\n**Next Steps:**\n"
+         f"1. Confirm policy was in force on {date_of_loss} and premium paid\n"
+         f"2. Obtain insured's full statement of events\n"
+         f"3. Appoint loss adjuster if claim exceeds Β£{min(10000, claim_value):,} threshold\n"
+         f"4. Request supporting documentation (photos, receipts, police/fire report if applicable)\n"
+         f"5. Check for any relevant exclusions in policy wording\n"
+         f"6. Update claims bordereaux for MGA reporting\n\n"
+         f"**Compliance Note:** Under {FCA_REFERENCES['claims_handling']}, we must handle this "
+         f"claim promptly and fairly. The insured must be kept informed at each stage. "
+         f"Under {FCA_REFERENCES['consumer_duty']}, we must act to deliver a good outcome "
+         f"for the customer throughout the claims journey."
+     )
+
+     return {"instruction": instruction, "response": response, "category": "claims_handling"}
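+
+ # Note: the initial reserve quoted in the response is net of excess and floored
+ # at zero, e.g. claim_value=12,000 with excess=500 gives a Β£11,500 reserve.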
+
+
+ def gen_policy_analysis() -> dict:
+     line = random.choice(LINES_OF_BUSINESS[:12])
+     section = random.choice(POLICY_SECTIONS)
+     insurer = random.choice(UK_INSURERS)
+     policy_num = _rand_policy_number()
+
+     exclusion_examples = {
+         "General Exclusions": [
+             "War, terrorism, and nuclear risks",
+             "Wear and tear, gradual deterioration",
+             "Loss arising from illegal acts by the insured",
+             "Pre-existing defects known to the insured",
+             "Sanctions β€” loss connected to sanctioned territories (Russia, Iran, North Korea, etc.)",
+         ],
+         "Claims Conditions": [
+             "Insured must notify claims within 30 days of discovery",
+             "Insured must not admit liability without insurer's written consent",
+             "Insurer has the right to take over defence and settlement of any claim",
+             "Failure to comply with claims conditions may invalidate the claim",
+         ],
+     }
+
+     relevant_exclusions = exclusion_examples.get(section, [
+         f"Standard {section} provisions apply per market wording",
+         "Refer to policy schedule for specific sub-limits and deductibles",
+     ])
+
+     instruction = (
+         f"Analyse the '{section}' section of a {line} policy issued by {insurer} "
+         f"(Policy: {policy_num}). Explain what this section covers, key exclusions, "
+         f"and any implications for the policyholder. Use plain English suitable for "
+         f"a consumer, in line with FCA Consumer Duty requirements."
+     )
+
+     response = (
+         f"## Policy Analysis β€” {section}\n\n"
+         f"**Policy:** {policy_num} ({line}, issued by {insurer})\n\n"
+         f"**What this section does:**\n"
+         f"The '{section}' section sets out "
+     )
+
+     if section == "General Exclusions":
+         response += (
+             "the circumstances and events that are NOT covered by this policy. "
+             "These apply to all sections of your cover.\n\n"
+             "**Key exclusions in plain English:**\n"
+         )
+         for exc in relevant_exclusions:
+             response += f"- {exc}\n"
+         response += (
+             f"\n**What this means for you:** If your loss falls under any of these "
+             f"exclusions, the insurer will not pay your claim. It's important to read "
+             f"these carefully and ask your broker if anything is unclear.\n\n"
+         )
+     elif section == "Claims Conditions":
+         response += (
+             "the rules you must follow when making a claim. If you don't follow "
+             "these procedures, the insurer may reduce or refuse your claim.\n\n"
+             "**Key conditions in plain English:**\n"
+         )
+         for cond in relevant_exclusions:
+             response += f"- {cond}\n"
+         response += (
+             f"\n**What this means for you:** Report any potential claim promptly. "
+             f"Don't accept blame or make payments without contacting your insurer first. "
+             f"Keep all evidence and documentation.\n\n"
+         )
+     else:
+         response += (
+             f"the terms relating to {section.lower()} under this {line} policy. "
+             f"This defines the scope, limits, and conditions that apply.\n\n"
+             f"**Key points:**\n"
+         )
+         for item in relevant_exclusions:
+             response += f"- {item}\n"
+         response += "\n"
+
+     response += (
+         f"**Consumer Duty Reminder:** Under {FCA_REFERENCES['consumer_duty']}, "
+         f"{insurer} is required to ensure this policy provides fair value and that "
+         f"all terms are communicated clearly. If you don't understand any part of "
+         f"your policy, you have the right to ask for a plain English explanation. "
+         f"If you're unhappy, you can complain to {insurer} and, if unresolved, "
+         f"refer to the Financial Ombudsman Service (FOS) free of charge."
+     )
+
+     return {"instruction": instruction, "response": response, "category": "policy_analysis"}
+
+
+ def gen_fnol() -> dict:
+     line = random.choice(["Motor Private Car", "Home Buildings", "Home Contents"])
+     claim_type = random.choice(CLAIM_TYPES[line])
+     caller = fake.name()
+     policy_num = _rand_policy_number()
+     date_of_loss = _rand_date(2025, 2026)
+     postcode = _rand_postcode()
+
+     if "Motor" in line:
+         vehicle_reg = _rand_vehicle_reg()
+         extra = f"Vehicle Registration: {vehicle_reg}\n"
+     else:
+         extra = f"Property Address Postcode: {postcode}\n"
+
+     instruction = (
+         f"You are an FNOL (First Notification of Loss) handler. A policyholder is "
+         f"reporting a new claim by phone. Capture all required information and create "
+         f"a structured FNOL record.\n\n"
+         f"Caller: {caller}\n"
+         f"Policy: {policy_num}\n"
+         f"Line: {line}\n"
+         f"{extra}"
+         f"What happened: \"{claim_type} β€” happened on {date_of_loss}. "
+         f"{'I noticed it when I got home from work.' if 'Home' in line else 'It happened on the A-road near ' + fake.city() + '.'}\"\n\n"
+         f"Process this FNOL, ask any clarifying questions, and create the record."
+     )
+
+     claim_num = _rand_claim_number()
+     response = (
+         f"## FNOL Record β€” {claim_num}\n\n"
+         f"**Date/Time of FNOL:** {datetime.now().strftime('%d/%m/%Y %H:%M')}\n"
+         f"**Reported By:** {caller} (policyholder)\n"
+         f"**Policy Number:** {policy_num}\n"
+         f"**Line of Business:** {line}\n"
+         f"**Date of Loss:** {date_of_loss}\n"
+         f"{extra}"
+         f"**Loss Description:** {claim_type}\n"
+         f"**Claim Number Assigned:** {claim_num}\n\n"
+         f"**Clarifying Questions Asked:**\n"
+         f"1. Were there any injuries? (Response required)\n"
+         f"2. Were emergency services called? (Response required)\n"
+         f"3. Are there any witnesses? (Names/contact details needed)\n"
+         f"4. Has the insured taken steps to prevent further damage? (Duty to mitigate)\n"
+         f"5. Does the insured have any photos or documentation? (Request submission)\n"
+     )
+
+     if "Motor" in line:
+         response += (
+             f"6. Was a third party involved? (Need their details, insurer, registration)\n"
+             f"7. Has the incident been reported to the police? (Theft/injury = mandatory)\n"
+         )
+     elif "Home" in line:
+         response += (
+             f"6. Is the property currently habitable? (May need emergency accommodation)\n"
+             f"7. Have temporary repairs been carried out? (Keep receipts)\n"
+         )
+
+     response += (
+         f"\n**Initial Triage:**\n"
+         f"- Severity: {'High' if 'injury' in claim_type.lower() or 'fire' in claim_type.lower() else 'Medium'}\n"
+         f"- Fast-track eligible: {'Yes' if random.random() > 0.5 else 'No β€” requires adjuster visit'}\n"
+         f"- Fraud indicators: None at this stage\n\n"
+         f"**Next Steps:**\n"
+         f"1. Send acknowledgement letter/email within 24 hours\n"
+         f"2. Assign to claims team for full assessment\n"
+         f"3. Arrange loss adjuster if required\n"
+         f"4. Set initial reserve\n\n"
+         f"**Compliance:** FNOL processed in accordance with {FCA_REFERENCES['claims_handling']} "
+         f"and {FCA_REFERENCES['consumer_duty']}. Customer informed of their rights under "
+         f"{FCA_REFERENCES['complaints']} and FOS referral rights."
+     )
+
+     return {"instruction": instruction, "response": response, "category": "fnol"}
+
+
+ def gen_compliance_check() -> dict:
+     scenario_type = random.choice([
+         "pricing_fairness", "claim_decline", "renewal_increase",
+         "data_handling", "vulnerable_customer", "complaints_handling"
+     ])
+
+     insurer = random.choice(UK_INSURERS)
+     customer = fake.name()
+     # Draw the renewal figures once so the instruction and response quote the
+     # same, arithmetically consistent numbers
+     pct_increase = random.randint(40, 120)
+     old_premium = random.randint(200, 400)
+     new_premium = round(old_premium * (1 + pct_increase / 100))
+
+     scenarios = {
+         "pricing_fairness": {
+             "instruction": (
+                 f"{insurer} is renewing a home insurance policy for {customer}, age {random.randint(65, 85)}. "
+                 f"The premium has increased from Β£{old_premium:,} to "
+                 f"Β£{new_premium:,} β€” a {pct_increase}% increase. "
+                 f"The customer's claims history is clean. Assess whether this pricing complies "
+                 f"with FCA pricing practices rules and Consumer Duty."
+             ),
+             "response_good": (
+                 f"## Compliance Assessment β€” Pricing Fairness\n\n"
+                 f"**CONCERN: Potential non-compliance with FCA pricing rules.**\n\n"
+                 f"Under {FCA_REFERENCES['pricing_practices']}, insurers must not charge renewing "
+                 f"customers more than they would charge an equivalent new customer. This is the "
+                 f"'price walking' ban. A {pct_increase}% increase for a claims-free "
+                 f"customer raises a red flag.\n\n"
+                 f"**Required checks:**\n"
+                 f"1. Compare this renewal premium to the new business equivalent premium β€” they should be comparable\n"
+                 f"2. Verify that the increase is justified by risk factors (claims inflation, reinsurance costs, location risk) not customer inertia\n"
+                 f"3. Check whether {customer}'s age is driving the increase β€” pricing based on age alone may breach the {FCA_REFERENCES['equality_act']}\n"
+                 f"4. Ensure the customer has received clear communication explaining why the premium has changed\n\n"
+                 f"**Consumer Duty ({FCA_REFERENCES['consumer_duty']}):**\n"
+                 f"- Outcome 1 (Products & Services): Is this policy still providing fair value?\n"
+                 f"- Outcome 2 (Price & Value): The price must reflect the value of the product\n"
+                 f"- Outcome 3 (Consumer Understanding): The renewal notice must clearly explain the price change\n"
+                 f"- Outcome 4 (Consumer Support): The customer should be able to easily compare options or switch\n\n"
+                 f"**Recommendation:** Flag for review by pricing team. Do not issue renewal at this price until "
+                 f"new business equivalence is confirmed."
+             ),
+         },
+         "claim_decline": {
+             "instruction": (
+                 f"{insurer} is declining a {random.choice(['home escape of water', 'motor theft', 'public liability'])} "
+                 f"claim from {customer} (Claim: {_rand_claim_number()}) on the grounds of "
+                 f"non-disclosure. The customer did not declare a previous claim from 3 years ago during renewal. "
+                 f"Assess the compliance implications of this decline."
+             ),
+             "response_good": (
+                 f"## Compliance Assessment β€” Claim Decline\n\n"
+                 f"**CONCERN: Claim decline on non-disclosure grounds requires careful handling.**\n\n"
+                 f"Under the Consumer Insurance (Disclosure and Representations) Act 2012 (CIDRA), "
+                 f"the burden is on the insurer to ask clear questions. A consumer's duty is to take "
+                 f"reasonable care not to make a misrepresentation.\n\n"
+                 f"**Key questions:**\n"
+                 f"1. Did the renewal documentation SPECIFICALLY ask about previous claims in the last 3-5 years?\n"
+                 f"2. Was the question clear and unambiguous? (Under CIDRA, a vague question shifts risk to insurer)\n"
+                 f"3. Was the non-disclosure deliberate or reckless, or merely careless?\n"
+                 f"   - Deliberate/reckless: Insurer can void the policy and refuse all claims\n"
+                 f"   - Careless: Insurer can only adjust (proportional remedy) β€” what would they have done?\n"
+                 f"4. Would {insurer} have still provided cover if the previous claim had been declared?\n\n"
+                 f"**Consumer Duty implications:**\n"
+                 f"- Simply declining and citing 'non-disclosure' without proportional remedy assessment "
+                 f"is likely to breach {FCA_REFERENCES['consumer_duty']}\n"
+                 f"- The customer must be informed of their FOS rights\n"
+                 f"- A full explanation in plain English must be provided\n\n"
+                 f"**Recommendation:** Apply proportional remedy. Calculate what premium would have been charged "
+                 f"with full disclosure, and settle claim on that proportional basis unless the non-disclosure was "
+                 f"deliberate or reckless."
+             ),
+         },
+         "renewal_increase": {
+             "instruction": (
+                 f"Review the following renewal strategy for {insurer}: Motor fleet policy for a commercial "
+                 f"customer with {random.randint(20, 100)} vehicles. Current premium: {_rand_gbp(50000, 200000)}. "
+                 f"Proposed renewal premium: {_rand_gbp(80000, 350000)}. "
+                 f"Loss ratio last 3 years: {random.randint(45, 85)}%. "
+                 f"Check if this renewal approach complies with market standards and regulations."
+             ),
+             "response_good": (
+                 f"## Renewal Assessment\n\n"
+                 f"**Analysis:**\n"
+                 f"The premium increase needs to be validated against:\n"
+                 f"1. **Burning cost analysis** β€” actual loss experience over 3+ years\n"
+                 f"2. **Market benchmarking** β€” what similar fleets are paying\n"
+                 f"3. **Claims inflation** β€” motor repair costs have risen ~15% in 2025/2026\n"
+                 f"4. **Rate adequacy** β€” is the current rate technically profitable?\n\n"
+                 f"**Compliance check:**\n"
+                 f"- Commercial policies are outside the FCA's pricing practices rules (PS21/14), "
+                 f"which only apply to retail home and motor\n"
+                 f"- However, if the insured is an SME, FCA principles of fair treatment still apply\n"
+                 f"- Solvency II requires the premium to be sufficient to cover liabilities\n"
+                 f"- Documentation of rating methodology must be maintained for audit\n\n"
+                 f"**Recommendation:** Proceed with renewal but ensure full actuarial justification "
+                 f"is documented. Provide the customer with a clear breakdown showing how claims "
+                 f"experience, market conditions, and inflation contribute to the increase."
+             ),
+         },
+         "data_handling": {
+             "instruction": (
+                 f"A claims handler at {insurer} has emailed a spreadsheet containing "
+                 f"{random.randint(50, 500)} claim records (including names, addresses, policy numbers, "
+                 f"medical reports) to an external loss adjuster using personal email instead of the "
+                 f"secure portal. Assess the data protection implications."
+             ),
+             "response_good": (
+                 f"## Data Protection Incident Assessment\n\n"
+                 f"**SEVERITY: HIGH β€” Potential reportable data breach under UK GDPR.**\n\n"
+                 f"**Assessment:**\n"
+                 f"1. This is a personal data breach under {FCA_REFERENCES['gdpr']} (Article 4(12))\n"
+                 f"2. The data includes special category data (medical reports) under Article 9\n"
+                 f"3. Personal email is not a secure or approved communication channel\n"
+                 f"4. The data was not encrypted in transit\n\n"
+                 f"**Regulatory obligations:**\n"
+                 f"- **ICO notification** β€” required within 72 hours if there's a risk to individuals' rights "
+                 f"(Article 33). Given the volume and sensitivity, this likely triggers notification.\n"
+                 f"- **Individual notification** β€” required without undue delay if high risk to individuals (Article 34)\n"
+                 f"- **FCA notification** β€” material breaches must be reported to the FCA\n\n"
+                 f"**Immediate actions:**\n"
+                 f"1. Contact the claims handler β€” do NOT delete the email (preserve evidence)\n"
+                 f"2. Contact the loss adjuster β€” request immediate secure deletion and written confirmation\n"
+                 f"3. Check if the personal email provider stores/scans data (e.g., Gmail ToS)\n"
+                 f"4. Notify the DPO within 24 hours for breach assessment\n"
+                 f"5. Log in the breach register\n"
+                 f"6. Investigate root cause β€” why was the secure portal not used?\n\n"
+                 f"**Potential consequences:**\n"
+                 f"- ICO fine of up to Β£17.5M or 4% of annual turnover\n"
+                 f"- FCA enforcement action\n"
+                 f"- Reputational damage\n"
+                 f"- Loss of customers' trust"
+             ),
+         },
+         "vulnerable_customer": {
+             "instruction": (
+                 f"A customer ({customer}) has called {insurer} to make a home insurance claim after a burst pipe. "
+                 f"The customer sounds distressed, mentions they are elderly and live alone, and says they "
+                 f"don't understand the process. The initial handler suggested the customer 'just email the photos'. "
+                 f"Review this interaction for Consumer Duty compliance."
+             ),
+             "response_good": (
+                 f"## Vulnerable Customer Assessment\n\n"
+                 f"**CONCERN: Inadequate support for a potentially vulnerable customer.**\n\n"
+                 f"**Vulnerability indicators identified:**\n"
+                 f"- Elderly and living alone (health/resilience vulnerability)\n"
+                 f"- Distressed (life event + emotional state)\n"
+                 f"- Expressed lack of understanding (capability vulnerability)\n"
+                 f"- Home damage (residential circumstances)\n\n"
+                 f"Under {FCA_REFERENCES['consumer_duty']}, Outcome 4 (Consumer Support), "
+                 f"firms must ensure customers can access appropriate support. For vulnerable "
+                 f"customers, this means additional reasonable adjustments.\n\n"
+                 f"**What went wrong:**\n"
+                 f"- 'Just email the photos' is inadequate for a customer who is distressed and may not be tech-literate\n"
+                 f"- No vulnerability assessment was conducted\n"
+                 f"- No offer of alternative channels (post, in-person visit, appointee)\n"
+                 f"- No empathy or reassurance provided\n\n"
+                 f"**Required actions:**\n"
+                 f"1. Flag {customer}'s file with vulnerability indicators (with consent)\n"
+                 f"2. Assign a dedicated claims handler (single point of contact)\n"
+                 f"3. Offer multiple channels: phone, post, or arrange a loss adjuster visit who can also help capture photos\n"
+                 f"4. Check if the property is habitable β€” if not, arrange emergency accommodation\n"
+                 f"5. Send all correspondence in large print or alternative format if needed\n"
+                 f"6. Follow up proactively rather than waiting for the customer to chase\n"
+                 f"7. Record the vulnerability and adjustments made for ongoing review\n\n"
+                 f"**Training recommendation:** Retrain the initial handler on FCA FG21/1 "
+                 f"(Guidance for firms on the fair treatment of vulnerable customers)."
+             ),
+         },
+         "complaints_handling": {
+             "instruction": (
+                 f"{customer} has complained to {insurer} that their motor insurance claim "
+                 f"(for a theft-recovered vehicle) has been open for {random.randint(4, 8)} months "
+                 f"without resolution. The customer says they've had no updates for {random.randint(3, 6)} weeks "
+                 f"and can't reach their claims handler. They've threatened to go to the Financial Ombudsman. "
+                 f"Assess this complaint and recommend a resolution."
+             ),
+             "response_good": (
+                 f"## Complaints Assessment\n\n"
+                 f"**SEVERITY: HIGH β€” FOS referral imminent, Consumer Duty breach likely.**\n\n"
+                 f"**Timeline analysis:**\n"
+                 f"Under {FCA_REFERENCES['complaints']}, {insurer} must:\n"
+                 f"- Send a written acknowledgement within 5 business days (check: was this done?)\n"
+                 f"- Issue a Final Response within 8 weeks of the complaint\n"
+                 f"- If not resolved in 8 weeks, issue a 'deadlock letter' allowing FOS referral\n\n"
+                 f"The customer has a right to refer to FOS if:\n"
+                 f"- 8 weeks have passed since the complaint, OR\n"
+                 f"- A Final Response has been issued and they're dissatisfied\n"
+                 f"- FOS referral must be made within 6 months of the Final Response\n\n"
+                 f"**Consumer Duty assessment:**\n"
+                 f"- No updates for weeks = breach of Outcome 4 (Consumer Support)\n"
+                 f"- Extended delay without resolution = potential breach of Outcome 1\n"
+                 f"- Unable to reach handler = systemic support failure\n\n"
+                 f"**Recommended resolution:**\n"
+                 f"1. Immediate callback from a senior claims manager (today)\n"
+                 f"2. Full timeline review β€” why has this claim been open so long?\n"
+                 f"3. If the claim can be settled, make a settlement offer within 48 hours\n"
+                 f"4. Offer compensation for distress and inconvenience (FOS typical award: Β£150-Β£500)\n"
+                 f"5. Issue Final Response with clear explanation and FOS rights\n"
+                 f"6. Root cause analysis β€” workload management, handler turnover, process gaps\n\n"
+                 f"**FOS risk:** If this reaches FOS, the likely outcome is an upheld complaint "
+                 f"plus a compensation award. Resolve internally to avoid the Β£750 FOS case fee."
+             ),
+         },
+     }
+
+     scenario = scenarios[scenario_type]
+     return {
+         "instruction": scenario["instruction"],
+         "response": scenario["response_good"],
+         "category": "compliance_check",
+     }
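+
+ # Note: every call renders all six scenario texts eagerly (each f-string and
+ # randint above runs), then returns just the one drawn. Wasteful but simple.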
+
+
+ def gen_bordereaux_processing() -> dict:
+     mga = random.choice(MGA_NAMES)
+     syndicate = random.choice(LLOYDS_SYNDICATES)
+     month = random.choice(["January", "February", "March", "April", "May", "June",
+                            "July", "August", "September", "October", "November", "December"])
+     year = random.choice([2025, 2026])
+     line = random.choice(["Commercial Property", "Professional Indemnity", "Employers' Liability"])
+     num_risks = random.randint(50, 500)
+     total_gwp = random.randint(100000, 2000000)
+     num_claims = random.randint(3, 30)
+     paid_claims = random.randint(50000, 500000)
+     outstanding = random.randint(100000, 1000000)
+
+     instruction = (
+         f"You are processing the {month} {year} bordereaux submission from {mga} "
+         f"under their binding authority with {syndicate}.\n\n"
+         f"**Premium Bordereaux Summary:**\n"
+         f"- Line: {line}\n"
+         f"- New risks bound: {num_risks}\n"
+         f"- Total GWP: Β£{total_gwp:,}\n\n"
+         f"**Claims Bordereaux Summary:**\n"
+         f"- Open claims: {num_claims}\n"
+         f"- Paid this month: Β£{paid_claims:,}\n"
+         f"- Outstanding reserves: Β£{outstanding:,}\n\n"
+         f"Validate this bordereaux, identify any issues, and produce a summary for the syndicate."
+     )
+
+     # Crude synthetic loss ratio: incurred (paid + outstanding) over GWP,
+     # scaled by a random 6-12 month earned fraction of the year
+     loss_ratio = round((paid_claims + outstanding) / total_gwp * 100 / 12 * random.randint(6, 12), 1)
+
+     response = (
+         f"## Bordereaux Processing Report β€” {mga}\n\n"
+         f"**Binding Authority:** {syndicate}\n"
+         f"**Period:** {month} {year}\n"
+         f"**Line:** {line}\n\n"
+         f"### Premium Bordereaux Validation\n"
+         f"- Risks bound: {num_risks} βœ“\n"
+         f"- GWP: Β£{total_gwp:,}\n"
+         f"- Average premium per risk: Β£{total_gwp // num_risks:,}\n"
+         f"- Validation checks:\n"
+         f"  - [ ] All risks within binding authority limit? (Check individual risk cap)\n"
+         f"  - [ ] Policy inception dates within BA period?\n"
+         f"  - [ ] All territories within permitted scope?\n"
+         f"  - [ ] OFAC/sanctions screening completed?\n"
+         f"  - [ ] Unique Policy References (UPR) correctly formatted?\n\n"
+         f"### Claims Bordereaux Validation\n"
+         f"- Open claims: {num_claims}\n"
+         f"- Paid: Β£{paid_claims:,}\n"
+         f"- Outstanding: Β£{outstanding:,}\n"
+         f"- Incurred: Β£{paid_claims + outstanding:,}\n"
+         f"- Running loss ratio: {loss_ratio}%\n"
+         f"- Validation checks:\n"
+         f"  - [ ] All claim numbers match policy records?\n"
+         f"  - [ ] Reserve movements properly documented?\n"
+         f"  - [ ] Any claims approaching BA authority limit?\n"
+         f"  - [ ] Large loss notifications submitted for claims > 100% of BA threshold?\n\n"
+         f"### Flags\n"
+     )
+     if loss_ratio > 70:
+         response += f"- ⚠️ Loss ratio at {loss_ratio}% β€” exceeds 70% threshold for syndicate review\n"
+     if num_claims / num_risks > 0.1:
+         response += f"- ⚠️ Claims frequency ({num_claims}/{num_risks} = {num_claims/num_risks:.1%}) appears elevated\n"
+     if total_gwp / num_risks < 500:
+         response += f"- ⚠️ Average premium (Β£{total_gwp // num_risks:,}) appears low for {line} β€” check rating adequacy\n"
+     response += (
+         f"\n### Action Items\n"
+         f"1. Submit validated bordereaux to {syndicate} via XIS/LIMOSS\n"
+         f"2. Reconcile GWP against signed premium advice\n"
+         f"3. Update the burning cost analysis with latest claims data\n"
+         f"4. Schedule quarterly review meeting with {mga} management"
+     )
+
+     return {"instruction": instruction, "response": response, "category": "bordereaux_processing"}
+
+
+ def gen_fraud_assessment() -> dict:
+     line = random.choice(["Motor Private Car", "Home Buildings", "Home Contents",
+                           "Employers' Liability", "Public Liability"])
+     claim_num = _rand_claim_number()
+     insured = fake.name()
+     claim_value = random.randint(2000, 80000)
+
+     indicators = random.sample([
+         "Claim filed within 30 days of policy inception",
+         "Insured increased sum insured 2 weeks before the loss",
+         "Inconsistent dates in insured's statement vs police report",
+         "Property was listed for sale at time of claimed burglary",
+         "Multiple previous claims across different insurers in last 2 years",
+         "Loss occurred during a period when property should have been unoccupied",
+         "Claimed items include high-value electronics but no purchase receipts",
+         "Third party witness is a family member of the insured",
+         "Phone geo-location data places insured 200 miles from claimed loss location",
+         "Vehicle was in arrears on finance payments at time of 'theft'",
+         "Medical report for injury claim references a different incident date",
+         "Social media posts inconsistent with claimed injuries",
+     ], k=random.randint(2, 5))
+
+     instruction = (
+         f"Assess the following {line} claim for fraud indicators.\n\n"
+         f"Claim: {claim_num}\n"
+         f"Insured: {insured}\n"
+         f"Value: Β£{claim_value:,}\n\n"
+         f"**Indicators flagged by automated screening:**\n"
+     )
+     for indicator in indicators:
+         instruction += f"- {indicator}\n"
+     instruction += (
+         f"\nProvide a fraud risk assessment, recommend investigation steps, "
+         f"and note any regulatory considerations."
+     )
+
+     # 2 points per flagged indicator plus a little noise, capped at 10
+     score = min(10, len(indicators) * 2 + random.randint(0, 2))
+     risk_level = "HIGH" if score >= 7 else "MEDIUM" if score >= 4 else "LOW"
+
+     response = (
+         f"## Fraud Risk Assessment β€” {claim_num}\n\n"
+         f"**Risk Score:** {score}/10 ({risk_level})\n"
+         f"**Insured:** {insured}\n"
+         f"**Claim Value:** Β£{claim_value:,}\n\n"
+         f"**Analysis of Indicators:**\n"
+     )
+     for i, indicator in enumerate(indicators, 1):
+         response += f"{i}. **{indicator}** β€” "
+         if "inception" in indicator.lower():
+             response += "Classic red flag. Check insured's quote history and whether they approached multiple insurers.\n"
+         elif "increased" in indicator.lower():
+             response += "Suggests foreknowledge. Obtain the mid-term adjustment documentation.\n"
+         # "social media" must be matched before the generic "inconsistent" check,
+         # since that indicator text contains both phrases
+         elif "social media" in indicator.lower():
+             response += "Document with screenshots (dated). Admissible as evidence.\n"
+         elif "inconsistent" in indicator.lower():
+             response += "Material discrepancy. Requires detailed statement comparison.\n"
+         elif "sale" in indicator.lower():
+             response += "Financial pressure indicator. Check Land Registry and estate agent listings.\n"
+         elif "multiple" in indicator.lower():
+             response += "Check CUE (Claims & Underwriting Exchange) database for full claims history.\n"
+         elif "unoccupied" in indicator.lower():
+             response += "Policy conditions for unoccupied properties differ. Check occupancy requirements.\n"
+         elif "receipts" in indicator.lower():
+             response += "Request alternative proof: bank/credit card statements, warranty registrations.\n"
+         elif "family member" in indicator.lower():
+             response += "Independent witness corroboration needed. Check for collusion.\n"
+         elif "geo-location" in indicator.lower():
+             response += "Strong objective evidence. Cross-reference with call records and transaction data.\n"
+         elif "finance" in indicator.lower():
+             response += "Financial motive established. Check with finance house for payment status.\n"
+         elif "medical" in indicator.lower():
+             response += "Request GP records and A&E attendance records for verification.\n"
+         else:
+             response += "Requires further investigation.\n"
+
+     response += (
+         f"\n**Recommended Actions:**\n"
+         f"1. Refer to Special Investigations Unit (SIU)\n"
+         f"2. Appoint forensic investigator if warranted\n"
+         f"3. Obtain full CUE/CIFAS/IFB checks\n"
+         f"4. Request detailed signed statement from insured under reserve of rights\n"
+         f"5. {'Consider surveillance' if score >= 7 else 'Monitor for further indicators'}\n\n"
+         f"**Regulatory Notes:**\n"
+         f"- Do NOT deny the claim based on suspicion alone β€” investigation must be completed first\n"
+         f"- Maintain fair treatment under {FCA_REFERENCES['consumer_duty']} even during investigation\n"
+         f"- If fraud is confirmed, refer to the Insurance Fraud Bureau (IFB)\n"
+         f"- CIFAS marker can be applied if the insured is found to have made a fraudulent claim\n"
+         f"- Under the Insurance Act 2015 (commercial) / CIDRA 2012 (consumer), "
+         f"a fraudulent claim entitles the insurer to refuse the entire claim and "
+         f"recover any sums already paid"
+     )
+
+     return {"instruction": instruction, "response": response, "category": "fraud_assessment"}
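+
+ # Scoring example (illustrative): 4 flagged indicators give min(10, 8 + 0..2),
+ # i.e. a score of 8-10, which maps to the HIGH risk band (>= 7).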
+
+
+ def gen_underwriting_triage() -> dict:
+     line = random.choice(["Commercial Property", "Professional Indemnity",
+                           "Employers' Liability", "Cyber Liability", "D&O"])
+     business = fake.company()
+     turnover = random.randint(500000, 50000000)
+     employees = random.randint(5, 500)
+     inception = _rand_date(2026, 2026)
+     broker = random.choice(["Marsh UK", "Aon UK", "Willis Towers Watson",
+                             "Gallagher", "Howden", "Lockton UK"])
+
+     instruction = (
+         f"You are an underwriter at a Lloyd's syndicate. Triage the following new business submission.\n\n"
+         f"**Submission Summary:**\n"
+         f"- Broker: {broker}\n"
+         f"- Line: {line}\n"
+         f"- Insured: {business}\n"
+         f"- Turnover: Β£{turnover:,}\n"
+         f"- Employees: {employees}\n"
+         f"- Proposed Inception: {inception}\n"
+         f"- Trade: {random.choice(['IT consultancy', 'construction contractor', 'solicitors practice', 'healthcare provider', 'manufacturing', 'hospitality', 'financial services', 'retail chain'])}\n"
+         f"- Claims history: {random.choice(['Clean β€” no claims in 5 years', '1 claim Β£25K (2024) β€” resolved', '2 claims totalling Β£150K (2023-2024)', 'Active claim β€” Β£500K outstanding'])}\n\n"
+         f"Triage this submission: accept for quoting, decline, or request further information."
+     )
+
+     response = (
+         f"## Underwriting Triage β€” New Business\n\n"
+         f"**Broker:** {broker}\n"
+         f"**Proposed Insured:** {business}\n"
+         f"**Line:** {line}\n\n"
+         f"**Initial Assessment:**\n"
+         f"- Turnover: Β£{turnover:,} β€” {'Within appetite' if turnover < 20000000 else 'At upper end of appetite β€” referral required'}\n"
+         f"- Employees: {employees} β€” {'Standard' if employees < 200 else 'Large risk β€” may need specific terms'}\n\n"
+         f"**Information Required Before Quoting:**\n"
+         f"1. Full proposal form (ACORD application or equivalent)\n"
+         f"2. 5-year loss history with triangulation\n"
+         f"3. Current policy schedule (if renewal/transfer)\n"
+         f"4. Risk management procedures documentation\n"
+     )
+
+     if "Cyber" in line:
+         response += (
+             f"5. IT security assessment (pen test results, ISO 27001 status)\n"
+             f"6. Incident response plan\n"
+             f"7. Data inventory (volume of PII/PHI held)\n"
+             f"8. Third-party vendor risk assessment\n"
+         )
+     elif "Professional" in line:
+         response += (
+             f"5. Professional qualifications and regulatory status\n"
+             f"6. Engagement terms and client contract templates\n"
+             f"7. PI risk management procedures\n"
+         )
+     elif "Property" in line:
+         response += (
+             f"5. Building construction details and fire protection\n"
+             f"6. Business continuity plan\n"
+             f"7. Risk survey report (within last 2 years)\n"
+         )
+
+     # Use the same Β£20M appetite threshold as the initial assessment above
+     response += (
+         f"\n**Triage Decision:** {'ACCEPT for quoting β€” request further information above' if turnover < 20000000 else 'REFER to senior underwriter given risk size'}\n\n"
+         f"**Indicative Terms (subject to full underwriting):**\n"
+         f"- Rate: {random.uniform(0.1, 2.5):.2f}% of turnover\n"
+         f"- Minimum premium: {_rand_gbp(2500, 15000)}\n"
+         f"- Deductible: {_rand_gbp(1000, 25000)}\n"
+         f"- Limit: {_rand_gbp(1000000, 10000000)}\n"
+     )
+
+     return {"instruction": instruction, "response": response, "category": "underwriting_triage"}
+
+
+ def gen_customer_communication() -> dict:
+     # Only the three scenario branches implemented below are drawn
+     scenario = random.choice([
+         "claim_acknowledgement", "claim_settlement_offer", "coverage_clarification"
+     ])
+     customer = fake.name()
+     insurer = random.choice(UK_INSURERS)
+     claim_num = _rand_claim_number()
+     policy_num = _rand_policy_number()
+
+     if scenario == "claim_acknowledgement":
+         instruction = (
+             f"Draft a claim acknowledgement letter from {insurer} to {customer} "
+             f"for claim {claim_num} (home escape of water). The letter must comply "
+             f"with FCA Consumer Duty plain English requirements."
+         )
+         response = (
+             f"Dear {customer},\n\n"
+             f"**Your Claim β€” Reference: {claim_num}**\n\n"
+             f"Thank you for letting us know about the water damage to your property. "
+             f"We're sorry you're dealing with this and we want to help resolve it as quickly as possible.\n\n"
+             f"**What happens next:**\n\n"
+             f"1. **Your claim handler** β€” {fake.name()} will be looking after your claim. "
+             f"You can reach them on {fake.phone_number()} or by email.\n\n"
+             f"2. **We may need to visit** β€” depending on the extent of the damage, we may arrange "
+             f"for a loss adjuster to visit your property. If so, we'll contact you to arrange "
+             f"a convenient time.\n\n"
+             f"3. **What we need from you:**\n"
+             f"   - Photos of the damage (if safe to take)\n"
+             f"   - Any emergency repair receipts (keep these safe β€” we'll reimburse reasonable costs)\n"
+             f"   - Details of how the leak happened, if known\n\n"
+             f"4. **Emergency repairs** β€” if you need to stop further damage (e.g., turning off water, "
+             f"temporary repairs), please go ahead. Your policy covers reasonable emergency costs.\n\n"
+             f"5. **Timeline** β€” we aim to settle straightforward claims within 30 days. "
+             f"If your claim is more complex, we'll keep you updated every step of the way.\n\n"
+             f"**Your rights:**\n"
+             f"If at any point you're unhappy with how we're handling your claim, please tell us. "
+             f"You can make a complaint and, if we can't resolve it, you can refer to the "
+             f"Financial Ombudsman Service (FOS) free of charge. Details: www.financial-ombudsman.org.uk\n\n"
+             f"Kind regards,\n"
+             f"{fake.name()}\n"
+             f"Claims Team, {insurer}"
+         )
+     elif scenario == "claim_settlement_offer":
+         amount = random.randint(1000, 25000)
+         excess = random.choice([100, 250, 500])
+         instruction = (
+             f"Draft a settlement offer letter from {insurer} to {customer} for claim {claim_num}. "
+             f"The assessed amount is Β£{amount:,} with a Β£{excess} excess."
+         )
+         response = (
+             f"Dear {customer},\n\n"
+             f"**Settlement Offer β€” Claim: {claim_num}**\n\n"
+             f"We've now completed our assessment of your claim. Here's our offer:\n\n"
+             f"| Item | Amount |\n"
+             f"|---|---|\n"
+             f"| Assessed claim value | Β£{amount:,} |\n"
+             f"| Less: your policy excess | -Β£{excess:,} |\n"
+             f"| **Settlement amount** | **Β£{amount - excess:,}** |\n\n"
+             f"**How we calculated this:**\n"
+             f"We assessed the cost of repair/replacement based on our adjuster's report "
+             f"and current market prices. If you'd like a detailed breakdown, please ask.\n\n"
+             f"**If you agree:** Simply reply to this letter or call us to accept. "
+             f"We'll transfer Β£{amount - excess:,} to your bank account within 5 working days.\n\n"
+             f"**If you disagree:** You don't have to accept this offer. Please let us know "
+             f"why you think the amount should be different, and we'll review. You can also:\n"
+             f"- Obtain your own independent estimate and send it to us\n"
+             f"- Make a complaint through our complaints process\n"
+             f"- Refer to the Financial Ombudsman Service if we can't agree\n\n"
+             f"This offer remains open for 30 days.\n\n"
+             f"Kind regards,\n"
+             f"{fake.name()}\n"
+             f"Claims Team, {insurer}"
+         )
+     else:
+         instruction = (
+             f"Draft a plain English explanation for {customer} about what 'proximate cause' "
+             f"means in the context of their {random.choice(['home', 'motor'])} insurance claim. "
+             f"They don't understand why the insurer is investigating the cause before paying."
+         )
+         response = (
+             f"Dear {customer},\n\n"
+             f"Thank you for your question β€” it's completely understandable to want to know why we need "
+             f"to investigate before we can pay your claim.\n\n"
+             f"**What 'proximate cause' means in plain English:**\n\n"
+             f"When something goes wrong with your home or car, there's often a chain of events. "
+             f"'Proximate cause' simply means: **what was the main reason the damage happened?**\n\n"
+             f"Your insurance policy covers specific causes (called 'perils') β€” things like storms, "
+             f"fire, theft, or accidental damage. It doesn't cover others β€” like wear and tear or "
+             f"gradual deterioration.\n\n"
+             f"**Why it matters for your claim:**\n\n"
+             f"We need to establish *why* the damage happened to check it's covered by your policy. "
+             f"For example:\n"
+             f"- If your roof leaked because of storm damage β†’ that's likely covered βœ“\n"
+             f"- If your roof leaked because the tiles were old and worn out β†’ that's wear and tear, "
+             f"which isn't usually covered βœ—\n"
+             f"- If the storm made existing wear worse β†’ we look at which cause was 'dominant' β€” "
+             f"if the storm was the main cause, the claim should be covered even though the roof was old\n\n"
+             f"**We're not trying to avoid paying** β€” we're making sure we apply your policy correctly "
+             f"and fairly, as required by our regulator (the FCA).\n\n"
+             f"We'll have the investigation results within [X] days and will contact you as soon as we do.\n\n"
+             f"If you have any questions, please call us on {fake.phone_number()}.\n\n"
+             f"Kind regards,\n"
+             f"{fake.name()}\n"
+             f"Claims Team, {insurer}"
+         )
+
+     return {"instruction": instruction, "response": response, "category": "customer_communication"}
860
+
861
+
862
+ def gen_reserve_setting() -> dict:
863
+ line = random.choice(list(CLAIM_TYPES.keys()))
864
+ claim_type = random.choice(CLAIM_TYPES[line])
865
+ claim_num = _rand_claim_number()
866
+ insured = fake.name()
867
+ current_reserve = random.randint(5000, 100000)
868
+ new_info = random.choice([
869
+ "Loss adjuster report received β€” damage more extensive than initially reported",
870
+ "Third party solicitor has made contact β€” injury claim likely to increase",
871
+ "Subrogation recovery of Β£15,000 confirmed from third party insurer",
872
+ "Repair costs quote received β€” significantly lower than initial estimate",
873
+ "Medical report indicates longer recovery period β€” reserve increase needed",
874
+ "Fraud investigation cleared β€” claim confirmed genuine, proceed to settlement",
875
+ ])
876
+
877
+ instruction = (
878
+ f"Review and update the reserve for claim {claim_num}.\n\n"
879
+ f"**Current Details:**\n"
880
+ f"- Insured: {insured}\n"
881
+ f"- Line: {line}\n"
882
+ f"- Type: {claim_type}\n"
883
+ f"- Current reserve: Β£{current_reserve:,}\n"
884
+ f"- New information: {new_info}\n\n"
885
+ f"Assess whether the reserve should be increased, decreased, or maintained. "
886
+ f"Provide your rationale."
887
+ )
888
+
889
+ if "more extensive" in new_info or "injury" in new_info or "longer recovery" in new_info:
890
+ adjustment = random.randint(10000, 50000)
891
+ new_reserve = current_reserve + adjustment
892
+ direction = "INCREASE"
893
+ elif "recovery" in new_info or "lower" in new_info:
894
+ adjustment = random.randint(1000, max(1000, min(30000, current_reserve - 1000)))
895
+ new_reserve = current_reserve - adjustment
896
+ direction = "DECREASE"
897
+ else:
898
+ new_reserve = current_reserve
899
+ direction = "MAINTAIN"
900
+ adjustment = 0
901
+
902
+ response = (
903
+ f"## Reserve Review β€” {claim_num}\n\n"
904
+ f"**Recommendation:** {direction}\n\n"
905
+ f"| | Amount |\n"
906
+ f"|---|---|\n"
907
+ f"| Previous reserve | Β£{current_reserve:,} |\n"
908
+ f"| Adjustment | {'+'if direction == 'INCREASE' else '-' if direction == 'DECREASE' else ''}Β£{adjustment:,} |\n"
909
+ f"| **New reserve** | **Β£{new_reserve:,}** |\n\n"
910
+ f"**Rationale:**\n"
911
+ f"Based on the new information β€” {new_info.lower()} β€” the reserve should be "
912
+ f"{'increased to reflect the higher expected claim cost' if direction == 'INCREASE' else 'decreased to reflect reduced expected cost' if direction == 'DECREASE' else 'maintained as the new information does not materially change the expected outcome'}.\n\n"
913
+ f"**Factors considered:**\n"
914
+ f"1. Original claim notification and circumstances\n"
915
+ f"2. Updated damage assessment / medical evidence / legal correspondence\n"
916
+ f"3. Benchmarking against similar claims (industry loss data)\n"
917
+ f"4. Potential for future development (claims typically develop over {random.randint(6, 24)} months for {line})\n"
918
+ f"5. Subrogation/recovery prospects\n\n"
919
+ f"**Approval required:** {'Yes β€” exceeds Β£50K reserve authority' if new_reserve > 50000 else 'No β€” within claims handler authority'}\n\n"
920
+ f"**Bordereaux impact:** Updated reserve to be reflected in next monthly claims bordereaux to capacity provider."
921
+ )
922
+
923
+ return {"instruction": instruction, "response": response, "category": "reserve_setting"}
924
+
925
+
926
+ def gen_jargon_explanation() -> dict:
927
+ term = random.choice(list(INSURANCE_JARGON.keys()))
928
+ definition = INSURANCE_JARGON[term]
929
+ context_line = random.choice(LINES_OF_BUSINESS[:10])
930
+
931
+ instruction = (
932
+ f"A policyholder with a {context_line} policy has asked: "
933
+ f"\"What does '{term}' mean? I saw it in my policy document and I don't understand it.\"\n\n"
934
+ f"Explain this in plain English, with a practical example relevant to their policy type."
935
+ )
936
+
937
+ examples = {
938
+ "utmost good faith": (
939
+ f"if you're applying for {context_line} insurance and you know about a previous claim "
940
+ f"or a pre-existing issue, you must tell the insurer. If you hide it and later make a claim, "
941
+ f"the insurer could refuse to pay."
942
+ ),
943
+ "subrogation": (
944
+ f"say someone else caused damage to your {context_line.lower().replace('home ', 'home')}. "
945
+ f"Your insurer pays your claim, then chases the person responsible (or their insurer) to "
946
+ f"get the money back. You don't need to do anything β€” the insurer handles it."
947
+ ),
948
+ "indemnity": (
949
+ f"your insurer will put you back in the financial position you were in just before the loss. "
950
+ f"They won't make you better off (no profit from a claim) and won't leave you worse off. "
951
+ f"So for a {context_line.lower()} claim, they'd cover the cost of repair or replacement "
952
+ f"to the same standard as before."
953
+ ),
954
+ "excess": (
955
+ f"it's the first part of any claim you pay yourself. For example, if your {context_line.lower()} "
956
+ f"policy has a Β£250 excess and your claim is for Β£2,000, you pay Β£250 and the insurer pays Β£1,750."
957
+ ),
958
+ }
959
+
960
+ example = examples.get(term, (
961
+ f"In the context of your {context_line} policy, this means: {definition} "
962
+ f"For instance, if you needed to make a claim, this term would affect how "
963
+ f"the insurer assesses and pays that claim."
964
+ ))
965
+
966
+ response = (
967
+ f"Great question β€” '{term}' is one of those insurance terms that sounds complicated "
968
+ f"but is actually straightforward.\n\n"
969
+ f"**In plain English:** {definition}\n\n"
970
+ f"**How this applies to your {context_line} policy:**\n"
971
+ f"For example, {example}\n\n"
972
+ f"**Why it matters:** Understanding this term helps you know your rights and "
973
+ f"what to expect if you ever need to claim. Under the FCA's Consumer Duty rules, "
974
+ f"your insurer should explain these terms clearly β€” so if anything else in your "
975
+ f"policy document doesn't make sense, don't hesitate to ask."
976
+ )
977
+
978
+ return {"instruction": instruction, "response": response, "category": "jargon_explanation"}
979
+
980
+
981
+ def gen_regulatory_query() -> dict:
982
+ query_type = random.choice([
983
+ "consumer_duty_overview",
984
+ "fos_process",
985
+ "solvency_ii",
986
+ "lloyd_s_market",
987
+ "gender_pricing",
988
+ "claims_management_company",
989
+ ])
990
+
991
+ queries = {
992
+ "consumer_duty_overview": {
993
+ "instruction": (
994
+ "A new claims handler asks: 'What is the FCA Consumer Duty and how does it affect "
995
+ "how I handle claims on a daily basis? What could go wrong if I don't comply?'"
996
+ ),
997
+ "response": (
998
+ f"## FCA Consumer Duty β€” Practical Guide for Claims Handlers\n\n"
999
+ f"**What it is:** {FCA_REFERENCES['consumer_duty']} is the FCA's highest-priority regulation. "
1000
+ f"It requires all firms to 'act to deliver good outcomes for retail customers.'\n\n"
1001
+ f"**The 4 outcomes you must deliver on every claim:**\n\n"
1002
+ f"1. **Products & Services** β€” The policy must be designed to meet the customer's needs. "
1003
+ f"If you notice the policy doesn't cover what the customer reasonably expected, flag it.\n\n"
1004
+ f"2. **Price & Value** β€” The customer has paid a fair premium for meaningful cover. "
1005
+ f"Don't use technical loopholes to avoid paying legitimate claims.\n\n"
1006
+ f"3. **Consumer Understanding** β€” Communicate in plain English. No jargon in claim letters. "
1007
+ f"Explain decisions clearly. If declining part of a claim, explain WHY in terms they can understand.\n\n"
1008
+ f"4. **Consumer Support** β€” Be accessible. Respond promptly. Identify vulnerable customers "
1009
+ f"and provide additional support. Don't make the claims process unnecessarily difficult.\n\n"
1010
+ f"**What could go wrong:**\n"
1011
+ f"- FCA enforcement action and fines (up to unlimited amount)\n"
1012
+ f"- FOS complaints upheld against the firm (Β£750 per case fee + compensation)\n"
1013
+ f"- Personal regulatory action against individuals who systematically breach the Duty\n"
1014
+ f"- Reputational damage in the market\n\n"
1015
+ f"**Daily practical tips:**\n"
1016
+ f"- Would I be comfortable if the FCA read this email/letter? If not, rewrite it.\n"
1017
+ f"- Am I explaining the decision or just quoting policy clause numbers? Explain.\n"
1018
+ f"- Has the customer been waiting more than expected for an update? Call them proactively.\n"
1019
+ f"- Does the customer seem confused or distressed? Offer additional support."
1020
+ ),
1021
+ },
1022
+ "fos_process": {
1023
+ "instruction": "Explain the Financial Ombudsman Service complaints process for insurance disputes in the UK.",
1024
+ "response": (
1025
+ f"## Financial Ombudsman Service (FOS) β€” Insurance Complaints Process\n\n"
1026
+ f"**What is FOS?** A free, independent service that resolves disputes between consumers "
1027
+ f"and financial firms in the UK.\n\n"
1028
+ f"**Who can use FOS?** Consumers, micro-enterprises (under 10 employees + under €2M turnover), "
1029
+ f"and charities with annual income under Β£6.5M.\n\n"
1030
+ f"**The process:**\n\n"
1031
+ f"1. **Complain to your insurer first** β€” the insurer has 8 weeks to issue a Final Response\n"
1032
+ f"2. **If unsatisfied**, refer to FOS within 6 months of the Final Response\n"
1033
+ f"3. **If 8 weeks pass** without a Final Response, you can refer immediately\n"
1034
+ f"4. **FOS investigates** β€” reviews both sides, may request documents, may mediate\n"
1035
+ f"5. **Adjudicator's opinion** β€” first assessment\n"
1036
+ f"6. **If either party disagrees** β€” can request an Ombudsman's Final Decision (binding on the firm, not the consumer)\n\n"
1037
+ f"**Costs:**\n"
1038
+ f"- Free for consumers\n"
1039
+ f"- Firms pay Β£750 case fee per complaint (after first 3 free per year)\n\n"
1040
+ f"**Compensation limits:**\n"
1041
+ f"- Up to Β£430,000 for complaints about acts after 1 April 2024\n"
1042
+ f"- Plus fair compensation for distress and inconvenience\n\n"
1043
+ f"**For insurers:** FOS decisions are published and create informal precedent. "
1044
+ f"High upheld rates damage reputation and invite FCA scrutiny."
1045
+ ),
1046
+ },
1047
+ "gender_pricing": {
1048
+ "instruction": (
1049
+ "Can we use gender as a rating factor in our motor insurance pricing model? "
1050
+ "What's the current UK legal position?"
1051
+ ),
1052
+ "response": (
1053
+ f"## Gender-Based Insurance Pricing β€” UK Legal Position\n\n"
1054
+ f"**Short answer: NO.** You cannot use gender as a rating factor for insurance pricing in the UK.\n\n"
1055
+ f"**Legal basis:**\n"
1056
+ f"- **EU Gender Directive (2004/113/EC)** as interpreted by the Court of Justice (Test-Achats, C-236/09, 2011)\n"
1057
+ f"- This was implemented in the UK and remains part of retained UK law post-Brexit\n"
1058
+ f"- The **{FCA_REFERENCES['equality_act']}** prohibits direct discrimination based on sex in the provision of services\n\n"
1059
+ f"**What this means in practice:**\n"
1060
+ f"1. You CANNOT charge different premiums based on the policyholder's gender\n"
1061
+ f"2. You CANNOT use gender as a proxy rating factor (e.g., using first name to infer gender)\n"
1062
+ f"3. You CAN use factors that correlate with risk even if they also correlate with gender "
1063
+ f"(e.g., mileage, occupation, vehicle type) β€” provided the factor independently predicts risk\n"
1064
+ f"4. Your pricing model must be able to demonstrate that no protected characteristic "
1065
+ f"is being used directly or as a proxy\n\n"
1066
+ f"**FCA scrutiny:**\n"
1067
+ f"The FCA actively monitors pricing models for fairness under {FCA_REFERENCES['pricing_practices']}. "
1068
+ f"Firms must be able to justify every rating factor.\n\n"
1069
+ f"**Action:** Review all rating factors with your actuarial team. Document the independent "
1070
+ f"risk justification for each factor. Remove any factor that can be shown to primarily "
1071
+ f"serve as a gender proxy."
1072
+ ),
1073
+ },
1074
+ }
1075
+
1076
+ # Query types without a dedicated entry above (solvency_ii, lloyd_s_market,
+ # claims_management_company) fall back to the Consumer Duty overview.
+ q = queries.get(query_type, queries["consumer_duty_overview"])
1077
+ return {"instruction": q["instruction"], "response": q["response"], "category": "regulatory_query"}
1078
+
1079
+
1080
+ def gen_renewal_review() -> dict:
1081
+ customer = fake.name()
1082
+ insurer = random.choice(UK_INSURERS)
1083
+ policy_num = _rand_policy_number()
1084
+ line = random.choice(["Motor Private Car", "Home Combined", "Commercial Combined"])
1085
+ current_premium = random.randint(200, 5000)
1086
+ new_premium = int(current_premium * random.uniform(0.9, 1.5))
1087
+ change_pct = round((new_premium - current_premium) / current_premium * 100, 1)
1088
+
1089
+ instruction = (
1090
+ f"Review the following renewal for compliance and customer outcome.\n\n"
1091
+ f"**Renewal Details:**\n"
1092
+ f"- Policyholder: {customer}\n"
1093
+ f"- Policy: {policy_num}\n"
1094
+ f"- Line: {line}\n"
1095
+ f"- Current premium: Β£{current_premium:,}\n"
1096
+ f"- Renewal premium: Β£{new_premium:,} ({'+' if change_pct > 0 else ''}{change_pct}%)\n"
1097
+ f"- Claims in period: {random.choice(['None', '1 small claim (Β£800)', '1 claim (Β£3,500)', 'None'])}\n"
1098
+ f"- Customer tenure: {random.randint(1, 15)} years\n\n"
1099
+ f"Assess whether this renewal is fair and compliant."
1100
+ )
1101
+
1102
+ response = (
1103
+ f"## Renewal Review β€” {policy_num}\n\n"
1104
+ f"**Premium Change:** {'+'if change_pct > 0 else ''}{change_pct}% "
1105
+ f"(Β£{current_premium:,} β†’ Β£{new_premium:,})\n\n"
1106
+ f"**Compliance Checks:**\n\n"
1107
+ )
1108
+
1109
+ if "Motor" in line or "Home" in line:
1110
+ response += (
1111
+ f"1. **Price Walking Check ({FCA_REFERENCES['pricing_practices']}):** "
1112
+ f"{'PASS β€” renewal premium must not exceed equivalent new business price. Verify with pricing team.' if change_pct < 20 else 'FLAG β€” significant increase requires NB equivalence verification.'}\n\n"
1113
+ )
1114
+ else:
1115
+ response += f"1. **Commercial policy β€” PS21/14 does not apply**, but fair treatment principles still do.\n\n"
1116
+
1117
+ response += (
1118
+ f"2. **Consumer Duty Value Assessment:** "
1119
+ f"{'The product appears to offer fair value at this price point.' if change_pct < 25 else 'Significant price increase β€” requires value assessment documentation.'}\n\n"
1120
+ f"3. **Communication Check:** Renewal notice must clearly show:\n"
1121
+ f" - Last year's premium for comparison βœ“\n"
1122
+ f" - Any changes to cover βœ“\n"
1123
+ f" - Reminder to shop around ('you may be able to get this cover cheaper elsewhere') βœ“\n"
1124
+ f" - Cancellation rights βœ“\n\n"
1125
+ f"4. **Auto-Renewal Disclosure:** If this policy auto-renews, the notice must clearly state this "
1126
+ f"and explain how to opt out.\n\n"
1127
+ f"**Recommendation:** {'Proceed with renewal' if change_pct < 30 else 'Review pricing justification before issuing'}"
1128
+ )
1129
+
1130
+ return {"instruction": instruction, "response": response, "category": "renewal_review"}
1131
+
1132
+
1133
+ # ── Master Generator ──
1134
+
1135
+ GENERATORS = {
1136
+ "claims_handling": gen_claims_handling,
1137
+ "policy_analysis": gen_policy_analysis,
1138
+ "fnol": gen_fnol,
1139
+ "compliance_check": gen_compliance_check,
1140
+ "bordereaux_processing": gen_bordereaux_processing,
1141
+ "fraud_assessment": gen_fraud_assessment,
1142
+ "underwriting_triage": gen_underwriting_triage,
1143
+ "customer_communication": gen_customer_communication,
1144
+ "reserve_setting": gen_reserve_setting,
1145
+ "jargon_explanation": gen_jargon_explanation,
1146
+ "regulatory_query": gen_regulatory_query,
1147
+ "renewal_review": gen_renewal_review,
1148
+ }
1149
+
1150
+
1151
+ def generate_sft_dataset(n: int = 10000, output_path: str = "data/output/insurance_sft_10k.jsonl"):
1152
+ """Generate n SFT examples, balanced across categories."""
1153
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
1154
+
1155
+ categories = list(GENERATORS.keys())
1156
+ per_category = n // len(categories)
1157
+ remainder = n % len(categories)
1158
+
1159
+ records = []
1160
+ for i, cat in enumerate(categories):
1161
+ count = per_category + (1 if i < remainder else 0)
1162
+ gen_fn = GENERATORS[cat]
1163
+ for _ in tqdm(range(count), desc=f"Generating {cat}"):
1164
+ record = gen_fn()
1165
+ # Format as chat for Qwen3 training
1166
+ records.append({
1167
+ "messages": [
1168
+ {"role": "system", "content": "You are InsureLLM, a specialist UK insurance AI assistant trained on UK insurance law, FCA regulations, Lloyd's market practices, and ACORD standards. You provide accurate, compliant, and plain-English guidance for insurance professionals and consumers."},
1169
+ {"role": "user", "content": record["instruction"]},
1170
+ {"role": "assistant", "content": record["response"]},
1171
+ ],
1172
+ "category": record["category"],
1173
+ })
1174
+
1175
+ random.shuffle(records)
1176
+
1177
+ with open(output_path, "w") as f:
1178
+ for record in records:
1179
+ f.write(json.dumps(record, ensure_ascii=False) + "\n")
1180
+
1181
+ print(f"\nβœ“ Generated {len(records)} SFT examples β†’ {output_path}")
1182
+ # Print category distribution
1183
+ from collections import Counter
1184
+ dist = Counter(r["category"] for r in records)
1185
+ for cat, count in sorted(dist.items()):
1186
+ print(f" {cat}: {count}")
1187
+
1188
+ return output_path
1189
+
1190
+
1191
+ if __name__ == "__main__":
1192
+ generate_sft_dataset()
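
The generator above writes chat-format JSONL. A minimal sketch (assuming the default output path) for loading the file and checking the category balance before training:

```python
import json
from collections import Counter

# Path matches the default in generate_sft_dataset(); adjust if you changed it.
path = "data/output/insurance_sft_10k.jsonl"

records = []
with open(path) as f:
    for line in f:
        records.append(json.loads(line))

# Each record: {"messages": [system, user, assistant], "category": str}
print(f"{len(records)} examples")
for category, count in sorted(Counter(r["category"] for r in records).items()):
    print(f"  {category}: {count}")

# Spot-check one example's role ordering
roles = [m["role"] for m in records[0]["messages"]]
assert roles == ["system", "user", "assistant"]
```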
data/gen_tabular.py ADDED
@@ -0,0 +1,343 @@
1
+ """
2
+ InsureOS β€” Synthetic Tabular Claims Data Generator
3
+ Generates 50K synthetic UK insurance claims for fraud detection and pricing models.
4
+ """
5
+
6
+ import csv
7
+ import json
8
+ import os
9
+ import random
10
+ from datetime import datetime, timedelta
11
+ from pathlib import Path
12
+
13
+ from faker import Faker
14
+ from tqdm import tqdm
15
+
16
+ from data.constants import (
17
+ UK_INSURERS, UK_REGIONS, LLOYDS_SYNDICATES, MGAS,
18
+ CLAIM_TYPES, LINES_OF_BUSINESS,
19
+ )
20
+
21
+ fake = Faker("en_GB")
22
+ Faker.seed(44)
23
+ random.seed(44)
24
+
25
+ # ── Vehicle data ──
26
+
27
+ MAKES_MODELS = {
28
+ "Ford": ["Fiesta", "Focus", "Puma", "Kuga", "Ranger"],
29
+ "Vauxhall": ["Corsa", "Astra", "Mokka", "Grandland"],
30
+ "Volkswagen": ["Golf", "Polo", "T-Roc", "Tiguan", "ID.3"],
31
+ "BMW": ["1 Series", "3 Series", "X1", "X3", "iX"],
32
+ "Mercedes": ["A-Class", "C-Class", "GLA", "GLC"],
33
+ "Audi": ["A3", "A4", "Q3", "Q5"],
34
+ "Toyota": ["Yaris", "Corolla", "C-HR", "RAV4"],
35
+ "Nissan": ["Juke", "Qashqai", "Leaf", "X-Trail"],
36
+ "Kia": ["Picanto", "Sportage", "Niro", "EV6"],
37
+ "Hyundai": ["i10", "i20", "Tucson", "IONIQ 5"],
38
+ "Tesla": ["Model 3", "Model Y"],
39
+ "Peugeot": ["208", "2008", "308", "3008"],
40
+ "Renault": ["Clio", "Captur", "Megane E-Tech"],
41
+ }
42
+
43
+ FUEL_TYPES = ["Petrol", "Diesel", "Hybrid", "Electric", "Plug-in Hybrid"]
44
+
45
+ OCCUPATIONS = [
46
+ "Accountant", "Teacher", "Software Engineer", "Nurse", "Manager",
47
+ "Retired", "Student", "Self-employed", "Civil Servant", "Sales Executive",
48
+ "Driver", "Electrician", "Solicitor", "Architect", "Doctor",
49
+ "Chef", "Farmer", "Mechanic", "Journalist", "Pharmacist",
50
+ ]
51
+
52
+ PROPERTY_TYPES = ["Detached", "Semi-detached", "Terraced", "Flat", "Bungalow", "Converted Flat"]
53
+ HEATING_TYPES = ["Gas Central", "Electric", "Oil", "Heat Pump", "LPG"]
54
+ FLOOD_RISK = ["Negligible", "Low", "Medium", "High"]
55
+ SUBSIDENCE_RISK = ["No", "Historic – resolved", "Active"]
56
+
57
+
58
+ # ── Motor claim record ──
59
+
60
+ def gen_motor_claim(claim_id: int, is_fraud: bool = False) -> dict:
61
+ region_name, region = random.choice(list(UK_REGIONS.items()))
62
+ postcode_prefix = random.choice(region)
63
+ make = random.choice(list(MAKES_MODELS.keys()))
64
+ model = random.choice(MAKES_MODELS[make])
65
+ vehicle_year = random.randint(2012, 2025)
66
+ vehicle_value = random.randint(3000, 65000)
67
+ driver_age = random.randint(18, 80)
68
+
69
+ inception = fake.date_between(start_date="-3y", end_date="-30d")
70
+ loss_date = fake.date_between(start_date=inception, end_date="today")
71
+ report_date = loss_date + timedelta(days=random.randint(0, 14))
72
+ settlement_date = report_date + timedelta(days=random.randint(5, 120)) if random.random() > 0.25 else None
73
+
74
+ claim_type = random.choice(CLAIM_TYPES["Motor Private Car"])
75
+ reserve = random.randint(500, 40000)
76
+
77
+ # Fraud signals
78
+ if is_fraud:
79
+ # Exaggerated patterns
80
+ claim_amount = reserve * random.uniform(1.5, 4.0)
81
+ days_to_report = random.randint(10, 30)
82
+ previous_claims_3y = random.randint(2, 6)
83
+ policy_age_days = random.randint(10, 90) # New policy
84
+ witnesses = 0
85
+ dashcam = False
86
+ police_report = random.random() < 0.15
87
+ time_of_loss = f"{random.randint(22, 23):02d}:{random.randint(0, 59):02d}"
88
+ else:
89
+ claim_amount = reserve * random.uniform(0.5, 1.3)
90
+ days_to_report = random.randint(0, 7)
91
+ previous_claims_3y = random.choices([0, 1, 2, 3], weights=[60, 25, 10, 5])[0]
92
+ policy_age_days = random.randint(30, 1095)
93
+ witnesses = random.choices([0, 1, 2], weights=[30, 50, 20])[0]
94
+ dashcam = random.random() < 0.4
95
+ police_report = random.random() < 0.5
96
+ time_of_loss = f"{random.randint(6, 21):02d}:{random.randint(0, 59):02d}"
97
+
98
+ return {
99
+ "claim_id": f"MTR-{claim_id:06d}",
100
+ "lob": "Motor",
101
+ "insurer": random.choice(UK_INSURERS),
102
+ "region": region_name,
103
+ "postcode_prefix": postcode_prefix,
104
+ "inception_date": inception.isoformat(),
105
+ "loss_date": loss_date.isoformat(),
106
+ "report_date": report_date.isoformat(),
107
+ "settlement_date": settlement_date.isoformat() if settlement_date else "",
108
+ "claim_type": claim_type,
109
+ "claim_status": random.choice(["Open", "Settled", "Declined", "Reserved"]) if not settlement_date else "Settled",
110
+ # Driver
111
+ "driver_age": driver_age,
112
+ "driver_gender": random.choice(["M", "F"]),
113
+ "occupation": random.choice(OCCUPATIONS),
114
+ "years_driving": max(0, driver_age - random.randint(17, 25)),
115
+ "years_ncd": random.randint(0, min(20, max(0, driver_age - 18))),
116
+ # Vehicle
117
+ "vehicle_make": make,
118
+ "vehicle_model": model,
119
+ "vehicle_year": vehicle_year,
120
+ "vehicle_value": round(vehicle_value, 0),
121
+ "fuel_type": random.choice(FUEL_TYPES),
122
+ "annual_mileage": random.choice([5000, 8000, 10000, 12000, 15000, 20000]),
123
+ # Financial
124
+ "premium": round(random.uniform(300, 3500), 2),
125
+ "voluntary_excess": random.choice([0, 100, 250, 500, 750]),
126
+ "compulsory_excess": random.choice([150, 250, 350]),
127
+ "reserve_amount": round(reserve, 2),
128
+ "claim_amount": round(claim_amount, 2),
129
+ "recovery_amount": round(claim_amount * random.uniform(0, 0.5), 2) if random.random() < 0.3 else 0.0,
130
+ # Risk indicators
131
+ "previous_claims_3y": previous_claims_3y,
132
+ "days_to_report": days_to_report,
133
+ "policy_age_days": policy_age_days,
134
+ "witnesses": witnesses,
135
+ "dashcam": dashcam,
136
+ "police_report": police_report,
137
+ "time_of_loss": time_of_loss,
138
+ # Target
139
+ "is_fraud": is_fraud,
140
+ }
141
+
142
+
143
+ # ── Home/Property claim record ──
144
+
145
+ def gen_property_claim(claim_id: int, is_fraud: bool = False) -> dict:
146
+ region_name, region = random.choice(list(UK_REGIONS.items()))
147
+ postcode_prefix = random.choice(region)
148
+ property_type = random.choice(PROPERTY_TYPES)
149
+ rebuild_value = random.randint(150000, 800000)
150
+ contents_value = random.randint(20000, 150000)
151
+ property_age = random.randint(1, 200)
152
+
153
+ inception = fake.date_between(start_date="-3y", end_date="-30d")
154
+ loss_date = fake.date_between(start_date=inception, end_date="today")
155
+ report_date = loss_date + timedelta(days=random.randint(0, 14))
156
+
157
+ claim_type = random.choice(CLAIM_TYPES["Home Buildings"] + CLAIM_TYPES["Home Contents"])
158
+ reserve = random.randint(1000, 80000)
159
+
160
+ if is_fraud:
161
+ claim_amount = reserve * random.uniform(2.0, 5.0)
162
+ days_to_report = random.randint(12, 45)
163
+ previous_claims_3y = random.randint(2, 5)
164
+ policy_age_days = random.randint(15, 60)
165
+ has_cctv = False
166
+ loss_adjuster_appointed = True
167
+ else:
168
+ claim_amount = reserve * random.uniform(0.6, 1.4)
169
+ days_to_report = random.randint(0, 7)
170
+ previous_claims_3y = random.choices([0, 1, 2], weights=[70, 22, 8])[0]
171
+ policy_age_days = random.randint(30, 1825)
172
+ has_cctv = random.random() < 0.25
173
+ loss_adjuster_appointed = random.random() < 0.4
174
+
175
+ return {
176
+ "claim_id": f"PRP-{claim_id:06d}",
177
+ "lob": "Property",
178
+ "insurer": random.choice(UK_INSURERS),
179
+ "region": region_name,
180
+ "postcode_prefix": postcode_prefix,
181
+ "inception_date": inception.isoformat(),
182
+ "loss_date": loss_date.isoformat(),
183
+ "report_date": report_date.isoformat(),
184
+ "settlement_date": "",
185
+ "claim_type": claim_type,
186
+ "claim_status": random.choice(["Open", "Settled", "Declined", "Reserved"]),
187
+ # Property
188
+ "property_type": property_type,
189
+ "property_age_years": property_age,
190
+ "rebuild_value": rebuild_value,
191
+ "contents_value": contents_value,
192
+ "heating_type": random.choice(HEATING_TYPES),
193
+ "flood_risk_zone": random.choice(FLOOD_RISK),
194
+ "subsidence_history": random.choice(SUBSIDENCE_RISK),
195
+ "alarm_installed": random.random() < 0.6,
196
+ "locks_bs3621": random.random() < 0.55,
197
+ # Financial
198
+ "premium": round(random.uniform(150, 2500), 2),
199
+ "voluntary_excess": random.choice([0, 100, 250, 500]),
200
+ "compulsory_excess": random.choice([100, 250]),
201
+ "reserve_amount": round(reserve, 2),
202
+ "claim_amount": round(claim_amount, 2),
203
+ "recovery_amount": 0.0,
204
+ # Risk
205
+ "previous_claims_3y": previous_claims_3y,
206
+ "days_to_report": days_to_report,
207
+ "policy_age_days": policy_age_days,
208
+ "has_cctv": has_cctv,
209
+ "loss_adjuster_appointed": loss_adjuster_appointed,
210
+ "unoccupied_30_days": random.random() < 0.05,
211
+ # Target
212
+ "is_fraud": is_fraud,
213
+ }
214
+
215
+
216
+ # ── Liability claim record ──
217
+
218
+ def gen_liability_claim(claim_id: int, is_fraud: bool = False) -> dict:
219
+ region_name, region = random.choice(list(UK_REGIONS.items()))
220
+ postcode_prefix = random.choice(region)
221
+
222
+ inception = fake.date_between(start_date="-3y", end_date="-30d")
223
+ loss_date = fake.date_between(start_date=inception, end_date="today")
224
+ report_date = loss_date + timedelta(days=random.randint(0, 30))
225
+
226
+ claim_type = random.choice(
227
+ CLAIM_TYPES["Employers' Liability"] + CLAIM_TYPES["Public Liability"]
228
+ + CLAIM_TYPES["Professional Indemnity"]
229
+ )
230
+ reserve = random.randint(5000, 250000)
231
+
232
+ if is_fraud:
233
+ claim_amount = reserve * random.uniform(2.0, 6.0)
234
+ days_to_report = random.randint(20, 60)
235
+ previous_claims_3y = random.randint(3, 8)
236
+ solicitor_involved = True
237
+ independent_witness = False
238
+ medical_evidence_delay_days = random.randint(30, 90)
239
+ else:
240
+ claim_amount = reserve * random.uniform(0.5, 1.5)
241
+ days_to_report = random.randint(0, 14)
242
+ previous_claims_3y = random.choices([0, 1, 2], weights=[65, 25, 10])[0]
243
+ solicitor_involved = random.random() < 0.45
244
+ independent_witness = random.random() < 0.5
245
+ medical_evidence_delay_days = random.randint(2, 21)
246
+
247
+ return {
248
+ "claim_id": f"LBL-{claim_id:06d}",
249
+ "lob": "Liability",
250
+ "insurer": random.choice(UK_INSURERS),
251
+ "region": region_name,
252
+ "postcode_prefix": postcode_prefix,
253
+ "inception_date": inception.isoformat(),
254
+ "loss_date": loss_date.isoformat(),
255
+ "report_date": report_date.isoformat(),
256
+ "settlement_date": "",
257
+ "claim_type": claim_type,
258
+ "claim_status": random.choice(["Open", "Settled", "Declined", "Reserved"]),
259
+ # Claimant
260
+ "claimant_age": random.randint(18, 80),
261
+ "claimant_gender": random.choice(["M", "F"]),
262
+ "injury_type": random.choice([
263
+ "Whiplash", "Back strain", "Fracture", "Soft tissue",
264
+ "Psychological", "Head injury", "Burns", "Multiple",
265
+ ]),
266
+ "injury_severity": random.choice(["Minor", "Moderate", "Serious", "Catastrophic"]),
267
+ # Financial
268
+ "reserve_amount": round(reserve, 2),
269
+ "claim_amount": round(claim_amount, 2),
270
+ "solicitor_costs": round(random.uniform(500, 15000), 2) if solicitor_involved else 0.0,
271
+ "medical_costs": round(random.uniform(200, 25000), 2),
272
+ # Risk
273
+ "previous_claims_3y": previous_claims_3y,
274
+ "days_to_report": days_to_report,
275
+ "solicitor_involved": solicitor_involved,
276
+ "independent_witness": independent_witness,
277
+ "medical_evidence_delay_days": medical_evidence_delay_days,
278
+ "cctv_available": random.random() < 0.2,
279
+ # Target
280
+ "is_fraud": is_fraud,
281
+ }
282
+
283
+
284
+ # ── Orchestrator ──
285
+
286
+ def generate_tabular_dataset(
287
+ n: int = 50000,
288
+ fraud_rate: float = 0.08,
289
+ output_dir: str = "data/output",
290
+ ):
291
+ """Generate mixed LoB tabular claims dataset with ~8% fraud rate."""
292
+ os.makedirs(output_dir, exist_ok=True)
293
+
294
+ n_fraud = int(n * fraud_rate)
295
+ n_genuine = n - n_fraud
296
+
297
+ # Split across LoB: 50% motor, 30% property, 20% liability
298
+ splits = {"motor": 0.50, "property": 0.30, "liability": 0.20}
299
+
300
+ motor_records = []
301
+ prop_records = []
302
+ liab_records = []
303
+
304
+ for lob, frac in splits.items():
305
+ total_lob = int(n * frac)
306
+ fraud_lob = int(n_fraud * frac)
307
+ genuine_lob = total_lob - fraud_lob
308
+
309
+ gen_fn = {"motor": gen_motor_claim, "property": gen_property_claim, "liability": gen_liability_claim}[lob]
310
+ target = {"motor": motor_records, "property": prop_records, "liability": liab_records}[lob]
311
+
312
+ for i in tqdm(range(genuine_lob), desc=f"{lob.capitalize()} β€” genuine"):
313
+ target.append(gen_fn(len(target) + 1, is_fraud=False))
314
+ for i in tqdm(range(fraud_lob), desc=f"{lob.capitalize()} β€” fraud"):
315
+ target.append(gen_fn(len(target) + 1, is_fraud=True))
316
+
317
+ # Write separate CSVs per LoB
318
+ for name, records in [("motor", motor_records), ("property", prop_records), ("liability", liab_records)]:
319
+ random.shuffle(records)
320
+ outpath = os.path.join(output_dir, f"claims_{name}_{len(records) // 1000}k.csv")  # e.g. claims_motor_25k.csv
321
+ if records:
322
+ with open(outpath, "w", newline="") as f:
323
+ writer = csv.DictWriter(f, fieldnames=records[0].keys())
324
+ writer.writeheader()
325
+ writer.writerows(records)
326
+ fraud_count = sum(1 for r in records if r["is_fraud"])
327
+ print(f"βœ“ {name}: {len(records)} records ({fraud_count} fraud, {fraud_count/len(records)*100:.1f}%) β†’ {outpath}")
328
+
329
+ # Also write combined JSONL for convenience
330
+ all_records = motor_records + prop_records + liab_records
331
+ random.shuffle(all_records)
332
+ combined_path = os.path.join(output_dir, f"claims_all_{len(all_records)}.jsonl")
333
+ with open(combined_path, "w") as f:
334
+ for rec in all_records:
335
+ f.write(json.dumps(rec, ensure_ascii=False, default=str) + "\n")
336
+ total_fraud = sum(1 for r in all_records if r["is_fraud"])
337
+ print(f"\nβœ“ Combined: {len(all_records)} records ({total_fraud} fraud, {total_fraud/len(all_records)*100:.1f}%) β†’ {combined_path}")
338
+
339
+ return combined_path
340
+
341
+
342
+ if __name__ == "__main__":
343
+ generate_tabular_dataset()
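
A quick sanity-check sketch for the combined output (assumes pandas is installed and the default output directory and n=50000, which yields claims_all_50000.jsonl):

```python
import pandas as pd

# Combined JSONL written by generate_tabular_dataset()
df = pd.read_json("data/output/claims_all_50000.jsonl", lines=True)

print(df["lob"].value_counts(normalize=True))     # ~50/30/20 motor/property/liability
print(f"fraud rate: {df['is_fraud'].mean():.1%}")  # ~8%

# Separation between fraud and genuine on the engineered signals —
# synthetic fraud is deliberately easy, so expect large gaps here.
print(df.groupby("is_fraud")[["days_to_report", "previous_claims_3y"]].mean())
```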
data/generate_all.py ADDED
@@ -0,0 +1,65 @@
1
+ """
2
+ InsureOS β€” Master Data Generation Orchestrator
3
+ Runs all synthetic data generators and produces the complete training dataset.
4
+ """
5
+
6
+ import time
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ # Ensure project root on path
11
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
12
+
13
+ from data.gen_sft import generate_sft_dataset
14
+ from data.gen_dpo import generate_dpo_dataset
15
+ from data.gen_tabular import generate_tabular_dataset
16
+ from data.gen_documents import generate_document_dataset
17
+ from data.gen_ner import generate_ner_dataset
18
+
19
+
20
+ OUTPUT_DIR = "data/output"
21
+
22
+
23
+ def main():
24
+ start = time.time()
25
+ print("=" * 60)
26
+ print(" InsureOS β€” Synthetic Data Generation Pipeline")
27
+ print("=" * 60)
28
+
29
+ # 1. SFT instruction-response pairs
30
+ print("\n[1/5] SFT Data (10K instruction-response pairs)")
31
+ generate_sft_dataset(n=10000, output_path=f"{OUTPUT_DIR}/insurance_sft_10k.jsonl")
32
+
33
+ # 2. DPO preference pairs
34
+ print("\n[2/5] DPO Data (5K preference pairs)")
35
+ generate_dpo_dataset(n=5000, output_path=f"{OUTPUT_DIR}/insurance_dpo_5k.jsonl")
36
+
37
+ # 3. Tabular claims data
38
+ print("\n[3/5] Tabular Claims Data (50K records)")
39
+ generate_tabular_dataset(n=50000, fraud_rate=0.08, output_dir=OUTPUT_DIR)
40
+
41
+ # 4. Document classification
42
+ print("\n[4/5] Document Classification Data (10K documents)")
43
+ generate_document_dataset(n=10000, output_path=f"{OUTPUT_DIR}/insurance_docs_10k.jsonl")
44
+
45
+ # 5. NER data
46
+ print("\n[5/5] NER Data (8K token-labelled examples)")
47
+ generate_ner_dataset(n=8000, output_path=f"{OUTPUT_DIR}/insurance_ner_8k.jsonl")
48
+
49
+ elapsed = time.time() - start
50
+ print("\n" + "=" * 60)
51
+ print(f" βœ“ All data generated in {elapsed:.1f}s")
52
+ print(f" Output directory: {OUTPUT_DIR}/")
53
+ print("=" * 60)
54
+
55
+ # List generated files
56
+ output_path = Path(OUTPUT_DIR)
57
+ if output_path.exists():
58
+ print("\nGenerated files:")
59
+ for f in sorted(output_path.iterdir()):
60
+ size_mb = f.stat().st_size / (1024 * 1024)
61
+ print(f" {f.name:50s} {size_mb:8.2f} MB")
62
+
63
+
64
+ if __name__ == "__main__":
65
+ main()
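
After a full run, a small sketch (file names assume the defaults above) to confirm each JSONL landed with the expected row count:

```python
from pathlib import Path

EXPECTED = {
    "insurance_sft_10k.jsonl": 10_000,
    "insurance_dpo_5k.jsonl": 5_000,
    "insurance_docs_10k.jsonl": 10_000,
    "insurance_ner_8k.jsonl": 8_000,
}

out = Path("data/output")
for name, expected in EXPECTED.items():
    path = out / name
    # One JSON object per line, so line count == record count
    actual = sum(1 for _ in path.open()) if path.exists() else 0
    status = "OK" if actual == expected else "MISMATCH"
    print(f"{status:8s} {name}: {actual} / {expected}")
```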
distill.py ADDED
@@ -0,0 +1,260 @@
1
+ """
2
+ InsureOS β€” Knowledge Distillation Script
3
+ Distils InsureLLM-8B (DPO-aligned teacher) β†’ InsureLLM-4B (Qwen3-4B student).
4
+ Uses KL-divergence + hard-label distillation for 16 GB VRAM.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import argparse
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch.utils.data import DataLoader
14
+ from datasets import Dataset
15
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
16
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
17
+ from tqdm import tqdm
18
+
19
+
20
+ # ── Defaults ──
21
+
22
+ TEACHER_MODEL = "models/insurellm-8b-dpo-merged"
23
+ STUDENT_MODEL = "Qwen/Qwen3-4B"
24
+ DATA_PATH = "data/output/insurance_sft_10k.jsonl"
25
+ OUTPUT_DIR = "models/insurellm-4b-distilled"
26
+ MAX_SEQ_LEN = 1024
27
+ LORA_R = 32
28
+ LORA_ALPHA = 64
29
+ TEMPERATURE = 3.0 # softens teacher logits
30
+ ALPHA_KL = 0.7 # weight of KL loss vs hard label loss
31
+ EPOCHS = 3
32
+ BATCH_SIZE = 2
33
+ GRAD_ACCUM = 8
34
+ LR = 1e-4
35
+ WARMUP_STEPS = 50
36
+ SAVE_STEPS = 200
37
+
38
+
39
+ def load_data(path: str, tokenizer, max_len: int) -> Dataset:
40
+ """Load and tokenize SFT data for distillation."""
41
+ records = []
42
+ with open(path) as f:
43
+ for line in f:
44
+ obj = json.loads(line)
45
+ # Apply chat template to get text
46
+ text = tokenizer.apply_chat_template(
47
+ obj["messages"],
48
+ tokenize=False,
49
+ add_generation_prompt=False,
50
+ )
51
+ records.append({"text": text})
52
+
53
+ ds = Dataset.from_list(records)
54
+
55
+ def tokenize_fn(examples):
56
+ return tokenizer(
57
+ examples["text"],
58
+ truncation=True,
59
+ max_length=max_len,
60
+ padding="max_length",
61
+ return_tensors="pt",
62
+ )
63
+
64
+ ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])
65
+ ds.set_format("torch")
66
+ return ds
67
+
68
+
69
+ def main():
70
+ parser = argparse.ArgumentParser(description="Distil InsureLLM-8B β†’ InsureLLM-4B")
71
+ parser.add_argument("--teacher-model", default=TEACHER_MODEL)
72
+ parser.add_argument("--student-model", default=STUDENT_MODEL)
73
+ parser.add_argument("--data-path", default=DATA_PATH)
74
+ parser.add_argument("--output-dir", default=OUTPUT_DIR)
75
+ parser.add_argument("--epochs", type=int, default=EPOCHS)
76
+ parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
77
+ parser.add_argument("--lr", type=float, default=LR)
78
+ parser.add_argument("--temperature", type=float, default=TEMPERATURE)
79
+ parser.add_argument("--alpha-kl", type=float, default=ALPHA_KL)
80
+ args = parser.parse_args()
81
+
82
+ print(f"{'='*60}")
83
+ print(f" InsureOS β€” Knowledge Distillation")
84
+ print(f" Teacher: {args.teacher_model}")
85
+ print(f" Student: {args.student_model}")
86
+ print(f" Temperature: {args.temperature}, Alpha: {args.alpha_kl}")
87
+ print(f"{'='*60}\n")
88
+
89
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
90
+
91
+ # ── 1. Load tokenizer (student's) ──
92
+ print("[1/5] Loading tokenizer...")
93
+ tokenizer = AutoTokenizer.from_pretrained(
94
+ args.student_model,
95
+ trust_remote_code=True,
96
+ padding_side="right",
97
+ )
98
+ if tokenizer.pad_token is None:
99
+ tokenizer.pad_token = tokenizer.eos_token
100
+
101
+ # ── 2. Load teacher (4-bit, frozen) ──
102
+ print("[2/5] Loading teacher model (4-bit, frozen)...")
103
+ bnb_config = BitsAndBytesConfig(
104
+ load_in_4bit=True,
105
+ bnb_4bit_quant_type="nf4",
106
+ bnb_4bit_compute_dtype=torch.bfloat16,
107
+ bnb_4bit_use_double_quant=True,
108
+ )
109
+
110
+ teacher = AutoModelForCausalLM.from_pretrained(
111
+ args.teacher_model,
112
+ quantization_config=bnb_config,
113
+ device_map="auto",
114
+ trust_remote_code=True,
115
+ torch_dtype=torch.bfloat16,
116
+ )
117
+ teacher.eval()
118
+ for p in teacher.parameters():
119
+ p.requires_grad = False
120
+
121
+ # ── 3. Load student (4-bit + LoRA for training) ──
122
+ print("[3/5] Loading student model with LoRA...")
123
+ student = AutoModelForCausalLM.from_pretrained(
124
+ args.student_model,
125
+ quantization_config=bnb_config,
126
+ device_map="auto",
127
+ trust_remote_code=True,
128
+ torch_dtype=torch.bfloat16,
129
+ )
130
+ student = prepare_model_for_kbit_training(student, use_gradient_checkpointing=True)
131
+
132
+ lora_config = LoraConfig(
133
+ r=LORA_R,
134
+ lora_alpha=LORA_ALPHA,
135
+ lora_dropout=0.05,
136
+ target_modules="all-linear",
137
+ task_type=TaskType.CAUSAL_LM,
138
+ bias="none",
139
+ )
140
+ student = get_peft_model(student, lora_config)
141
+ student.print_trainable_parameters()
142
+
143
+ # ── 4. Load data ──
144
+ print("[4/5] Loading and tokenizing data...")
145
+ dataset = load_data(args.data_path, tokenizer, MAX_SEQ_LEN)
146
+ dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
147
+ print(f" Examples: {len(dataset)}, Batches/epoch: {len(dataloader)}")
148
+
149
+ # ── 5. Distillation training loop ──
150
+ print("[5/5] Starting distillation...\n")
151
+
152
+ optimizer = torch.optim.AdamW(student.parameters(), lr=args.lr, weight_decay=0.01)
153
+
154
+ total_steps = len(dataloader) * args.epochs // GRAD_ACCUM
155
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)
156
+
157
+ global_step = 0
158
+ best_loss = float("inf")
159
+
160
+ for epoch in range(args.epochs):
161
+ student.train()
162
+ epoch_loss = 0.0
163
+ accum_loss = 0.0
164
+
165
+ pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{args.epochs}")
166
+ for step, batch in enumerate(pbar):
167
+ input_ids = batch["input_ids"].to(device)
168
+ attention_mask = batch["attention_mask"].to(device)
169
+
170
+ # Teacher forward (no grad)
171
+ with torch.no_grad():
172
+ teacher_outputs = teacher(
173
+ input_ids=input_ids,
174
+ attention_mask=attention_mask,
175
+ )
176
+ teacher_logits = teacher_outputs.logits
177
+
178
+ # Student forward
179
+ student_outputs = student(
180
+ input_ids=input_ids,
181
+ attention_mask=attention_mask,
182
+ labels=input_ids, # for hard label loss
183
+ )
184
+ student_logits = student_outputs.logits
185
+ hard_loss = student_outputs.loss
186
+
187
+ # KL divergence loss (soft labels)
188
+ T = args.temperature
189
+ teacher_log_probs = F.log_softmax(teacher_logits / T, dim=-1)
190
+ student_log_probs = F.log_softmax(student_logits / T, dim=-1)
191
+
192
+ # Only compute KL over non-padding tokens
193
+ mask = attention_mask.unsqueeze(-1).float()
194
+ kl_loss = F.kl_div(
195
+ student_log_probs * mask,
196
+ teacher_log_probs * mask,
197
+ log_target=True,
198
+ reduction="batchmean",
199
+ ) * (T ** 2)
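+ # Masked positions contribute zero to the KL sum: both log-prob tensors are
+ # zeroed there and, with log_target=True, exp(0) * (0 - 0) = 0. The T**2
+ # factor keeps soft-label gradients on the same scale as the hard-label loss
+ # (Hinton et al., 2015). Token-level KL also assumes teacher and student
+ # share a tokenizer/vocab — the code already tokenizes both with the
+ # student's tokenizer.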
200
+
201
+ # Combined loss
202
+ loss = args.alpha_kl * kl_loss + (1 - args.alpha_kl) * hard_loss
203
+ loss = loss / GRAD_ACCUM
204
+
205
+ loss.backward()
206
+ accum_loss += loss.item()
207
+
208
+ if (step + 1) % GRAD_ACCUM == 0:
209
+ torch.nn.utils.clip_grad_norm_(student.parameters(), 1.0)
210
+ optimizer.step()
211
+ scheduler.step()
212
+ optimizer.zero_grad()
213
+ global_step += 1
214
+
215
+ pbar.set_postfix({
216
+ "loss": f"{accum_loss:.4f}",
217
+ "kl": f"{kl_loss.item():.4f}",
218
+ "hard": f"{hard_loss.item():.4f}",
219
+ "lr": f"{scheduler.get_last_lr()[0]:.2e}",
220
+ })
221
+ epoch_loss += accum_loss
222
+ accum_loss = 0.0
223
+
224
+ # Save checkpoint
225
+ if global_step % SAVE_STEPS == 0:
226
+ ckpt_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
227
+ student.save_pretrained(ckpt_dir)
228
+ tokenizer.save_pretrained(ckpt_dir)
229
+ print(f"\n Checkpoint saved: {ckpt_dir}")
230
+
231
+ avg_loss = epoch_loss / max(1, len(dataloader) // GRAD_ACCUM)  # optimiser steps this epoch, not cumulative global_step
232
+ print(f"\nEpoch {epoch+1} β€” avg loss: {avg_loss:.4f}")
233
+
234
+ if avg_loss < best_loss:
235
+ best_loss = avg_loss
236
+ best_dir = os.path.join(args.output_dir, "best")
237
+ student.save_pretrained(best_dir)
238
+ tokenizer.save_pretrained(best_dir)
239
+ print(f" Best model saved: {best_dir}")
240
+
241
+ # ── Final save ──
242
+ print("\nSaving final distilled model...")
243
+ student.save_pretrained(args.output_dir)
244
+ tokenizer.save_pretrained(args.output_dir)
245
+
246
+ # Merge LoRA
247
+ merged_dir = f"{args.output_dir}-merged"
248
+ print(f"Merging LoRA β†’ {merged_dir}")
249
+ merged = student.merge_and_unload()
250
+ merged.save_pretrained(merged_dir)
251
+ tokenizer.save_pretrained(merged_dir)
252
+
253
+ print(f"\nβœ“ Distillation complete!")
254
+ print(f" Student (LoRA): {args.output_dir}")
255
+ print(f" Student (merged): {merged_dir}")
256
+ print(f" Best loss: {best_loss:.4f}")
257
+
258
+
259
+ if __name__ == "__main__":
260
+ main()
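
The loss above, extracted into a self-contained sketch for unit-testing the objective in isolation (function and variable names here are illustrative, not part of the script):

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits: torch.Tensor,
                      teacher_logits: torch.Tensor,
                      attention_mask: torch.Tensor,
                      hard_loss: torch.Tensor,
                      T: float = 3.0,
                      alpha_kl: float = 0.7) -> torch.Tensor:
    """Combined KL + hard-label loss, mirroring the training loop above."""
    mask = attention_mask.unsqueeze(-1).float()
    student_lp = F.log_softmax(student_logits / T, dim=-1) * mask
    teacher_lp = F.log_softmax(teacher_logits / T, dim=-1) * mask
    kl = F.kl_div(student_lp, teacher_lp, log_target=True, reduction="batchmean") * T**2
    return alpha_kl * kl + (1 - alpha_kl) * hard_loss

# Smoke test on random tensors: batch=2, seq=8, vocab=32
s = torch.randn(2, 8, 32, requires_grad=True)
t = torch.randn(2, 8, 32)
m = torch.ones(2, 8)
loss = distillation_loss(s, t, m, hard_loss=torch.tensor(1.0))
loss.backward()  # gradients flow to the student logits only
print(float(loss))
```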
doc_classifier.py ADDED
@@ -0,0 +1,181 @@
1
+ """
2
+ InsureOS β€” Document Classifier Training
3
+ Fine-tunes ModernBERT (or a fallback BERT-base) for 12-class insurance document classification.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import argparse
9
+ from pathlib import Path
10
+
11
+ import torch
12
+ import numpy as np
13
+ from datasets import Dataset
14
+ from transformers import (
15
+ AutoModelForSequenceClassification,
16
+ AutoTokenizer,
17
+ TrainingArguments,
18
+ Trainer,
19
+ )
20
+ from sklearn.metrics import accuracy_score, f1_score, classification_report
21
+
22
+ from data.constants import DOCUMENT_TYPES
23
+
24
+
25
+ # ── Defaults ──
26
+
27
+ # ModernBERT is preferred; fall back to bert-base if unavailable
28
+ MODEL_NAME = "answerdotai/ModernBERT-base"
29
+ FALLBACK_MODEL = "google-bert/bert-base-uncased"
30
+ DATA_PATH = "data/output/insurance_docs_10k.jsonl"
31
+ OUTPUT_DIR = "models/doc-classifier"
32
+ MAX_LEN = 512
33
+ EPOCHS = 5
34
+ BATCH_SIZE = 16
35
+ LR = 2e-5
36
+ WARMUP_RATIO = 0.1
37
+ EVAL_SPLIT = 0.1
38
+ LABELS = DOCUMENT_TYPES # 12 classes
39
+
40
+
41
+ def load_data(path: str) -> Dataset:
42
+ records = []
43
+ with open(path) as f:
44
+ for line in f:
45
+ obj = json.loads(line)
46
+ records.append({
47
+ "text": obj["text"],
48
+ "label": obj["label_id"],
49
+ })
50
+ return Dataset.from_list(records)
51
+
52
+
53
+ def compute_metrics(pred):
54
+ labels = pred.label_ids
55
+ preds = np.argmax(pred.predictions, axis=-1)
56
+ acc = accuracy_score(labels, preds)
57
+ f1_macro = f1_score(labels, preds, average="macro")
58
+ f1_weighted = f1_score(labels, preds, average="weighted")
59
+ return {"accuracy": acc, "f1_macro": f1_macro, "f1_weighted": f1_weighted}
60
+
61
+
62
+ def main():
63
+ parser = argparse.ArgumentParser(description="Train document classifier")
64
+ parser.add_argument("--model-name", default=MODEL_NAME)
65
+ parser.add_argument("--data-path", default=DATA_PATH)
66
+ parser.add_argument("--output-dir", default=OUTPUT_DIR)
67
+ parser.add_argument("--epochs", type=int, default=EPOCHS)
68
+ parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
69
+ parser.add_argument("--lr", type=float, default=LR)
70
+ args = parser.parse_args()
71
+
72
+ print(f"{'='*60}")
73
+ print(f" InsureOS β€” Document Classifier Training")
74
+ print(f" Model: {args.model_name}")
75
+ print(f" Classes: {len(LABELS)}")
76
+ print(f"{'='*60}\n")
77
+
78
+ # ── 1. Load tokenizer & model ──
79
+ print("[1/4] Loading model and tokenizer...")
80
+ try:
81
+ tokenizer = AutoTokenizer.from_pretrained(args.model_name)
82
+ model = AutoModelForSequenceClassification.from_pretrained(
83
+ args.model_name,
84
+ num_labels=len(LABELS),
85
+ id2label={i: l for i, l in enumerate(LABELS)},
86
+ label2id={l: i for i, l in enumerate(LABELS)},
87
+ )
88
+ except Exception:
89
+ print(f" ⚠ {args.model_name} unavailable, falling back to {FALLBACK_MODEL}")
90
+ tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL)
91
+ model = AutoModelForSequenceClassification.from_pretrained(
92
+ FALLBACK_MODEL,
93
+ num_labels=len(LABELS),
94
+ id2label={i: l for i, l in enumerate(LABELS)},
95
+ label2id={l: i for i, l in enumerate(LABELS)},
96
+ )
97
+
98
+ # ── 2. Load & tokenize data ──
99
+ print("[2/4] Loading data...")
100
+ dataset = load_data(args.data_path)
101
+ print(f" Total: {len(dataset)}")
102
+
103
+ def tokenize_fn(examples):
104
+ return tokenizer(
105
+ examples["text"],
106
+ truncation=True,
107
+ max_length=MAX_LEN,
108
+ padding="max_length",
109
+ )
110
+
111
+ dataset = dataset.map(tokenize_fn, batched=True)
112
+ dataset = dataset.class_encode_column("label")
113
+ split = dataset.train_test_split(test_size=EVAL_SPLIT, seed=42, stratify_by_column="label")
114
+ train_ds = split["train"]
115
+ eval_ds = split["test"]
116
+ print(f" Train: {len(train_ds)}, Eval: {len(eval_ds)}")
117
+
118
+ # ── 3. Training ──
119
+ print("[3/4] Training...")
120
+ training_args = TrainingArguments(
121
+ output_dir=args.output_dir,
122
+ num_train_epochs=args.epochs,
123
+ per_device_train_batch_size=args.batch_size,
124
+ per_device_eval_batch_size=args.batch_size * 2,
125
+ learning_rate=args.lr,
126
+ lr_scheduler_type="cosine",
127
+ warmup_ratio=WARMUP_RATIO,
128
+ weight_decay=0.01,
129
+ eval_strategy="epoch",
130
+ save_strategy="epoch",
131
+ save_total_limit=2,
132
+ load_best_model_at_end=True,
133
+ metric_for_best_model="f1_macro",
134
+ greater_is_better=True,
135
+ fp16=torch.cuda.is_available(),
136
+ report_to="none",
137
+ logging_steps=50,
138
+ )
139
+
140
+ trainer = Trainer(
141
+ model=model,
142
+ args=training_args,
143
+ train_dataset=train_ds,
144
+ eval_dataset=eval_ds,
145
+ compute_metrics=compute_metrics,
146
+ )
147
+
148
+ trainer.train()
149
+
150
+ # ── 4. Evaluate & save ──
151
+ print("[4/4] Final evaluation...")
152
+ results = trainer.evaluate()
153
+ print(f" Accuracy: {results['eval_accuracy']:.4f}")
154
+ print(f" F1 (macro): {results['eval_f1_macro']:.4f}")
155
+ print(f" F1 (weighted): {results['eval_f1_weighted']:.4f}")
156
+
157
+ # Detailed classification report
158
+ preds = trainer.predict(eval_ds)
159
+ y_pred = np.argmax(preds.predictions, axis=-1)
160
+ y_true = preds.label_ids
161
+ report = classification_report(y_true, y_pred, target_names=LABELS)
162
+ print(f"\n{report}")
163
+
164
+ # Save
165
+ trainer.save_model(args.output_dir)
166
+ tokenizer.save_pretrained(args.output_dir)
167
+
168
+ # Save label map and results
169
+ meta = {
170
+ "labels": LABELS,
171
+ "id2label": {i: l for i, l in enumerate(LABELS)},
172
+ "results": {k: float(v) for k, v in results.items()},
173
+ }
174
+ with open(os.path.join(args.output_dir, "training_meta.json"), "w") as f:
175
+ json.dump(meta, f, indent=2)
176
+
177
+ print(f"\nβœ“ Document classifier saved β†’ {args.output_dir}")
178
+
179
+
180
+ if __name__ == "__main__":
181
+ main()
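
A minimal inference sketch for the saved classifier (the sample text and the predicted label shown in the comment are illustrative; actual labels come from DOCUMENT_TYPES):

```python
from transformers import pipeline

# Load the fine-tuned classifier saved by the training script above
clf = pipeline("text-classification", model="models/doc-classifier")

sample = (
    "CLAIM FORM — Please provide details of the incident, the policy number, "
    "and any supporting documentation such as photographs or receipts."
)
print(clf(sample, truncation=True, max_length=512))
# e.g. [{'label': 'claim_form', 'score': 0.99}]
```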
dpo_train.py ADDED
@@ -0,0 +1,188 @@
1
+ """
2
+ InsureOS β€” DPO (Direct Preference Optimization) Training Script
3
+ Applies RLHF-style alignment using preference pairs: FCA-compliant (chosen) vs non-compliant (rejected).
4
+ Runs on the QLoRA-finetuned InsureLLM-4B checkpoint.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import argparse
10
+
11
+ import torch
12
+ from datasets import Dataset
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
14
+ from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, TaskType
15
+ from trl import DPOTrainer, DPOConfig
16
+
17
+
18
+ # ── Defaults ──
19
+
20
+ SFT_MODEL = "models/insurellm-4b-qlora-merged" # output of Phase 1
21
+ DATA_PATH = "data/output/insurance_dpo_5k.jsonl"
22
+ OUTPUT_DIR = "models/insurellm-4b-dpo"
23
+ MAX_SEQ_LEN = 512
24
+ MAX_PROMPT_LEN = 256
25
+ LORA_R = 32
26
+ LORA_ALPHA = 64
27
+ LORA_DROPOUT = 0.05
28
+ BETA = 0.1 # DPO beta — lower values apply a weaker KL penalty, letting the policy drift further from the reference
29
+ EPOCHS = 1
30
+ BATCH_SIZE = 1
31
+ GRAD_ACCUM = 16 # effective batch = 16
32
+ LR = 5e-5
33
+ WARMUP_RATIO = 0.1
34
+ LOGGING_STEPS = 10
35
+ SAVE_STEPS = 100
36
+ EVAL_SPLIT = 0.05
37
+
38
+
39
+ def load_dpo_data(path: str) -> Dataset:
40
+ """Load JSONL DPO data. Each record has prompt, chosen, rejected."""
41
+ records = []
42
+ with open(path) as f:
43
+ for line in f:
44
+ obj = json.loads(line)
45
+ records.append(obj)
46
+ return Dataset.from_list(records)
47
+
48
+
49
+ def main():
50
+ parser = argparse.ArgumentParser(description="DPO alignment for InsureLLM")
51
+ parser.add_argument("--sft-model", default=SFT_MODEL)
52
+ parser.add_argument("--data-path", default=DATA_PATH)
53
+ parser.add_argument("--output-dir", default=OUTPUT_DIR)
54
+ parser.add_argument("--epochs", type=int, default=EPOCHS)
55
+ parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
56
+ parser.add_argument("--lr", type=float, default=LR)
57
+ parser.add_argument("--beta", type=float, default=BETA)
58
+ parser.add_argument("--lora-r", type=int, default=LORA_R)
59
+ parser.add_argument("--resume-from-checkpoint", action="store_true")
60
+ args = parser.parse_args()
61
+
62
+ print(f"{'='*60}")
63
+ print(f" InsureOS β€” DPO Alignment Training")
64
+ print(f" SFT model: {args.sft_model}")
65
+ print(f" DPO beta: {args.beta}")
66
+ print(f" Data: {args.data_path}")
67
+ print(f"{'='*60}\n")
68
+
69
+ # ── 1. Tokenizer ──
70
+ print("[1/5] Loading tokenizer...")
71
+ tokenizer = AutoTokenizer.from_pretrained(
72
+ args.sft_model,
73
+ trust_remote_code=True,
74
+ padding_side="left", # DPO needs left padding
75
+ )
76
+ if tokenizer.pad_token is None:
77
+ tokenizer.pad_token = tokenizer.eos_token
78
+
79
+ # ── 2. Load model (4-bit for VRAM) ──
80
+ print("[2/5] Loading SFT model in 4-bit...")
81
+ bnb_config = BitsAndBytesConfig(
82
+ load_in_4bit=True,
83
+ bnb_4bit_quant_type="nf4",
84
+ bnb_4bit_compute_dtype=torch.bfloat16,
85
+ bnb_4bit_use_double_quant=True,
86
+ )
87
+
88
+ model = AutoModelForCausalLM.from_pretrained(
89
+ args.sft_model,
90
+ quantization_config=bnb_config,
91
+ device_map="auto",
92
+ trust_remote_code=True,
93
+ attn_implementation="sdpa",
94
+ dtype=torch.bfloat16,
95
+ )
96
+ model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
97
+
98
+ # Reference model (same base, frozen) β€” DPOTrainer can create this automatically,
99
+ # but with 4-bit we share the base and just use the adapter diff.
100
+ # DPOTrainer with peft_config will handle ref model internally.
101
+
102
+ # ── 3. Apply fresh LoRA for DPO ──
103
+ print("[3/5] Applying DPO LoRA adapters...")
104
+ lora_config = LoraConfig(
105
+ r=args.lora_r,
106
+ lora_alpha=LORA_ALPHA,
107
+ lora_dropout=LORA_DROPOUT,
108
+ target_modules="all-linear",
109
+ task_type=TaskType.CAUSAL_LM,
110
+ bias="none",
111
+ )
112
+
113
+ # ── 4. Load data ──
114
+ print("[4/5] Loading DPO preference data...")
115
+ dataset = load_dpo_data(args.data_path)
116
+ print(f" Total preference pairs: {len(dataset)}")
117
+
118
+ split = dataset.train_test_split(test_size=EVAL_SPLIT, seed=42)
119
+ train_ds = split["train"]
120
+ eval_ds = split["test"]
121
+ print(f" Train: {len(train_ds)}, Eval: {len(eval_ds)}")
122
+
123
+ # ── 5. Train with DPO ──
124
+ print("[5/5] Starting DPO training...")
125
+
126
+ dpo_config = DPOConfig(
127
+ output_dir=args.output_dir,
128
+ num_train_epochs=args.epochs,
129
+ per_device_train_batch_size=args.batch_size,
130
+ per_device_eval_batch_size=args.batch_size,
131
+ gradient_accumulation_steps=GRAD_ACCUM,
132
+ learning_rate=args.lr,
133
+ lr_scheduler_type="cosine",
134
+ warmup_ratio=WARMUP_RATIO,
135
+ weight_decay=0.01,
136
+ bf16=True,
137
+ beta=args.beta,
138
+ max_length=MAX_SEQ_LEN,
139
+ logging_steps=LOGGING_STEPS,
140
+ save_steps=SAVE_STEPS,
141
+ save_total_limit=2,
142
+ eval_strategy="steps",
143
+ eval_steps=SAVE_STEPS,
144
+ load_best_model_at_end=True,
145
+ gradient_checkpointing=True,
146
+ gradient_checkpointing_kwargs={"use_reentrant": False},
147
+ report_to="none",
148
+ )
149
+
150
+ trainer = DPOTrainer(
151
+ model=model,
152
+ args=dpo_config,
153
+ train_dataset=train_ds,
154
+ eval_dataset=eval_ds,
155
+ processing_class=tokenizer,
156
+ peft_config=lora_config,
157
+ )
158
+
159
+ if args.resume_from_checkpoint:
160
+ trainer.train(resume_from_checkpoint=True)
161
+ else:
162
+ trainer.train()
163
+
164
+ # ── Save ──
165
+ print("\nSaving DPO model...")
166
+ trainer.save_model(args.output_dir)
167
+ tokenizer.save_pretrained(args.output_dir)
168
+
169
+ # Merge
170
+ merged_dir = f"{args.output_dir}-merged"
171
+ print(f"Merging DPO LoRA β†’ {merged_dir}")
172
+ merged = trainer.model.merge_and_unload()
173
+ merged.save_pretrained(merged_dir)
174
+ tokenizer.save_pretrained(merged_dir)
175
+
176
+ print(f"\nβœ“ DPO training complete!")
177
+ print(f" DPO adapter: {args.output_dir}")
178
+ print(f" Merged model: {merged_dir}")
179
+ print(f"\n This model is now aligned to prefer:")
180
+ print(f" βœ“ FCA Consumer Duty compliant responses")
181
+ print(f" βœ“ Plain English over jargon")
182
+ print(f" βœ“ GDPR-safe data handling")
183
+ print(f" βœ“ Accurate claims/regulatory information")
184
+ print(f" βœ“ Fair pricing (no proxy discrimination)")
185
+
186
+
187
+ if __name__ == "__main__":
188
+ main()
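
`load_dpo_data` expects one JSON object per line with `prompt`, `chosen`, and `rejected` keys — a hand-written record of that shape for reference (contents illustrative):

```python
import json

record = {
    "prompt": "A customer asks why their claim payout was reduced. Draft a reply.",
    "chosen": (
        "Thank you for getting in touch. Your settlement was reduced because your "
        "policy carries a £250 excess, which is deducted from every claim. ..."
    ),
    "rejected": (
        "Per clause 4.2(b) the indemnity is net of the applicable excess. "
        "No further correspondence will be entered into."
    ),
}
print(json.dumps(record, ensure_ascii=False))  # one line of the DPO JSONL
```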
evaluation/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Evaluation package
evaluation/results/full_eval_report.json ADDED
@@ -0,0 +1,412 @@
{
  "insurellm": {
    "model": "models/insurellm-4b-realworld-merged",
    "domain_eval": [
      {
        "id": "fca_consumer_duty",
        "score": 0,
        "required_found": 0,
        "required_total": 5,
        "forbidden_found": 0,
        "latency_s": 69.76176810264587,
        "response_length": 111
      },
      {
        "id": "gdpr_data_protection",
        "score": 0,
        "required_found": 0,
        "required_total": 4,
        "forbidden_found": 0,
        "latency_s": 180.0589418411255,
        "response_length": 201
      },
      {
        "id": "claims_process",
        "score": 0.6,
        "required_found": 3,
        "required_total": 5,
        "forbidden_found": 0,
        "latency_s": 197.52413249015808,
        "response_length": 189
      },
      {
        "id": "fraud_indicators",
        "score": 0.25,
        "required_found": 1,
        "required_total": 4,
        "forbidden_found": 0,
        "latency_s": 196.2334017753601,
        "response_length": 181
      },
      {
        "id": "lloyds_market",
        "score": 0.2,
        "required_found": 1,
        "required_total": 5,
        "forbidden_found": 0,
        "latency_s": 121.65669202804565,
        "response_length": 146
      },
      {
        "id": "pricing_fairness",
        "score": 0.25,
        "required_found": 1,
        "required_total": 4,
        "forbidden_found": 0,
        "latency_s": 57.555458784103394,
        "response_length": 83
      },
      {
        "id": "subrogation",
        "score": 0.5,
        "required_found": 2,
        "required_total": 4,
        "forbidden_found": 0,
        "latency_s": 117.41489219665527,
        "response_length": 155
      },
      {
        "id": "renewal_transparency",
        "score": 0.2,
        "required_found": 1,
        "required_total": 5,
        "forbidden_found": 0,
        "latency_s": 161.38414025306702,
        "response_length": 181
      }
    ],
    "generation_metrics": {
      "rouge1": 0.3839015292749472,
      "rouge2": 0.10873793498858823,
      "rougeL": 0.19891142911031565
    },
    "summary": {
      "avg_domain_score": 0.25,
      "avg_latency_s": 137.6986784338951,
      "domain_pass_rate": 0.0
    }
  },
  "fraudnet": [
    {
      "lob": "Motor",
      "auc_roc": 1.0,
      "avg_precision": 1.0,
      "precision_fraud": 1.0,
      "recall_fraud": 1.0,
      "f1_fraud": 1.0,
      "n_train": 20000,
      "n_test": 5000,
      "n_fraud_train": 1600,
      "fraud_rate": 0.08,
      "features_used": [
        "driver_age",
        "years_driving",
        "years_ncd",
        "vehicle_year",
        "vehicle_value",
        "annual_mileage",
        "premium",
        "voluntary_excess",
        "compulsory_excess",
        "reserve_amount",
        "claim_amount",
        "recovery_amount",
        "previous_claims_3y",
        "days_to_report",
        "policy_age_days",
        "witnesses",
        "dashcam",
        "police_report",
        "claim_reserve_ratio",
        "claim_premium_ratio",
        "new_policy",
        "late_report",
        "vehicle_age"
      ],
      "top_features": {
        "claim_reserve_ratio": "0.48872024",
        "days_to_report": "0.43656266",
        "policy_age_days": "0.057298034",
        "previous_claims_3y": "0.014383504",
        "witnesses": "0.0020713843",
        "dashcam": "0.00096420496",
        "driver_age": "0.0",
        "years_driving": "0.0",
        "years_ncd": "0.0",
        "vehicle_year": "0.0"
      }
    },
    {
      "lob": "Property",
      "auc_roc": 1.0,
      "avg_precision": 1.0,
      "precision_fraud": 1.0,
      "recall_fraud": 1.0,
      "f1_fraud": 1.0,
      "n_train": 12000,
      "n_test": 3000,
      "n_fraud_train": 960,
      "fraud_rate": 0.08,
      "features_used": [
        "property_age_years",
        "rebuild_value",
        "contents_value",
        "premium",
        "voluntary_excess",
        "compulsory_excess",
        "reserve_amount",
        "claim_amount",
        "previous_claims_3y",
        "days_to_report",
        "policy_age_days",
        "has_cctv",
        "loss_adjuster_appointed",
        "unoccupied_30_days",
        "alarm_installed",
        "locks_bs3621",
        "claim_reserve_ratio",
        "claim_premium_ratio",
        "new_policy",
        "late_report"
      ],
      "top_features": {
        "days_to_report": "0.40884864",
        "policy_age_days": "0.37601757",
        "claim_reserve_ratio": "0.1996711",
        "previous_claims_3y": "0.014420756",
        "late_report": "0.0010419991",
        "property_age_years": "0.0",
        "rebuild_value": "0.0",
        "contents_value": "0.0",
        "premium": "0.0",
        "voluntary_excess": "0.0"
      }
    },
    {
      "lob": "Liability",
      "auc_roc": 1.0,
      "avg_precision": 1.0,
      "precision_fraud": 1.0,
      "recall_fraud": 1.0,
      "f1_fraud": 1.0,
      "n_train": 8000,
      "n_test": 2000,
      "n_fraud_train": 640,
      "fraud_rate": 0.08,
      "features_used": [
        "claimant_age",
        "reserve_amount",
        "claim_amount",
        "solicitor_costs",
        "medical_costs",
        "previous_claims_3y",
        "days_to_report",
        "solicitor_involved",
        "independent_witness",
        "medical_evidence_delay_days",
        "cctv_available",
        "claim_reserve_ratio",
        "late_report"
      ],
      "top_features": {
        "previous_claims_3y": "0.561369",
        "days_to_report": "0.43863094",
        "claimant_age": "0.0",
        "reserve_amount": "0.0",
        "claim_amount": "0.0",
        "solicitor_costs": "0.0",
        "medical_costs": "0.0",
        "solicitor_involved": "0.0",
        "independent_witness": "0.0",
        "medical_evidence_delay_days": "0.0"
      }
    }
  ],
  "pricing": {
    "glm": {
      "model": "TweedieGLM",
      "tweedie_power": 1.5,
      "mae": 12244.958454220796,
      "rmse": 17615.02268631013,
      "mape_pct": 198.8474585306464,
      "coefficients": {
        "driver_age": 0.0,
        "years_driving": 0.0,
        "years_ncd": 0.0,
        "vehicle_year": 0.0,
        "vehicle_value": 0.0,
        "annual_mileage": 0.0,
        "voluntary_excess": 0.0,
        "compulsory_excess": 0.0,
        "previous_claims_3y": 0.0,
        "policy_age_days": 0.0,
        "vehicle_age": 0.0,
        "driver_experience_ratio": 0.0,
        "ncd_ratio": 0.0,
        "vehicle_make_enc": 0.0,
        "fuel_type_enc": 0.0,
        "occupation_enc": 0.0,
        "region_enc": 0.0
      },
      "intercept": 9.967596757593236,
      "n_train": 20000,
      "n_test": 5000
    },
    "ebm": {
      "model": "EBM",
      "mae": 11131.778297959956,
      "rmse": 14787.148537325793,
      "mape_pct": 177.58336694602855,
      "n_train": 20000,
      "n_test": 5000,
      "top_features": {
        "previous_claims_3y": 3259.1140028713794,
        "policy_age_days": 2683.871584881652,
        "previous_claims_3y & policy_age_days": 1608.1250699587606,
        "region_enc": 221.31391899112393,
        "vehicle_make_enc": 173.4298978553976,
        "voluntary_excess & previous_claims_3y": 172.51716007254487,
        "annual_mileage": 171.50784229318242,
        "compulsory_excess": 165.02085743907992,
        "voluntary_excess": 163.32366251884218,
        "ncd_ratio": 152.51296273306403
      }
    }
  },
  "doc_classifier": {
    "labels": [
      "Policy Schedule",
      "Certificate of Insurance",
      "Claim Form",
      "Loss Adjuster Report",
      "Bordereaux \u2014 Premium",
      "Bordereaux \u2014 Claims",
      "Endorsement",
      "Renewal Notice",
      "Statement of Fact",
      "FNOL Report",
      "Subrogation Notice",
      "Policy Wording"
    ],
    "id2label": {
      "0": "Policy Schedule",
      "1": "Certificate of Insurance",
      "2": "Claim Form",
      "3": "Loss Adjuster Report",
      "4": "Bordereaux \u2014 Premium",
      "5": "Bordereaux \u2014 Claims",
      "6": "Endorsement",
      "7": "Renewal Notice",
      "8": "Statement of Fact",
      "9": "FNOL Report",
      "10": "Subrogation Notice",
      "11": "Policy Wording"
    },
    "results": {
      "eval_loss": 4.1706562114995904e-06,
      "eval_accuracy": 1.0,
      "eval_f1_macro": 1.0,
      "eval_f1_weighted": 1.0,
      "eval_runtime": 30.3435,
      "eval_samples_per_second": 32.956,
      "eval_steps_per_second": 2.076,
      "epoch": 5.0
    }
  },
  "ner": {
    "label_list": [
      "O",
      "B-CLAIM_NUMBER",
      "B-DATE",
      "B-INSURER",
      "B-LOB",
      "B-MGA",
      "B-MONEY",
      "B-ORG",
      "B-PERIL",
      "B-PERSON",
      "B-POLICY_NUMBER",
      "B-POSTCODE",
      "B-REGULATION",
      "B-SYNDICATE",
      "B-VEHICLE",
      "I-DATE",
      "I-INSURER",
      "I-LOB",
      "I-MGA",
      "I-ORG",
      "I-PERIL",
      "I-PERSON",
      "I-POSTCODE",
      "I-REGULATION",
      "I-SYNDICATE",
      "I-VEHICLE"
    ],
    "label2id": {
      "O": 0,
      "B-CLAIM_NUMBER": 1,
      "B-DATE": 2,
      "B-INSURER": 3,
      "B-LOB": 4,
      "B-MGA": 5,
      "B-MONEY": 6,
      "B-ORG": 7,
      "B-PERIL": 8,
      "B-PERSON": 9,
      "B-POLICY_NUMBER": 10,
      "B-POSTCODE": 11,
      "B-REGULATION": 12,
      "B-SYNDICATE": 13,
      "B-VEHICLE": 14,
      "I-DATE": 15,
      "I-INSURER": 16,
      "I-LOB": 17,
      "I-MGA": 18,
      "I-ORG": 19,
      "I-PERIL": 20,
      "I-PERSON": 21,
      "I-POSTCODE": 22,
      "I-REGULATION": 23,
      "I-SYNDICATE": 24,
      "I-VEHICLE": 25
    },
    "id2label": {
      "0": "O",
      "1": "B-CLAIM_NUMBER",
      "2": "B-DATE",
      "3": "B-INSURER",
      "4": "B-LOB",
      "5": "B-MGA",
      "6": "B-MONEY",
      "7": "B-ORG",
      "8": "B-PERIL",
      "9": "B-PERSON",
      "10": "B-POLICY_NUMBER",
      "11": "B-POSTCODE",
      "12": "B-REGULATION",
      "13": "B-SYNDICATE",
      "14": "B-VEHICLE",
      "15": "I-DATE",
      "16": "I-INSURER",
      "17": "I-LOB",
      "18": "I-MGA",
      "19": "I-ORG",
      "20": "I-PERIL",
      "21": "I-PERSON",
      "22": "I-POSTCODE",
      "23": "I-REGULATION",
      "24": "I-SYNDICATE",
      "25": "I-VEHICLE"
    },
    "results": {
      "eval_loss": 4.797985820914619e-05,
      "eval_f1": 1.0,
      "eval_precision": 1.0,
      "eval_recall": 1.0,
      "eval_runtime": 11.6416,
      "eval_samples_per_second": 68.719,
      "eval_steps_per_second": 2.147,
      "epoch": 8.0
    }
  }
}
evaluation/results/insurellm_eval.json ADDED
@@ -0,0 +1,87 @@
{
  "model": "models/insurellm-4b-realworld-merged",
  "domain_eval": [
    {
      "id": "fca_consumer_duty",
      "score": 0,
      "required_found": 0,
      "required_total": 5,
      "forbidden_found": 0,
      "latency_s": 69.76176810264587,
      "response_length": 111
    },
    {
      "id": "gdpr_data_protection",
      "score": 0,
      "required_found": 0,
      "required_total": 4,
      "forbidden_found": 0,
      "latency_s": 180.0589418411255,
      "response_length": 201
    },
    {
      "id": "claims_process",
      "score": 0.6,
      "required_found": 3,
      "required_total": 5,
      "forbidden_found": 0,
      "latency_s": 197.52413249015808,
      "response_length": 189
    },
    {
      "id": "fraud_indicators",
      "score": 0.25,
      "required_found": 1,
      "required_total": 4,
      "forbidden_found": 0,
      "latency_s": 196.2334017753601,
      "response_length": 181
    },
    {
      "id": "lloyds_market",
      "score": 0.2,
      "required_found": 1,
      "required_total": 5,
      "forbidden_found": 0,
      "latency_s": 121.65669202804565,
      "response_length": 146
    },
    {
      "id": "pricing_fairness",
      "score": 0.25,
      "required_found": 1,
      "required_total": 4,
      "forbidden_found": 0,
      "latency_s": 57.555458784103394,
      "response_length": 83
    },
    {
      "id": "subrogation",
      "score": 0.5,
      "required_found": 2,
      "required_total": 4,
      "forbidden_found": 0,
      "latency_s": 117.41489219665527,
      "response_length": 155
    },
    {
      "id": "renewal_transparency",
      "score": 0.2,
      "required_found": 1,
      "required_total": 5,
      "forbidden_found": 0,
      "latency_s": 161.38414025306702,
      "response_length": 181
    }
  ],
  "generation_metrics": {
    "rouge1": 0.3839015292749472,
    "rouge2": 0.10873793498858823,
    "rougeL": 0.19891142911031565
  },
  "summary": {
    "avg_domain_score": 0.25,
    "avg_latency_s": 137.6986784338951,
    "domain_pass_rate": 0.0
  }
}
evaluation/run_eval.py ADDED
@@ -0,0 +1,361 @@
"""
InsureOS — Comprehensive Evaluation Suite
Evaluates all trained models: InsureLLM (generative), FraudNet, PricingGLM, DocClassifier, NER.
"""

import os
import json
import argparse
import time
from pathlib import Path

import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Optional imports (gracefully degrade if not installed)
try:
    from rouge_score import rouge_scorer
    HAS_ROUGE = True
except ImportError:
    HAS_ROUGE = False

try:
    from bert_score import score as bert_score_fn
    HAS_BERTSCORE = True
except ImportError:
    HAS_BERTSCORE = False


# ── Defaults ──

INSURELLM_MODEL = "models/insurellm-4b-dpo-merged"
EVAL_DATA = "data/output/insurance_sft_10k.jsonl"
OUTPUT_DIR = "evaluation/results"
N_SAMPLES = 100  # number of samples for generative eval


# ── UK Insurance Domain-Specific Evaluation Prompts ──

DOMAIN_EVAL_PROMPTS = [
    {
        "id": "fca_consumer_duty",
        "prompt": "A customer's home insurance claim for escape of water has been open for 8 weeks with no update. They call asking what's happening. Draft a response.",
        "required_elements": ["apology", "timeline", "next steps", "complaint rights", "financial ombudsman"],
        "forbidden_elements": ["jargon", "indemnification", "proximate cause", "peril"],
    },
    {
        "id": "gdpr_data_protection",
        "prompt": "A third party calls asking about the status of a motor claim made against our policyholder. How should we respond?",
        "required_elements": ["data protection", "cannot share", "their own insurer", "GDPR"],
        "forbidden_elements": ["policy number", "policyholder address", "bank details"],
    },
    {
        "id": "claims_process",
        "prompt": "Explain the insurance claims process for a domestic property fire from FNOL through to settlement.",
        "required_elements": ["FNOL", "loss adjuster", "investigation", "settlement", "excess"],
        "forbidden_elements": [],
    },
    {
        "id": "fraud_indicators",
        "prompt": "What are the key red flags for insurance fraud in motor third-party claims?",
        "required_elements": ["exaggeration", "staged", "late reporting", "multiple claims"],
        "forbidden_elements": [],
    },
    {
        "id": "lloyds_market",
        "prompt": "Explain the role of an MGA in the Lloyd's market and how binding authority agreements work.",
        "required_elements": ["binding authority", "capacity provider", "syndicate", "delegated authority", "bordereaux"],
        "forbidden_elements": [],
    },
    {
        "id": "pricing_fairness",
        "prompt": "An insurer wants to use first names as a rating factor because it improves their model by 3%. Should they?",
        "required_elements": ["proxy discrimination", "protected characteristics", "Equality Act", "FCA"],
        "forbidden_elements": [],
    },
    {
        "id": "subrogation",
        "prompt": "Explain subrogation rights in UK insurance. When does an insurer pursue recovery?",
        "required_elements": ["recovery", "third party", "policyholder indemnified", "non-fault"],
        "forbidden_elements": [],
    },
    {
        "id": "renewal_transparency",
        "prompt": "A customer's premium increased by 25% at renewal. They want to know why. Draft an explanation.",
        "required_elements": ["transparency", "factors", "shop around", "Consumer Duty", "fair value"],
        "forbidden_elements": ["take it or leave it", "market rate"],
    },
]


def evaluate_insurellm(model_path: str, n_samples: int, output_dir: str) -> dict:
    """Evaluate the generative InsureLLM model."""
    print(f"\n{'='*60}")
    print(f" Evaluating InsureLLM: {model_path}")
    print(f"{'='*60}")

    # Load model
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="sdpa",
        dtype=torch.bfloat16,
    )
    model.eval()

    results = {
        "model": model_path,
        "domain_eval": [],
        "generation_metrics": {},
    }

    # ── 1. Domain-Specific Evaluation ──
    print("\n[1/3] Domain-specific evaluation...")
    for i, item in enumerate(DOMAIN_EVAL_PROMPTS):
        print(f" Prompt {i+1}/{len(DOMAIN_EVAL_PROMPTS)}: {item['id']}...", flush=True)
        messages = [
            {"role": "system", "content": "You are InsureLLM, a specialist UK insurance AI assistant. Answer directly without internal reasoning."},
            {"role": "user", "content": item["prompt"]},
        ]

        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Disable Qwen3 thinking mode by appending <think>\n</think>\n
        if "<|im_start|>assistant" in text:
            text = text + "<think>\n</think>\n"
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        start = time.time()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=False,
            )
        latency = time.time() - start

        response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

        # Check required elements
        response_lower = response.lower()
        found_required = [e for e in item["required_elements"] if e.lower() in response_lower]
        found_forbidden = [e for e in item["forbidden_elements"] if e.lower() in response_lower]

        score = len(found_required) / max(len(item["required_elements"]), 1)
        penalty = len(found_forbidden) * 0.15
        final_score = max(0, score - penalty)
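        # Worked example: 3 of 5 required elements found plus 1 forbidden term
        # gives score = 3/5 = 0.6, penalty = 0.15, final_score = 0.45.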

        eval_result = {
            "id": item["id"],
            "score": final_score,
            "required_found": len(found_required),
            "required_total": len(item["required_elements"]),
            "forbidden_found": len(found_forbidden),
            "latency_s": latency,
            "response_length": len(response.split()),
        }
        results["domain_eval"].append(eval_result)

        status = "✓" if final_score >= 0.7 else "△" if final_score >= 0.4 else "✗"
        print(f" {status} {item['id']}: {final_score:.2f} "
              f"({len(found_required)}/{len(item['required_elements'])} required, "
              f"{len(found_forbidden)} forbidden, {latency:.1f}s)")

    avg_domain = np.mean([r["score"] for r in results["domain_eval"]])
    avg_latency = np.mean([r["latency_s"] for r in results["domain_eval"]])
    print(f"\n Average domain score: {avg_domain:.3f}")
    print(f" Average latency: {avg_latency:.1f}s")

    # ── 2. ROUGE scores on held-out SFT data ──
    if HAS_ROUGE and os.path.exists(EVAL_DATA):
        print("\n[2/3] ROUGE evaluation on SFT test set...")
        scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

        eval_records = []
        with open(EVAL_DATA) as f:
            for line in f:
                eval_records.append(json.loads(line))

        # Use last N as eval
        eval_subset = eval_records[-min(n_samples, len(eval_records)):]

        rouge1_scores = []
        rouge2_scores = []
        rougeL_scores = []

        for rec in eval_subset:
            messages = rec["messages"]
            # Get reference (last assistant message)
            reference = messages[-1]["content"]
            prompt_messages = messages[:-1]

            text = tokenizer.apply_chat_template(prompt_messages, tokenize=False, add_generation_prompt=True)
            # Disable Qwen3 thinking mode
            if "<|im_start|>assistant" in text:
                text = text + "<think>\n</think>\n"
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024).to(model.device)

            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)

            generated = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

            scores = scorer.score(reference, generated)
            rouge1_scores.append(scores["rouge1"].fmeasure)
            rouge2_scores.append(scores["rouge2"].fmeasure)
            rougeL_scores.append(scores["rougeL"].fmeasure)

        results["generation_metrics"]["rouge1"] = float(np.mean(rouge1_scores))
        results["generation_metrics"]["rouge2"] = float(np.mean(rouge2_scores))
        results["generation_metrics"]["rougeL"] = float(np.mean(rougeL_scores))

        print(f" ROUGE-1: {results['generation_metrics']['rouge1']:.4f}")
        print(f" ROUGE-2: {results['generation_metrics']['rouge2']:.4f}")
        print(f" ROUGE-L: {results['generation_metrics']['rougeL']:.4f}")
    else:
        print("\n[2/3] Skipping ROUGE (rouge_score not installed or data not found)")

    # ── 3. Summary metrics ──
    print("\n[3/3] Computing summary...")
    results["summary"] = {
        "avg_domain_score": float(avg_domain),
        "avg_latency_s": float(avg_latency),
        "domain_pass_rate": float(np.mean([1 if r["score"] >= 0.7 else 0 for r in results["domain_eval"]])),
    }

    # Save
    os.makedirs(output_dir, exist_ok=True)
    outpath = os.path.join(output_dir, "insurellm_eval.json")
    with open(outpath, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n✓ InsureLLM eval results → {outpath}")

    return results


def evaluate_all(args):
    """Run evaluation for all available models."""
    print(f"{'='*60}")
    print(f" InsureOS — Full Evaluation Suite")
    print(f"{'='*60}")

    os.makedirs(args.output_dir, exist_ok=True)
    all_results = {}

    # 1. InsureLLM
    if os.path.exists(args.insurellm_model):
        all_results["insurellm"] = evaluate_insurellm(
            args.insurellm_model, args.n_samples, args.output_dir
        )
    else:
        print(f"\n⚠ InsureLLM not found at {args.insurellm_model}, skipping")

    # 2. FraudNet — just check if results exist from training
    fraud_results = Path("models/fraudnet/training_results.json")
    if fraud_results.exists():
        with open(fraud_results) as f:
            all_results["fraudnet"] = json.load(f)
        print(f"\n✓ FraudNet results loaded from training")
    else:
        print(f"\n⚠ FraudNet results not found, skipping")

    # 3. Pricing GLM
    pricing_results = Path("models/pricing-glm/training_results.json")
    if pricing_results.exists():
        with open(pricing_results) as f:
            all_results["pricing"] = json.load(f)
        print(f"✓ Pricing model results loaded from training")
    else:
        print(f"⚠ Pricing results not found, skipping")

    # 4. Doc Classifier
    doc_meta = Path("models/doc-classifier/training_meta.json")
    if doc_meta.exists():
        with open(doc_meta) as f:
            all_results["doc_classifier"] = json.load(f)
        print(f"✓ Doc classifier results loaded")
    else:
        print(f"⚠ Doc classifier results not found, skipping")

    # 5. NER
    ner_meta = Path("models/ner-model/training_meta.json")
    if ner_meta.exists():
        with open(ner_meta) as f:
            all_results["ner"] = json.load(f)
        print(f"✓ NER results loaded")
    else:
        print(f"⚠ NER results not found, skipping")

    # ── Summary report ──
    report_path = os.path.join(args.output_dir, "full_eval_report.json")
    with open(report_path, "w") as f:
        json.dump(all_results, f, indent=2, default=str)

    print(f"\n{'='*60}")
    print(f" EVALUATION SUMMARY")
    print(f"{'='*60}")

    if "insurellm" in all_results:
        s = all_results["insurellm"].get("summary", {})
        print(f"\n InsureLLM (Generative):")
        print(f"   Domain score: {s.get('avg_domain_score', 'N/A')}")
        print(f"   Pass rate: {s.get('domain_pass_rate', 'N/A')}")
        print(f"   Latency: {s.get('avg_latency_s', 'N/A')}s")

    if "fraudnet" in all_results:
        for r in all_results["fraudnet"]:
            if isinstance(r, dict):
                print(f"\n FraudNet ({r.get('lob', '?')}):")
                print(f"   AUC-ROC: {r.get('auc_roc', 'N/A')}")
                print(f"   Avg Precision: {r.get('avg_precision', 'N/A')}")

    if "pricing" in all_results:
        for model_type in ["glm", "ebm"]:
            if model_type in all_results["pricing"]:
                m = all_results["pricing"][model_type]
                print(f"\n Pricing {model_type.upper()}:")
                print(f"   MAE: £{m.get('mae', 'N/A')}")
                print(f"   RMSE: £{m.get('rmse', 'N/A')}")

    if "doc_classifier" in all_results:
        r = all_results["doc_classifier"].get("results", {})
        print(f"\n Document Classifier:")
        print(f"   Accuracy: {r.get('eval_accuracy', 'N/A')}")
        print(f"   F1 (macro): {r.get('eval_f1_macro', 'N/A')}")

    if "ner" in all_results:
        r = all_results["ner"].get("results", {})
        print(f"\n NER Model:")
        print(f"   F1: {r.get('eval_f1', 'N/A')}")
        print(f"   Precision: {r.get('eval_precision', 'N/A')}")
        print(f"   Recall: {r.get('eval_recall', 'N/A')}")

    print(f"\n Full report → {report_path}")


def main():
    parser = argparse.ArgumentParser(description="InsureOS evaluation suite")
    parser.add_argument("--insurellm-model", default=INSURELLM_MODEL)
    parser.add_argument("--n-samples", type=int, default=N_SAMPLES)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    args = parser.parse_args()

    evaluate_all(args)


if __name__ == "__main__":
    main()
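
# Example invocation (paths are the repo defaults; adjust to your checkout):
#   python evaluation/run_eval.py --insurellm-model models/insurellm-4b-realworld-merged --n-samples 50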
fraud_model.py ADDED
@@ -0,0 +1,320 @@
"""
InsureOS — Fraud Detection Model
Hybrid approach: XGBoost + Isolation Forest ensemble on tabular claims data.
Trains separate models per LoB (Motor, Property, Liability).
"""

import os
import json
import argparse
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    classification_report, roc_auc_score, precision_recall_curve,
    average_precision_score, confusion_matrix,
)
from sklearn.ensemble import IsolationForest
import xgboost as xgb


# ── Defaults ──

DATA_DIR = "data/output"
OUTPUT_DIR = "models/fraudnet"
TEST_SIZE = 0.2
N_FOLDS = 5
RANDOM_STATE = 42

# XGBoost hyperparams (tuned for imbalanced fraud data)
XGB_PARAMS = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 5,
    "gamma": 1,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "tree_method": "hist",
    "device": "cuda",
    "n_estimators": 500,
    "early_stopping_rounds": 30,
}
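
# Note: eval_metric "aucpr" (area under the precision-recall curve) is a more
# informative early-stopping signal than accuracy when fraud is only ~8% of claims.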

# Feature groups per LoB
MOTOR_FEATURES = [
    "driver_age", "years_driving", "years_ncd", "vehicle_year", "vehicle_value",
    "annual_mileage", "premium", "voluntary_excess", "compulsory_excess",
    "reserve_amount", "claim_amount", "recovery_amount",
    "previous_claims_3y", "days_to_report", "policy_age_days",
    "witnesses", "dashcam", "police_report",
]

PROPERTY_FEATURES = [
    "property_age_years", "rebuild_value", "contents_value",
    "premium", "voluntary_excess", "compulsory_excess",
    "reserve_amount", "claim_amount",
    "previous_claims_3y", "days_to_report", "policy_age_days",
    "has_cctv", "loss_adjuster_appointed", "unoccupied_30_days",
    "alarm_installed", "locks_bs3621",
]

LIABILITY_FEATURES = [
    "claimant_age", "reserve_amount", "claim_amount",
    "solicitor_costs", "medical_costs",
    "previous_claims_3y", "days_to_report",
    "solicitor_involved", "independent_witness",
    "medical_evidence_delay_days", "cctv_available",
]


def load_csv(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # Convert booleans
    bool_cols = df.select_dtypes(include=["object"]).columns
    for col in bool_cols:
        if set(df[col].dropna().unique()).issubset({"True", "False", True, False}):
            df[col] = df[col].map({"True": 1, "False": 0, True: 1, False: 0}).fillna(0).astype(int)
    return df


def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add derived fraud-signal features."""
    df = df.copy()

    # Claim-to-reserve ratio (key fraud signal)
    if "reserve_amount" in df.columns and "claim_amount" in df.columns:
        df["claim_reserve_ratio"] = df["claim_amount"] / df["reserve_amount"].clip(lower=1)

    # Claim-to-premium ratio
    if "premium" in df.columns and "claim_amount" in df.columns:
        df["claim_premium_ratio"] = df["claim_amount"] / df["premium"].clip(lower=1)

    # Policy age bucket (new policies = higher fraud risk)
    if "policy_age_days" in df.columns:
        df["new_policy"] = (df["policy_age_days"] < 90).astype(int)

    # Late reporting flag
    if "days_to_report" in df.columns:
        df["late_report"] = (df["days_to_report"] > 14).astype(int)

    # Vehicle age
    if "vehicle_year" in df.columns:
        df["vehicle_age"] = 2025 - df["vehicle_year"]

    return df


def train_xgb_model(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    feature_names: list,
    lob: str,
    output_dir: str,
) -> dict:
    """Train XGBoost model for a specific LoB."""
    # Handle class imbalance via scale_pos_weight
    n_neg = (y_train == 0).sum()
    n_pos = (y_train == 1).sum()
    scale_pos_weight = n_neg / max(n_pos, 1)
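    # Worked example: at the 8% fraud rate here, 20,000 training rows give
    # 18,400 negatives and 1,600 positives, so scale_pos_weight = 11.5.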

    params = {**XGB_PARAMS, "scale_pos_weight": scale_pos_weight}
    n_estimators = params.pop("n_estimators")
    early_stopping = params.pop("early_stopping_rounds")

    model = xgb.XGBClassifier(
        n_estimators=n_estimators,
        early_stopping_rounds=early_stopping,
        **params,
    )

    # Use all available features from the feature list that exist in X
    avail_features = [f for f in feature_names if f in X_train.columns]
    X_tr = X_train[avail_features].fillna(0)
    X_te = X_test[avail_features].fillna(0)

    model.fit(
        X_tr, y_train,
        eval_set=[(X_te, y_test)],
        verbose=50,
    )

    # Predictions
    y_pred_proba = model.predict_proba(X_te)[:, 1]
    y_pred = (y_pred_proba >= 0.5).astype(int)

    # Metrics
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    avg_prec = average_precision_score(y_test, y_pred_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    results = {
        "lob": lob,
        "auc_roc": auc_roc,
        "avg_precision": avg_prec,
        "precision_fraud": report.get("1", {}).get("precision", 0),
        "recall_fraud": report.get("1", {}).get("recall", 0),
        "f1_fraud": report.get("1", {}).get("f1-score", 0),
        "n_train": len(y_train),
        "n_test": len(y_test),
        "n_fraud_train": int(y_train.sum()),
        "fraud_rate": float(y_train.mean()),
        "features_used": avail_features,
    }

    # Feature importance
    importance = dict(zip(avail_features, model.feature_importances_))
    results["top_features"] = dict(sorted(importance.items(), key=lambda x: x[1], reverse=True)[:10])

    # Save
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, f"xgb_{lob.lower()}.json")
    model.save_model(model_path)
    print(f"\n ✓ {lob} XGBoost saved → {model_path}")
    print(f"   AUC-ROC: {auc_roc:.4f}, Avg Precision: {avg_prec:.4f}")
    print(f"   Fraud precision: {results['precision_fraud']:.3f}, recall: {results['recall_fraud']:.3f}")

    return results


def train_isolation_forest(
    X_train: pd.DataFrame,
    feature_names: list,
    lob: str,
    output_dir: str,
) -> str:
    """Train Isolation Forest for anomaly scoring (unsupervised complement)."""
    avail_features = [f for f in feature_names if f in X_train.columns]
    X = X_train[avail_features].fillna(0)

    iforest = IsolationForest(
        n_estimators=200,
        contamination=0.08,
        max_samples="auto",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
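    # contamination=0.08 mirrors the known fraud rate in the synthetic data, so
    # roughly the most anomalous 8% of claims are scored as outliers.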
    iforest.fit(X)

    model_path = os.path.join(output_dir, f"iforest_{lob.lower()}.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(iforest, f)

    print(f" ✓ {lob} Isolation Forest saved → {model_path}")
    return model_path


def main():
    parser = argparse.ArgumentParser(description="Train FraudNet models")
    parser.add_argument("--data-dir", default=DATA_DIR)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    args = parser.parse_args()

    print(f"{'='*60}")
    print(f" InsureOS — FraudNet Training")
    print(f" Data: {args.data_dir}")
    print(f"{'='*60}\n")

    os.makedirs(args.output_dir, exist_ok=True)
    all_results = []

    # ── Motor ──
    motor_files = list(Path(args.data_dir).glob("claims_motor_*.csv"))
    if motor_files:
        print("=" * 40 + " MOTOR " + "=" * 40)
        df = load_csv(str(motor_files[0]))
        df = engineer_features(df)
        features = MOTOR_FEATURES + ["claim_reserve_ratio", "claim_premium_ratio", "new_policy", "late_report", "vehicle_age"]

        X = df.drop(columns=["is_fraud", "claim_id", "lob", "insurer", "region",
                             "claim_type", "claim_status", "vehicle_make", "vehicle_model",
                             "fuel_type", "occupation", "driver_gender",
                             "postcode_prefix", "inception_date", "loss_date",
                             "report_date", "settlement_date", "time_of_loss"],
                    errors="ignore")
        y = df["is_fraud"].astype(int)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
        )

        result = train_xgb_model(X_train, y_train, X_test, y_test, features, "Motor", args.output_dir)
        all_results.append(result)
        train_isolation_forest(X_train, features, "Motor", args.output_dir)

    # ── Property ──
    prop_files = list(Path(args.data_dir).glob("claims_property_*.csv"))
    if prop_files:
        print("\n" + "=" * 38 + " PROPERTY " + "=" * 38)
        df = load_csv(str(prop_files[0]))
        df = engineer_features(df)
        features = PROPERTY_FEATURES + ["claim_reserve_ratio", "claim_premium_ratio", "new_policy", "late_report"]

        X = df.drop(columns=["is_fraud", "claim_id", "lob", "insurer", "region",
                             "claim_type", "claim_status", "property_type",
                             "heating_type", "flood_risk_zone", "subsidence_history",
                             "postcode_prefix", "inception_date", "loss_date",
                             "report_date", "settlement_date"],
                    errors="ignore")
        y = df["is_fraud"].astype(int)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
        )

        result = train_xgb_model(X_train, y_train, X_test, y_test, features, "Property", args.output_dir)
        all_results.append(result)
        train_isolation_forest(X_train, features, "Property", args.output_dir)

    # ── Liability ──
    liab_files = list(Path(args.data_dir).glob("claims_liability_*.csv"))
    if liab_files:
        print("\n" + "=" * 37 + " LIABILITY " + "=" * 37)
        df = load_csv(str(liab_files[0]))
        df = engineer_features(df)
        features = LIABILITY_FEATURES + ["claim_reserve_ratio", "late_report"]

        X = df.drop(columns=["is_fraud", "claim_id", "lob", "insurer", "region",
                             "claim_type", "claim_status", "claimant_gender",
                             "injury_type", "injury_severity",
                             "postcode_prefix", "inception_date", "loss_date",
                             "report_date", "settlement_date"],
                    errors="ignore")
        y = df["is_fraud"].astype(int)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
        )

        result = train_xgb_model(X_train, y_train, X_test, y_test, features, "Liability", args.output_dir)
        all_results.append(result)
        train_isolation_forest(X_train, features, "Liability", args.output_dir)

    # ── Save results summary ──
    summary_path = os.path.join(args.output_dir, "training_results.json")
    with open(summary_path, "w") as f:
        json.dump(all_results, f, indent=2, default=str)

    print(f"\n{'='*60}")
    print(f" ✓ FraudNet training complete!")
    print(f"{'='*60}")
    for r in all_results:
        print(f"\n {r['lob']}:")
        print(f"   AUC-ROC: {r['auc_roc']:.4f}")
        print(f"   Avg Precision: {r['avg_precision']:.4f}")
        print(f"   Fraud F1: {r['f1_fraud']:.3f}")
        print(f"   Top features: {list(r['top_features'].keys())[:5]}")
    print(f"\n Results → {summary_path}")


if __name__ == "__main__":
    main()
ner_model.py ADDED
@@ -0,0 +1,254 @@
"""
InsureOS — Insurance NER Model Training
Fine-tunes ModernBERT (or fallback BERT-base) for token-level Named Entity Recognition
with 14 insurance-specific entity types in IOB2 format.
"""

import os
import json
import argparse
from pathlib import Path

import torch
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from seqeval.metrics import (
    classification_report as seq_classification_report,
    f1_score as seq_f1_score,
    precision_score as seq_precision_score,
    recall_score as seq_recall_score,
)


# ── Defaults ──

MODEL_NAME = "answerdotai/ModernBERT-base"
FALLBACK_MODEL = "google-bert/bert-base-uncased"
DATA_PATH = "data/output/insurance_ner_8k.jsonl"
OUTPUT_DIR = "models/ner-model"
MAX_LEN = 256
EPOCHS = 8
BATCH_SIZE = 16
LR = 3e-5
WARMUP_RATIO = 0.1
EVAL_SPLIT = 0.1


def load_data(path: str):
    """Load NER JSONL and build label set."""
    records = []
    all_tags = set()
    with open(path) as f:
        for line in f:
            obj = json.loads(line)
            records.append(obj)
            all_tags.update(obj["ner_tags"])

    # Build label list: O first, then B-/I- sorted
    entity_tags = sorted(t for t in all_tags if t != "O")
    label_list = ["O"] + entity_tags
    label2id = {l: i for i, l in enumerate(label_list)}
    id2label = {i: l for i, l in enumerate(label_list)}

    return records, label_list, label2id, id2label


def tokenize_and_align(examples, tokenizer, label2id, max_len):
    """Tokenize and align NER labels with subword tokens."""
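    # Worked example (made-up tokens): ["Aviva", "paid", "PL-123456"] tagged
    # [B-INSURER, O, B-POLICY_NUMBER]; if "PL-123456" splits into three subwords,
    # its aligned labels become [B-POLICY_NUMBER, I-POLICY_NUMBER, I-POLICY_NUMBER],
    # while special tokens get -100 so the loss ignores them.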
    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=max_len,
        padding="max_length",
    )

    aligned_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label2id.get(labels[word_idx], 0))
            else:
                # For subword tokens, use I- tag if the original is B-
                orig_label = labels[word_idx]
                if orig_label.startswith("B-"):
                    label_ids.append(label2id.get(orig_label.replace("B-", "I-"), 0))
                else:
                    label_ids.append(label2id.get(orig_label, 0))
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)

    tokenized["labels"] = aligned_labels
    return tokenized


def main():
    parser = argparse.ArgumentParser(description="Train insurance NER model")
    parser.add_argument("--model-name", default=MODEL_NAME)
    parser.add_argument("--data-path", default=DATA_PATH)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    parser.add_argument("--epochs", type=int, default=EPOCHS)
    parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
    parser.add_argument("--lr", type=float, default=LR)
    args = parser.parse_args()

    # Load data
    print(f"{'='*60}")
    print(f" InsureOS — NER Model Training")
    print(f"{'='*60}\n")

    print("[1/5] Loading NER data...")
    records, label_list, label2id, id2label = load_data(args.data_path)
    print(f" Examples: {len(records)}")
    print(f" Labels: {len(label_list)} ({len(label_list)-1} entity types)")

    # Load tokenizer & model
    print("[2/5] Loading model...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name)
        model = AutoModelForTokenClassification.from_pretrained(
            args.model_name,
            num_labels=len(label_list),
            id2label=id2label,
            label2id=label2id,
        )
    except Exception:
        print(f" ⚠ Falling back to {FALLBACK_MODEL}")
        tokenizer = AutoTokenizer.from_pretrained(FALLBACK_MODEL)
        model = AutoModelForTokenClassification.from_pretrained(
            FALLBACK_MODEL,
            num_labels=len(label_list),
            id2label=id2label,
            label2id=label2id,
        )

    # Prepare dataset
    print("[3/5] Tokenizing and aligning labels...")
    ds = Dataset.from_list(records)
    ds = ds.map(
        lambda ex: tokenize_and_align(ex, tokenizer, label2id, MAX_LEN),
        batched=True,
        remove_columns=["text"],
    )

    split = ds.train_test_split(test_size=EVAL_SPLIT, seed=42)
    train_ds = split["train"]
    eval_ds = split["test"]
    print(f" Train: {len(train_ds)}, Eval: {len(eval_ds)}")

    # Metrics
    def compute_metrics(pred):
        preds = np.argmax(pred.predictions, axis=-1)
        labels = pred.label_ids

        true_labels = []
        true_preds = []

        for pred_seq, label_seq in zip(preds, labels):
            seq_labels = []
            seq_preds = []
            for p, l in zip(pred_seq, label_seq):
                if l != -100:
                    seq_labels.append(id2label[l])
                    seq_preds.append(id2label[p])
            true_labels.append(seq_labels)
            true_preds.append(seq_preds)

        return {
            "f1": seq_f1_score(true_labels, true_preds),
            "precision": seq_precision_score(true_labels, true_preds),
            "recall": seq_recall_score(true_labels, true_preds),
        }
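    # seqeval scores at the entity level (span and type must both match exactly),
    # which is stricter than per-token accuracy.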

    # Train
    print("[4/5] Training...")
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size * 2,
        learning_rate=args.lr,
        lr_scheduler_type="cosine",
        warmup_ratio=WARMUP_RATIO,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        report_to="none",
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate
    print("[5/5] Final evaluation...")
    results = trainer.evaluate()
    print(f" F1: {results['eval_f1']:.4f}")
    print(f" Precision: {results['eval_precision']:.4f}")
    print(f" Recall: {results['eval_recall']:.4f}")

    # Detailed per-entity report
    preds_output = trainer.predict(eval_ds)
    preds = np.argmax(preds_output.predictions, axis=-1)
    labels = preds_output.label_ids

    true_labels = []
    true_preds = []
    for pred_seq, label_seq in zip(preds, labels):
        seq_labels = []
        seq_preds = []
        for p, l in zip(pred_seq, label_seq):
            if l != -100:
                seq_labels.append(id2label[l])
                seq_preds.append(id2label[p])
        true_labels.append(seq_labels)
        true_preds.append(seq_preds)

    report = seq_classification_report(true_labels, true_preds)
    print(f"\n{report}")

    # Save
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    meta = {
        "label_list": label_list,
        "label2id": label2id,
        "id2label": id2label,
        "results": {k: float(v) for k, v in results.items()},
    }
    with open(os.path.join(args.output_dir, "training_meta.json"), "w") as f:
        json.dump(meta, f, indent=2)

    print(f"\n✓ NER model saved → {args.output_dir}")


if __name__ == "__main__":
    main()
pricing_glm.py ADDED
@@ -0,0 +1,262 @@
"""
InsureOS — Insurance Pricing GLM + EBM
Trains a Tweedie GLM for pure premium estimation and an Explainable Boosting Machine (EBM)
for interpretable rating factor analysis. Uses motor claims tabular data.
"""

import os
import json
import argparse
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Tweedie GLM
from sklearn.linear_model import TweedieRegressor

# Explainable Boosting Machine (glass-box model)
from interpret.glassbox import ExplainableBoostingRegressor
from interpret import show


# ── Defaults ──

DATA_DIR = "data/output"
OUTPUT_DIR = "models/pricing-glm"
TEST_SIZE = 0.2
RANDOM_STATE = 42

# GLM hyperparams
TWEEDIE_POWER = 1.5  # 1 < p < 2 → Compound Poisson-Gamma (standard for insurance)
TWEEDIE_ALPHA = 1.0  # regularization strength
TWEEDIE_MAX_ITER = 300

# Features for pricing
PRICING_FEATURES = [
    "driver_age", "years_driving", "years_ncd", "vehicle_year", "vehicle_value",
    "annual_mileage", "voluntary_excess", "compulsory_excess",
    "previous_claims_3y", "policy_age_days",
]

CAT_FEATURES = [
    "vehicle_make", "fuel_type", "occupation", "region",
]


def load_and_prepare(data_dir: str) -> tuple[pd.DataFrame, dict]:
    """Load motor claims CSV and prepare for pricing model."""
    motor_files = list(Path(data_dir).glob("claims_motor_*.csv"))
    if not motor_files:
        raise FileNotFoundError(f"No motor claims CSV found in {data_dir}")

    df = pd.read_csv(str(motor_files[0]))

    # Target: claim_amount (pure premium proxy)
    # Only use claims with positive amounts
    df = df[df["claim_amount"] > 0].copy()

    # Encode categoricals
    encoders = {}
    for col in CAT_FEATURES:
        if col in df.columns:
            le = LabelEncoder()
            df[col + "_enc"] = le.fit_transform(df[col].fillna("Unknown"))
            encoders[col] = le

    # Derived features
    df["vehicle_age"] = 2025 - df["vehicle_year"]
    df["driver_experience_ratio"] = df["years_driving"] / df["driver_age"].clip(lower=18)
    df["ncd_ratio"] = df["years_ncd"] / df["years_driving"].clip(lower=1)

    return df, encoders


def train_tweedie_glm(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    feature_names: list,
    output_dir: str,
) -> dict:
    """Train Tweedie GLM for pure premium."""
    print("\n[GLM] Training Tweedie Regressor...")

    glm = TweedieRegressor(
        power=TWEEDIE_POWER,
        alpha=TWEEDIE_ALPHA,
        max_iter=TWEEDIE_MAX_ITER,
        link="log",
    )
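    # With the log link, predicted pure premium = exp(intercept + X @ coef); an
    # intercept of 9.97 alone (all coefficients zero) implies a flat prediction
    # of exp(9.97) ≈ £21,400 per claim.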

    X_tr = X_train[feature_names].fillna(0)
    X_te = X_test[feature_names].fillna(0)

    glm.fit(X_tr, y_train)

    # Predictions (clipped to positive)
    y_pred = np.clip(glm.predict(X_te), 0, None)

    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = np.mean(np.abs((y_test - y_pred) / y_test.clip(lower=1))) * 100

    # Coefficients
    coefs = dict(zip(feature_names, glm.coef_))

    results = {
        "model": "TweedieGLM",
        "tweedie_power": TWEEDIE_POWER,
        "mae": mae,
        "rmse": rmse,
        "mape_pct": mape,
        "coefficients": coefs,
        "intercept": float(glm.intercept_),
        "n_train": len(y_train),
        "n_test": len(y_test),
    }

    # Save
    model_path = os.path.join(output_dir, "tweedie_glm.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(glm, f)

    print(f" ✓ Tweedie GLM saved → {model_path}")
    print(f"   MAE: £{mae:,.2f}")
    print(f"   RMSE: £{rmse:,.2f}")
    print(f"   MAPE: {mape:.1f}%")
    print(f"   Top coefficients:")
    for feat, coef in sorted(coefs.items(), key=lambda x: abs(x[1]), reverse=True)[:5]:
        print(f"     {feat}: {coef:+.4f}")

    return results


def train_ebm(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    feature_names: list,
    output_dir: str,
) -> dict:
    """Train Explainable Boosting Machine for interpretable pricing."""
    print("\n[EBM] Training Explainable Boosting Machine...")

    ebm = ExplainableBoostingRegressor(
        max_bins=256,
        outer_bags=8,
        inner_bags=4,
        learning_rate=0.01,
        max_leaves=3,
        min_samples_leaf=10,
        interactions=10,  # allow up to 10 pairwise interactions
        random_state=RANDOM_STATE,
    )
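    # EBM is a glass-box GA2M: one additive shape function per feature plus up to
    # `interactions` pairwise terms, which is why names such as
    # "previous_claims_3y & policy_age_days" can appear among the importances.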

    X_tr = X_train[feature_names].fillna(0)
    X_te = X_test[feature_names].fillna(0)

    ebm.fit(X_tr, y_train)

    y_pred = np.clip(ebm.predict(X_te), 0, None)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape = np.mean(np.abs((y_test - y_pred) / y_test.clip(lower=1))) * 100

    # Feature importances from EBM
    importance = dict(zip(
        ebm.term_names_,
        ebm.term_importances(),
    ))

    results = {
        "model": "EBM",
        "mae": mae,
        "rmse": rmse,
        "mape_pct": mape,
        "n_train": len(y_train),
        "n_test": len(y_test),
        "top_features": dict(sorted(importance.items(), key=lambda x: x[1], reverse=True)[:10]),
    }

    # Save
    model_path = os.path.join(output_dir, "pricing_ebm.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(ebm, f)

    print(f" ✓ EBM saved → {model_path}")
    print(f"   MAE: £{mae:,.2f}")
    print(f"   RMSE: £{rmse:,.2f}")
    print(f"   MAPE: {mape:.1f}%")
    print(f"   Top features:")
    for feat, imp in sorted(importance.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"     {feat}: {imp:.4f}")

    return results


def main():
    parser = argparse.ArgumentParser(description="Train pricing models")
    parser.add_argument("--data-dir", default=DATA_DIR)
    parser.add_argument("--output-dir", default=OUTPUT_DIR)
    args = parser.parse_args()

    print(f"{'='*60}")
    print(f" InsureOS — Pricing Model Training")
    print(f" Data: {args.data_dir}")
    print(f"{'='*60}")

    os.makedirs(args.output_dir, exist_ok=True)

    # Load data
    print("\nLoading motor claims data...")
    df, encoders = load_and_prepare(args.data_dir)
    print(f" Records: {len(df)}")
    print(f" Mean claim amount: £{df['claim_amount'].mean():,.2f}")
    print(f" Median claim amount: £{df['claim_amount'].median():,.2f}")

    # Feature set
    numeric_features = PRICING_FEATURES + ["vehicle_age", "driver_experience_ratio", "ncd_ratio"]
    cat_enc_features = [c + "_enc" for c in CAT_FEATURES if c + "_enc" in df.columns]
    all_features = numeric_features + cat_enc_features

    y = df["claim_amount"]
    X = df[all_features]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Train both models
    glm_results = train_tweedie_glm(X_train, y_train, X_test, y_test, all_features, args.output_dir)
    ebm_results = train_ebm(X_train, y_train, X_test, y_test, all_features, args.output_dir)

    # Save encoders
    encoder_path = os.path.join(args.output_dir, "label_encoders.pkl")
    with open(encoder_path, "wb") as f:
        pickle.dump(encoders, f)

    # Save results
    summary = {"glm": glm_results, "ebm": ebm_results}
    summary_path = os.path.join(args.output_dir, "training_results.json")
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2, default=str)

    print(f"\n{'='*60}")
    print(f" ✓ Pricing model training complete!")
    print(f"   Tweedie GLM MAE: £{glm_results['mae']:,.2f}")
    print(f"   EBM MAE: £{ebm_results['mae']:,.2f}")
    print(f"   Results → {summary_path}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
push_to_hf.py ADDED
@@ -0,0 +1,153 @@
#!/usr/bin/env python3
"""Push all INSUREOS models and code to HuggingFace Hub.
Created by Bytical AI.
"""
import os
import sys
from huggingface_hub import HfApi, create_repo

TOKEN = os.environ.get("HF_TOKEN")
if not TOKEN:
    print("ERROR: Set HF_TOKEN environment variable")
    sys.exit(1)
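
# Usage (assumed invocation, from the repo root):
#   HF_TOKEN=hf_xxx python push_to_hf.py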
+
14
+ api = HfApi(token=TOKEN)
15
+ ORG = "piyushptiwari"
16
+
17
+ # Ensure the organization exists (it may just be the user namespace)
18
+ # We'll use "bytical" as org β€” if it doesn't exist, we fall back to user
19
+ try:
20
+ api.whoami()
21
+ print("Authenticated successfully")
22
+ except Exception as e:
23
+ print(f"Auth error: {e}")
24
+ sys.exit(1)
25
+
26
+
27
+ def ensure_repo(repo_id: str, repo_type: str = "model"):
28
+ """Create repo if it doesn't exist."""
29
+ try:
30
+ create_repo(
31
+ repo_id=repo_id,
32
+ repo_type=repo_type,
33
+ exist_ok=True,
34
+ token=TOKEN,
35
+ )
36
+ print(f" Repo ready: {repo_id} ({repo_type})")
37
+ except Exception as e:
38
+ print(f" Repo creation note for {repo_id}: {e}")
39
+
40
+
41
+ def upload_folder(repo_id: str, local_dir: str, repo_type: str = "model",
42
+ ignore_patterns=None):
43
+ """Upload a local folder to HF Hub."""
44
+ if ignore_patterns is None:
45
+ ignore_patterns = ["__pycache__", "*.pyc", ".DS_Store"]
46
+
47
+ print(f" Uploading {local_dir} -> {repo_id}...")
48
+ api.upload_folder(
49
+ repo_id=repo_id,
50
+ folder_path=local_dir,
51
+ repo_type=repo_type,
52
+ ignore_patterns=ignore_patterns,
53
+ )
54
+ print(f" Done: {repo_id}")
55
+
56
+
57
+ if __name__ == "__main__":
58
+ BASE = "/home/piyush/Desktop/Insurance/insureos-models"
59
+
60
+ # =========================================================
61
+ # 1. InsureLLM-4B (main LLM β€” the best merged model only)
62
+ # =========================================================
63
+ print("\n[1/7] InsureLLM-4B")
64
+ repo = f"{ORG}/InsureLLM-4B"
65
+ ensure_repo(repo)
66
+ upload_folder(repo, f"{BASE}/models/insurellm-4b-realworld-merged")
67
+
68
+ # =========================================================
69
+ # 2. InsureDocClassifier
70
+ # =========================================================
71
+ print("\n[2/7] InsureDocClassifier")
72
+ repo = f"{ORG}/InsureDocClassifier"
73
+ ensure_repo(repo)
74
+ upload_folder(
75
+ repo, f"{BASE}/models/doc-classifier",
76
+ ignore_patterns=["__pycache__", "*.pyc", ".DS_Store",
77
+ "checkpoint-*"]
78
+ )
79
+
80
+ # =========================================================
81
+ # 3. InsureNER
82
+ # =========================================================
83
+ print("\n[3/7] InsureNER")
84
+ repo = f"{ORG}/InsureNER"
85
+ ensure_repo(repo)
86
+ upload_folder(
87
+ repo, f"{BASE}/models/ner-model",
88
+ ignore_patterns=["__pycache__", "*.pyc", ".DS_Store",
89
+ "checkpoint-*"]
90
+ )
91
+
92
+ # =========================================================
93
+ # 4. InsureFraudNet
94
+ # =========================================================
95
+ print("\n[4/7] InsureFraudNet")
96
+ repo = f"{ORG}/InsureFraudNet"
97
+ ensure_repo(repo)
98
+ upload_folder(repo, f"{BASE}/models/fraudnet")
99
+
100
+ # =========================================================
101
+ # 5. InsurePricing
102
+ # =========================================================
103
+ print("\n[5/7] InsurePricing")
104
+ repo = f"{ORG}/InsurePricing"
105
+ ensure_repo(repo)
106
+ upload_folder(repo, f"{BASE}/models/pricing-glm")
107
+
108
+ # =========================================================
109
+ # 6. Training Code + Search Engine (as a regular repo)
110
+ # =========================================================
111
+ print("\n[6/7] insureos-models (code repo)")
112
+ repo = f"{ORG}/insureos-models"
113
+ ensure_repo(repo, repo_type="model")
114
+ # We upload code only β€” no model weights, no raw data, no personal files
115
+ upload_folder(
116
+ repo, BASE, repo_type="model",
117
+ ignore_patterns=[
118
+ "__pycache__", "*.pyc", ".DS_Store",
119
+ "models/*", # model weights are in separate repos
120
+ "*.pkl", "*.bin", # no binary artifacts in code repo
121
+ "raw/*", # raw scraped data
122
+ "processed/*",
123
+ "search/index_data/*", # search index binaries
124
+ "data/output/*", # generated training data
125
+ "*.jsonl", # training data files
126
+ ".venv/*",
127
+ ]
128
+ )
129
+
130
+ # =========================================================
131
+ # 7. Training Data (as a dataset)
132
+ # =========================================================
133
+ print("\n[7/7] insureos-training-data (dataset)")
134
+ repo = f"{ORG}/insureos-training-data"
135
+ ensure_repo(repo, repo_type="dataset")
136
+ upload_folder(
137
+ repo, f"{BASE}/data/output", repo_type="dataset",
138
+ ignore_patterns=["__pycache__", "*.pyc"]
139
+ )
140
+
141
+ print("\n" + "=" * 60)
142
+ print("ALL UPLOADS COMPLETE!")
143
+ print("=" * 60)
144
+ print(f"\nModels:")
145
+ print(f" https://huggingface.co/{ORG}/InsureLLM-4B")
146
+ print(f" https://huggingface.co/{ORG}/InsureDocClassifier")
147
+ print(f" https://huggingface.co/{ORG}/InsureNER")
148
+ print(f" https://huggingface.co/{ORG}/InsureFraudNet")
149
+ print(f" https://huggingface.co/{ORG}/InsurePricing")
150
+ print(f"\nCode:")
151
+ print(f" https://huggingface.co/{ORG}/insureos-models")
152
+ print(f"\nDataset:")
153
+ print(f" https://huggingface.co/datasets/{ORG}/insureos-training-data")
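+
+ # Usage (the script reads the token from the environment):
+ #   HF_TOKEN=hf_xxx python push_to_hf.py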
qlora_finetune.py ADDED
@@ -0,0 +1,198 @@
+ """
+ InsureOS — QLoRA Fine-Tuning Script
+ Fine-tunes a Qwen3 base model (Qwen3-4B by default; configurable via
+ --base-model) on UK insurance SFT data using 4-bit QLoRA.
+ Fits in 16 GB VRAM with gradient checkpointing.
+ """
+
+ import os
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # must be set before torch import
+ import json
+ import argparse
+
+ import torch
+ from datasets import Dataset
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     BitsAndBytesConfig,
+ )
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
+ from trl import SFTTrainer, SFTConfig
+
+
+ # ── Defaults ──
+
+ BASE_MODEL = "Qwen/Qwen3-4B"
+ DATA_PATH = "data/output/insurance_sft_10k.jsonl"
+ OUTPUT_DIR = "models/insurellm-4b-qlora"
+ MAX_SEQ_LEN = 1024
+ LORA_R = 64
+ LORA_ALPHA = 128
+ LORA_DROPOUT = 0.05
+ EPOCHS = 3
+ BATCH_SIZE = 2
+ GRAD_ACCUM = 8  # effective batch = BATCH_SIZE * GRAD_ACCUM = 16
+ LR = 2e-4
+ WARMUP_RATIO = 0.05
+ LOGGING_STEPS = 10
+ SAVE_STEPS = 200
+ EVAL_SPLIT = 0.05  # 5% held out for eval
+
+
+ def load_sft_data(path: str) -> Dataset:
+     """Load JSONL SFT data into a HuggingFace Dataset."""
+     records = []
+     with open(path) as f:
+         for line in f:
+             obj = json.loads(line)
+             records.append(obj)
+     return Dataset.from_list(records)
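+
+ # Expected JSONL record shape (inferred from format_messages below;
+ # field values are illustrative):
+ #   {"messages": [{"role": "user", "content": "..."},
+ #                 {"role": "assistant", "content": "..."}],
+ #    "category": "..."}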
+
+
+ def format_messages(example: dict) -> dict:
+     """Convert messages list to a single training text using the Qwen3 chat template."""
+     # The SFTTrainer with `dataset_text_field` or a chat template handles this,
+     # but we could also format manually if needed.
+     # Our SFT data has {"messages": [...], "category": ...}
+     return example  # SFTTrainer will use the tokenizer's chat template
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="QLoRA fine-tune a Qwen3 model for UK insurance")
+     parser.add_argument("--base-model", default=BASE_MODEL)
+     parser.add_argument("--data-path", default=DATA_PATH)
+     parser.add_argument("--output-dir", default=OUTPUT_DIR)
+     parser.add_argument("--epochs", type=int, default=EPOCHS)
+     parser.add_argument("--batch-size", type=int, default=BATCH_SIZE)
+     parser.add_argument("--lr", type=float, default=LR)
+     parser.add_argument("--lora-r", type=int, default=LORA_R)
+     parser.add_argument("--lora-alpha", type=int, default=LORA_ALPHA)
+     parser.add_argument("--max-seq-len", type=int, default=MAX_SEQ_LEN)
+     parser.add_argument("--grad-accum", type=int, default=GRAD_ACCUM)
+     parser.add_argument("--resume-from-checkpoint", action="store_true")
+     args = parser.parse_args()
+
+     print(f"{'='*60}")
+     print(f"  InsureOS — QLoRA Fine-Tuning")
+     print(f"  Base model: {args.base_model}")
+     print(f"  Data: {args.data_path}")
+     print(f"  LoRA rank: {args.lora_r}, alpha: {args.lora_alpha}")
+     print(f"{'='*60}\n")
+
+     # ── 1. Load tokenizer ──
+     print("[1/5] Loading tokenizer...")
+     tokenizer = AutoTokenizer.from_pretrained(
+         args.base_model,
+         trust_remote_code=True,
+         padding_side="right",
+     )
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # ── 2. Load model in 4-bit ──
+     print("[2/5] Loading model in 4-bit quantization...")
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.bfloat16,
+         bnb_4bit_use_double_quant=True,
+     )
+
+     model = AutoModelForCausalLM.from_pretrained(
+         args.base_model,
+         quantization_config=bnb_config,
+         device_map="auto",
+         trust_remote_code=True,
+         attn_implementation="sdpa",
+         dtype=torch.bfloat16,
+     )
+     model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
+
+     # ── 3. Apply LoRA ──
+     print("[3/5] Applying LoRA adapters...")
+     lora_config = LoraConfig(
+         r=args.lora_r,
+         lora_alpha=args.lora_alpha,
+         lora_dropout=LORA_DROPOUT,
+         target_modules="all-linear",
+         task_type=TaskType.CAUSAL_LM,
+         bias="none",
+     )
+     model = get_peft_model(model, lora_config)
+     model.print_trainable_parameters()
+
+     # ── 4. Load data ──
+     print("[4/5] Loading training data...")
+     dataset = load_sft_data(args.data_path)
+     print(f"  Total examples: {len(dataset)}")
+
+     # Train/eval split
+     split = dataset.train_test_split(test_size=EVAL_SPLIT, seed=42)
+     train_ds = split["train"]
+     eval_ds = split["test"]
+     print(f"  Train: {len(train_ds)}, Eval: {len(eval_ds)}")
+
+     # ── 5. Train ──
+     print("[5/5] Starting training...")
+
+     sft_config = SFTConfig(
+         output_dir=args.output_dir,
+         num_train_epochs=args.epochs,
+         per_device_train_batch_size=args.batch_size,
+         per_device_eval_batch_size=args.batch_size,
+         gradient_accumulation_steps=args.grad_accum,
+         learning_rate=args.lr,
+         lr_scheduler_type="cosine",
+         warmup_ratio=WARMUP_RATIO,
+         weight_decay=0.01,
+         bf16=True,
+         logging_steps=LOGGING_STEPS,
+         save_steps=SAVE_STEPS,
+         save_total_limit=3,
+         eval_strategy="steps",
+         eval_steps=SAVE_STEPS,
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_loss",
+         greater_is_better=False,
+         report_to="none",
+         max_length=args.max_seq_len,
+         packing=False,
+         gradient_checkpointing=True,
+         gradient_checkpointing_kwargs={"use_reentrant": False},
+         dataset_kwargs={"skip_prepare_dataset": False},
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         args=sft_config,
+         train_dataset=train_ds,
+         eval_dataset=eval_ds,
+         processing_class=tokenizer,
+     )
+
+     if args.resume_from_checkpoint:
+         trainer.train(resume_from_checkpoint=True)
+     else:
+         trainer.train()
+
+     # ── Save ──
+     print("\nSaving model and tokenizer...")
+     trainer.save_model(args.output_dir)
+     tokenizer.save_pretrained(args.output_dir)
+
+     # Save merged adapter weights for easier loading
+     merged_dir = f"{args.output_dir}-merged"
+     print(f"Merging LoRA weights → {merged_dir}")
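+     # Note (behaviour depends on the installed peft version): merging into a
+     # 4-bit base dequantizes the weights during the merge, which can cost
+     # some precision; for a clean bf16 merge, reload the base model in bf16
+     # first, as retrain_realworld.py does.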
+     merged_model = model.merge_and_unload()
+     merged_model.save_pretrained(merged_dir)
+     tokenizer.save_pretrained(merged_dir)
+
+     print(f"\n✓ Training complete!")
+     print(f"  LoRA adapter: {args.output_dir}")
+     print(f"  Merged model: {merged_dir}")
+
+
+ if __name__ == "__main__":
+     main()
retrain_realworld.py ADDED
@@ -0,0 +1,176 @@
+ """Retrain InsureLLM with real-world collected data.
+
+ This script:
+ 1. Loads the existing DPO-merged model (best checkpoint)
+ 2. Runs QLoRA fine-tuning on the real-world SFT data
+ 3. Saves the improved model
+ """
+
+ import argparse
+ import logging
+ import os
+ from pathlib import Path
+
+ import torch
+
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+ logger = logging.getLogger(__name__)
+
+ BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+ def retrain(
+     sft_file: str = "collect/sft_real_world.jsonl",
+     base_model: str = "models/insurellm-4b-dpo-merged",
+     output_dir: str = "models/insurellm-4b-realworld",
+     max_seq_len: int = 1024,
+     batch_size: int = 2,
+     grad_accum: int = 4,
+     epochs: int = 2,
+     lr: float = 2e-5,
+     lora_r: int = 64,
+     lora_alpha: int = 128,
+ ):
+     from datasets import load_dataset
+     from peft import LoraConfig, get_peft_model, TaskType
+     from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+     from trl import SFTConfig, SFTTrainer
+
+     sft_path = str(BASE_DIR / sft_file)
+     model_path = str(BASE_DIR / base_model)
+     out_path = str(BASE_DIR / output_dir)
+
+     logger.info(f"Loading SFT data from {sft_path}")
+     dataset = load_dataset("json", data_files=sft_path, split="train")
+     logger.info(f"  {len(dataset)} training examples")
+
+     # Train/eval split
+     split = dataset.train_test_split(test_size=0.05, seed=42)
+     train_ds = split["train"]
+     eval_ds = split["test"]
+     logger.info(f"  Train: {len(train_ds)}, Eval: {len(eval_ds)}")
+
+     # Quantization config
+     bnb = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.bfloat16,
+         bnb_4bit_use_double_quant=True,
+     )
+
+     logger.info(f"Loading base model: {model_path}")
+     model = AutoModelForCausalLM.from_pretrained(
+         model_path,
+         quantization_config=bnb,
+         device_map="auto",
+         trust_remote_code=True,
+         attn_implementation="sdpa",
+         dtype=torch.bfloat16,
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # LoRA config
+     lora_config = LoraConfig(
+         task_type=TaskType.CAUSAL_LM,
+         r=lora_r,
+         lora_alpha=lora_alpha,
+         lora_dropout=0.05,
+         target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                         "gate_proj", "up_proj", "down_proj"],
+         bias="none",
+     )
+
+     model = get_peft_model(model, lora_config)
+     model.print_trainable_parameters()
+
+     # Training config
+     training_args = SFTConfig(
+         output_dir=out_path,
+         num_train_epochs=epochs,
+         per_device_train_batch_size=batch_size,
+         per_device_eval_batch_size=batch_size,
+         gradient_accumulation_steps=grad_accum,
+         learning_rate=lr,
+         lr_scheduler_type="cosine",
+         warmup_ratio=0.05,
+         logging_steps=10,
+         save_strategy="steps",
+         save_steps=200,
+         eval_strategy="steps",
+         eval_steps=200,
+         save_total_limit=3,
+         bf16=True,
+         max_length=max_seq_len,
+         packing=False,
+         gradient_checkpointing=True,
+         gradient_checkpointing_kwargs={"use_reentrant": False},
+         report_to="none",
+         seed=42,
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_ds,
+         eval_dataset=eval_ds,
+         processing_class=tokenizer,
+     )
+
+     logger.info("Starting training...")
+     trainer.train()
+     logger.info("Training complete!")
+
+     # Save LoRA adapter
+     trainer.save_model(out_path)
+     tokenizer.save_pretrained(out_path)
+     logger.info(f"LoRA adapter saved to {out_path}")
+
+     # Merge and save
+     merged_path = out_path + "-merged"
+     logger.info("Merging LoRA into base model...")
+     from peft import PeftModel
+
+     base = AutoModelForCausalLM.from_pretrained(
+         model_path, dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True,
+     )
+     merged = PeftModel.from_pretrained(base, out_path)
+     merged = merged.merge_and_unload()
+
+     # Save with safetensors (works around a save_pretrained bug in some
+     # recent transformers releases)
+     os.makedirs(merged_path, exist_ok=True)
+     from safetensors.torch import save_file
+     state = merged.state_dict()
+     # Handle tied weights: safetensors refuses to serialize shared tensors,
+     # so clone lm_head if it aliases the embedding matrix
+     if "lm_head.weight" in state and "model.embed_tokens.weight" in state:
+         if state["lm_head.weight"].data_ptr() == state["model.embed_tokens.weight"].data_ptr():
+             state["lm_head.weight"] = state["lm_head.weight"].clone()
+     save_file(state, f"{merged_path}/model.safetensors")
+     merged.config.save_pretrained(merged_path)
+     tokenizer.save_pretrained(merged_path)
+     logger.info(f"Merged model saved to {merged_path}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Retrain InsureLLM with real-world data")
+     parser.add_argument("--sft-file", default="collect/sft_real_world.jsonl")
+     parser.add_argument("--base-model", default="models/insurellm-4b-dpo-merged")
+     parser.add_argument("--output-dir", default="models/insurellm-4b-realworld")
+     parser.add_argument("--max-seq-len", type=int, default=1024)
+     parser.add_argument("--batch-size", type=int, default=2)
+     parser.add_argument("--grad-accum", type=int, default=4)
+     parser.add_argument("--epochs", type=int, default=2)
+     parser.add_argument("--lr", type=float, default=2e-5)
+     args = parser.parse_args()
+
+     retrain(
+         sft_file=args.sft_file,
+         base_model=args.base_model,
+         output_dir=args.output_dir,
+         max_seq_len=args.max_seq_len,
+         batch_size=args.batch_size,
+         grad_accum=args.grad_accum,
+         epochs=args.epochs,
+         lr=args.lr,
+     )
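+
+ # Example invocation (module path assumed to mirror scripts/train_all.sh's
+ # `python -m training.<script>` pattern):
+ #   python -m training.retrain_realworld --epochs 2 --lr 2e-5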
run_collection.py ADDED
@@ -0,0 +1,128 @@
+ """Master orchestrator for all data collection sources."""
+
+ import logging
+ import time
+
+ # Setup logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(message)s",
+     datefmt="%H:%M:%S",
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def run_collection():
+     """Run all data collection sources."""
+ start = time.time()
21
+ total_docs = 0
22
+
23
+ # ── 1. Wikipedia ───────────────────────────────────────────────
24
+ logger.info("=" * 60)
25
+ logger.info("1/6 WIKIPEDIA β€” Insurance articles")
26
+ logger.info("=" * 60)
27
+ try:
28
+ from collect.sources.wikipedia import collect_wikipedia
29
+ docs = collect_wikipedia(max_articles=400)
30
+ total_docs += len(docs)
31
+ logger.info(f" βœ“ Wikipedia: {len(docs)} documents")
32
+ except Exception as e:
33
+ logger.error(f" βœ— Wikipedia failed: {e}")
34
+
35
+ # ── 2. FCA Handbook ────────────────────────────────────────────
36
+ logger.info("=" * 60)
37
+ logger.info("2/6 FCA HANDBOOK β€” UK insurance regulation")
38
+ logger.info("=" * 60)
39
+ try:
40
+ from collect.sources.fca import collect_fca
41
+ docs = collect_fca()
42
+ total_docs += len(docs)
43
+ logger.info(f" βœ“ FCA: {len(docs)} documents")
44
+ except Exception as e:
45
+ logger.error(f" βœ— FCA failed: {e}")
46
+
47
+ # ── 3. UK Legislation ──────────────────────────────────────────
48
+ logger.info("=" * 60)
49
+ logger.info("3/6 UK LEGISLATION β€” Insurance Act 2015 etc.")
50
+ logger.info("=" * 60)
51
+ try:
52
+ from collect.sources.legislation import collect_legislation
53
+ docs = collect_legislation()
54
+ total_docs += len(docs)
55
+ logger.info(f" βœ“ Legislation: {len(docs)} documents")
56
+ except Exception as e:
57
+ logger.error(f" βœ— Legislation failed: {e}")
58
+
59
+ # ── 4. Investopedia ────────────────────────────────────────────
60
+ logger.info("=" * 60)
61
+ logger.info("4/6 INVESTOPEDIA β€” Insurance glossary")
62
+ logger.info("=" * 60)
63
+ try:
64
+ from collect.sources.investopedia import collect_investopedia
65
+ docs = collect_investopedia()
66
+ total_docs += len(docs)
67
+ logger.info(f" βœ“ Investopedia: {len(docs)} documents")
68
+ except Exception as e:
69
+ logger.error(f" βœ— Investopedia failed: {e}")
70
+
71
+ # ── 5. HuggingFace ─────────────────────────────────────────────
72
+ logger.info("=" * 60)
73
+ logger.info("5/6 HUGGINGFACE β€” Insurance datasets")
74
+ logger.info("=" * 60)
75
+ try:
76
+ from collect.sources.hf_datasets import collect_huggingface
77
+ docs = collect_huggingface()
78
+ total_docs += len(docs)
79
+ logger.info(f" βœ“ HuggingFace: {len(docs)} documents")
80
+ except Exception as e:
81
+ logger.error(f" βœ— HuggingFace failed: {e}")
82
+
83
+ # ── 6. RSS / News ──────────────────────────────────────────────
84
+ logger.info("=" * 60)
85
+ logger.info("6/6 RSS NEWS β€” Insurance industry news")
86
+ logger.info("=" * 60)
87
+ try:
88
+ from collect.sources.rss_news import collect_rss
89
+ docs = collect_rss()
90
+ total_docs += len(docs)
91
+ logger.info(f" βœ“ RSS: {len(docs)} documents")
92
+ except Exception as e:
93
+ logger.error(f" βœ— RSS failed: {e}")
94
+
95
+ # ── 7. Education ───────────────────────────────────────────────
96
+ logger.info("=" * 60)
97
+ logger.info("7/7 EDUCATION β€” Open textbooks & exam content")
98
+ logger.info("=" * 60)
99
+ try:
100
+ from collect.sources.education import collect_education
101
+ docs = collect_education()
102
+ total_docs += len(docs)
103
+ logger.info(f" βœ“ Education: {len(docs)} documents")
104
+ except Exception as e:
105
+ logger.error(f" βœ— Education failed: {e}")
106
+
107
+ # ── Convert to SFT ─────────────────────────────────────────────
108
+ logger.info("=" * 60)
109
+ logger.info("CONVERTING collected data β†’ SFT + DPO training format")
110
+ logger.info("=" * 60)
111
+ try:
112
+ from collect.convert_sft import convert_all_to_sft
113
+ sft_count, dpo_count = convert_all_to_sft()
114
+ logger.info(f" βœ“ SFT pairs: {sft_count}")
115
+ logger.info(f" βœ“ DPO pairs: {dpo_count}")
116
+ except Exception as e:
117
+ logger.error(f" βœ— SFT conversion failed: {e}")
118
+
119
+ elapsed = time.time() - start
120
+ logger.info("=" * 60)
121
+ logger.info(f"COLLECTION COMPLETE")
122
+ logger.info(f" Total documents: {total_docs:,}")
123
+ logger.info(f" Time elapsed: {elapsed / 60:.1f} minutes")
124
+ logger.info("=" * 60)
125
+
126
+
127
+ if __name__ == "__main__":
128
+ run_collection()
run_eval.py ADDED
@@ -0,0 +1,356 @@
+ """
+ InsureOS — Comprehensive Evaluation Suite
+ Evaluates all trained models: InsureLLM (generative), FraudNet, PricingGLM, DocClassifier, NER.
+ """
+
+ import os
+ import json
+ import argparse
+ import time
+ from pathlib import Path
+
+ import torch
+ import numpy as np
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+ # Optional imports (gracefully degrade if not installed)
+ try:
+     from rouge_score import rouge_scorer
+     HAS_ROUGE = True
+ except ImportError:
+     HAS_ROUGE = False
+
+ try:
+     from bert_score import score as bert_score_fn
+     HAS_BERTSCORE = True
+ except ImportError:
+     HAS_BERTSCORE = False
+
+
+ # ── Defaults ──
+
+ INSURELLM_MODEL = "models/insurellm-8b-dpo-merged"
+ EVAL_DATA = "data/output/insurance_sft_10k.jsonl"
+ OUTPUT_DIR = "evaluation/results"
+ N_SAMPLES = 100  # number of samples for generative eval
+
+
+ # ── UK Insurance Domain-Specific Evaluation Prompts ──
+
+ DOMAIN_EVAL_PROMPTS = [
+     {
+         "id": "fca_consumer_duty",
+         "prompt": "A customer's home insurance claim for escape of water has been open for 8 weeks with no update. They call asking what's happening. Draft a response.",
+         "required_elements": ["apology", "timeline", "next steps", "complaint rights", "financial ombudsman"],
+         "forbidden_elements": ["jargon", "indemnification", "proximate cause", "peril"],
+     },
+     {
+         "id": "gdpr_data_protection",
+         "prompt": "A third party calls asking about the status of a motor claim made against our policyholder. How should we respond?",
+         "required_elements": ["data protection", "cannot share", "their own insurer", "GDPR"],
+         "forbidden_elements": ["policy number", "policyholder address", "bank details"],
+     },
+     {
+         "id": "claims_process",
+         "prompt": "Explain the insurance claims process for a domestic property fire from FNOL through to settlement.",
+         "required_elements": ["FNOL", "loss adjuster", "investigation", "settlement", "excess"],
+         "forbidden_elements": [],
+     },
+     {
+         "id": "fraud_indicators",
+         "prompt": "What are the key red flags for insurance fraud in motor third-party claims?",
+         "required_elements": ["exaggeration", "staged", "late reporting", "multiple claims"],
+         "forbidden_elements": [],
+     },
+     {
+         "id": "lloyds_market",
+         "prompt": "Explain the role of an MGA in the Lloyd's market and how binding authority agreements work.",
+         "required_elements": ["binding authority", "capacity provider", "syndicate", "delegated authority", "bordereaux"],
+         "forbidden_elements": [],
+     },
+     {
+         "id": "pricing_fairness",
+         "prompt": "An insurer wants to use first names as a rating factor because it improves their model by 3%. Should they?",
+         "required_elements": ["proxy discrimination", "protected characteristics", "Equality Act", "FCA"],
+         "forbidden_elements": [],
+     },
+     {
+         "id": "subrogation",
+         "prompt": "Explain subrogation rights in UK insurance. When does an insurer pursue recovery?",
+         "required_elements": ["recovery", "third party", "policyholder indemnified", "non-fault"],
+         "forbidden_elements": [],
+     },
+     {
+         "id": "renewal_transparency",
+         "prompt": "A customer's premium increased by 25% at renewal. They want to know why. Draft an explanation.",
+         "required_elements": ["transparency", "factors", "shop around", "Consumer Duty", "fair value"],
+         "forbidden_elements": ["take it or leave it", "market rate"],
+     },
+ ]
+
+
+ def evaluate_insurellm(model_path: str, n_samples: int, output_dir: str) -> dict:
+     """Evaluate the generative InsureLLM model."""
+     print(f"\n{'='*60}")
+     print(f"  Evaluating InsureLLM: {model_path}")
+     print(f"{'='*60}")
+
+     # Load model
+     print("Loading model...")
+     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.bfloat16,
+         bnb_4bit_use_double_quant=True,
+     )
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_path,
+         quantization_config=bnb_config,
+         device_map="auto",
+         trust_remote_code=True,
+         attn_implementation="sdpa",
+         torch_dtype=torch.bfloat16,
+     )
+     model.eval()
+
+     results = {
+         "model": model_path,
+         "domain_eval": [],
+         "generation_metrics": {},
+     }
+
+     # ── 1. Domain-Specific Evaluation ──
+     print("\n[1/3] Domain-specific evaluation...")
+     for item in DOMAIN_EVAL_PROMPTS:
+         messages = [
+             {"role": "system", "content": "You are InsureLLM, a specialist UK insurance AI assistant."},
+             {"role": "user", "content": item["prompt"]},
+         ]
+
+         text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = tokenizer(text, return_tensors="pt").to(model.device)
+
+         start = time.time()
+         with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=512,
+                 temperature=0.7,
+                 top_p=0.9,
+                 do_sample=True,
+             )
+         latency = time.time() - start
+
+         response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+         # Check required elements
+         response_lower = response.lower()
+         found_required = [e for e in item["required_elements"] if e.lower() in response_lower]
+         found_forbidden = [e for e in item["forbidden_elements"] if e.lower() in response_lower]
+
+         score = len(found_required) / max(len(item["required_elements"]), 1)
+         penalty = len(found_forbidden) * 0.15
+         final_score = max(0, score - penalty)
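+         # Worked example: 4 of 5 required elements found and 1 forbidden term
+         # used gives score = 0.8, penalty = 0.15, final_score = 0.65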
+
+         eval_result = {
+             "id": item["id"],
+             "score": final_score,
+             "required_found": len(found_required),
+             "required_total": len(item["required_elements"]),
+             "forbidden_found": len(found_forbidden),
+             "latency_s": latency,
+             "response_length": len(response.split()),
+         }
+         results["domain_eval"].append(eval_result)
+
+         status = "✓" if final_score >= 0.7 else "△" if final_score >= 0.4 else "✗"
+         print(f"  {status} {item['id']}: {final_score:.2f} "
+               f"({len(found_required)}/{len(item['required_elements'])} required, "
+               f"{len(found_forbidden)} forbidden, {latency:.1f}s)")
+
+     avg_domain = np.mean([r["score"] for r in results["domain_eval"]])
+     avg_latency = np.mean([r["latency_s"] for r in results["domain_eval"]])
+     print(f"\n  Average domain score: {avg_domain:.3f}")
+     print(f"  Average latency: {avg_latency:.1f}s")
+
+     # ── 2. ROUGE scores on held-out SFT data ──
+     if HAS_ROUGE and os.path.exists(EVAL_DATA):
+         print("\n[2/3] ROUGE evaluation on SFT test set...")
+         scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
+
+         eval_records = []
+         with open(EVAL_DATA) as f:
+             for line in f:
+                 eval_records.append(json.loads(line))
+
+         # Use last N as eval
+         eval_subset = eval_records[-min(n_samples, len(eval_records)):]
+
+         rouge1_scores = []
+         rouge2_scores = []
+         rougeL_scores = []
+
+         for rec in eval_subset:
+             messages = rec["messages"]
+             # Get reference (last assistant message)
+             reference = messages[-1]["content"]
+             prompt_messages = messages[:-1]
+
+             text = tokenizer.apply_chat_template(prompt_messages, tokenize=False, add_generation_prompt=True)
+             inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
+
+             with torch.no_grad():
+                 outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False)  # greedy decoding
+
+             generated = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+             scores = scorer.score(reference, generated)
+             rouge1_scores.append(scores["rouge1"].fmeasure)
+             rouge2_scores.append(scores["rouge2"].fmeasure)
+             rougeL_scores.append(scores["rougeL"].fmeasure)
+
+         results["generation_metrics"]["rouge1"] = float(np.mean(rouge1_scores))
+         results["generation_metrics"]["rouge2"] = float(np.mean(rouge2_scores))
+         results["generation_metrics"]["rougeL"] = float(np.mean(rougeL_scores))
+
+         print(f"  ROUGE-1: {results['generation_metrics']['rouge1']:.4f}")
+         print(f"  ROUGE-2: {results['generation_metrics']['rouge2']:.4f}")
+         print(f"  ROUGE-L: {results['generation_metrics']['rougeL']:.4f}")
+     else:
+         print("\n[2/3] Skipping ROUGE (rouge_score not installed or data not found)")
+
+     # ── 3. Summary metrics ──
+     print("\n[3/3] Computing summary...")
+     results["summary"] = {
+         "avg_domain_score": float(avg_domain),
+         "avg_latency_s": float(avg_latency),
+         "domain_pass_rate": float(np.mean([1 if r["score"] >= 0.7 else 0 for r in results["domain_eval"]])),
+     }
+
+     # Save
+     os.makedirs(output_dir, exist_ok=True)
+     outpath = os.path.join(output_dir, "insurellm_eval.json")
+     with open(outpath, "w") as f:
+         json.dump(results, f, indent=2)
+     print(f"\n✓ InsureLLM eval results → {outpath}")
+
+     return results
+
+
+ def evaluate_all(args):
+     """Run evaluation for all available models."""
+     print(f"{'='*60}")
+     print(f"  InsureOS — Full Evaluation Suite")
+     print(f"{'='*60}")
+
+     os.makedirs(args.output_dir, exist_ok=True)
+     all_results = {}
+
+     # 1. InsureLLM
+     if os.path.exists(args.insurellm_model):
+         all_results["insurellm"] = evaluate_insurellm(
+             args.insurellm_model, args.n_samples, args.output_dir
+         )
+     else:
+         print(f"\n⚠ InsureLLM not found at {args.insurellm_model}, skipping")
+
+     # 2. FraudNet — just check if results exist from training
+     fraud_results = Path("models/fraudnet/training_results.json")
+     if fraud_results.exists():
+         with open(fraud_results) as f:
+             all_results["fraudnet"] = json.load(f)
+         print("\n✓ FraudNet results loaded from training")
+     else:
+         print("\n⚠ FraudNet results not found, skipping")
+
+     # 3. Pricing GLM
+     pricing_results = Path("models/pricing-glm/training_results.json")
+     if pricing_results.exists():
+         with open(pricing_results) as f:
+             all_results["pricing"] = json.load(f)
+         print("✓ Pricing model results loaded from training")
+     else:
+         print("⚠ Pricing results not found, skipping")
+
+     # 4. Doc Classifier
+     doc_meta = Path("models/doc-classifier/training_meta.json")
+     if doc_meta.exists():
+         with open(doc_meta) as f:
+             all_results["doc_classifier"] = json.load(f)
+         print("✓ Doc classifier results loaded")
+     else:
+         print("⚠ Doc classifier results not found, skipping")
+
+     # 5. NER
+     ner_meta = Path("models/ner-model/training_meta.json")
+     if ner_meta.exists():
+         with open(ner_meta) as f:
+             all_results["ner"] = json.load(f)
+         print("✓ NER results loaded")
+     else:
+         print("⚠ NER results not found, skipping")
+
+     # ── Summary report ──
+     report_path = os.path.join(args.output_dir, "full_eval_report.json")
+     with open(report_path, "w") as f:
+         json.dump(all_results, f, indent=2, default=str)
+
+     print(f"\n{'='*60}")
+     print(f"  EVALUATION SUMMARY")
+     print(f"{'='*60}")
+
+     if "insurellm" in all_results:
+         s = all_results["insurellm"].get("summary", {})
+         print(f"\n  InsureLLM (Generative):")
+         print(f"    Domain score: {s.get('avg_domain_score', 'N/A')}")
+         print(f"    Pass rate: {s.get('domain_pass_rate', 'N/A')}")
+         print(f"    Latency: {s.get('avg_latency_s', 'N/A')}s")
+
+     if "fraudnet" in all_results:
+         for r in all_results["fraudnet"]:
+             if isinstance(r, dict):
+                 print(f"\n  FraudNet ({r.get('lob', '?')}):")
+                 print(f"    AUC-ROC: {r.get('auc_roc', 'N/A')}")
+                 print(f"    Avg Precision: {r.get('avg_precision', 'N/A')}")
+
+     if "pricing" in all_results:
+         for model_type in ["glm", "ebm"]:
+             if model_type in all_results["pricing"]:
+                 m = all_results["pricing"][model_type]
+                 print(f"\n  Pricing {model_type.upper()}:")
+                 print(f"    MAE: £{m.get('mae', 'N/A')}")
+                 print(f"    RMSE: £{m.get('rmse', 'N/A')}")
+
+     if "doc_classifier" in all_results:
+         r = all_results["doc_classifier"].get("results", {})
+         print(f"\n  Document Classifier:")
+         print(f"    Accuracy: {r.get('eval_accuracy', 'N/A')}")
+         print(f"    F1 (macro): {r.get('eval_f1_macro', 'N/A')}")
+
+     if "ner" in all_results:
+         r = all_results["ner"].get("results", {})
+         print(f"\n  NER Model:")
+         print(f"    F1: {r.get('eval_f1', 'N/A')}")
+         print(f"    Precision: {r.get('eval_precision', 'N/A')}")
+         print(f"    Recall: {r.get('eval_recall', 'N/A')}")
+
+     print(f"\n  Full report → {report_path}")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="InsureOS evaluation suite")
+     parser.add_argument("--insurellm-model", default=INSURELLM_MODEL)
+     parser.add_argument("--n-samples", type=int, default=N_SAMPLES)
+     parser.add_argument("--output-dir", default=OUTPUT_DIR)
+     args = parser.parse_args()
+
+     evaluate_all(args)
+
+
+ if __name__ == "__main__":
+     main()
run_fast.py ADDED
@@ -0,0 +1,131 @@
+ """Fast data collection — reduced Wikipedia cap, lower API delay."""
+
+ import logging
+ import time
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(message)s",
+     datefmt="%H:%M:%S",
+ )
+ logger = logging.getLogger(__name__)
+
+ # Override delay for API sources (Wikipedia API is generous)
+ import collect.config as cfg
+ cfg.REQUEST_DELAY = 0.5
+
+
+ def run_fast():
+     start = time.time()
+     total_docs = 0
+
+     # 1. Wikipedia (cap at 150 — still 2M+ chars of insurance knowledge)
+     logger.info("=" * 60)
+     logger.info("1/7 WIKIPEDIA — Insurance articles (max 150)")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.wikipedia import collect_wikipedia
+         docs = collect_wikipedia(max_articles=150)
+         total_docs += len(docs)
+         logger.info(f"  => Wikipedia: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  Wikipedia failed: {e}", exc_info=True)
+
+     # 2. FCA Handbook
+     cfg.REQUEST_DELAY = 1.5  # Web scraping — be polite
+     logger.info("=" * 60)
+     logger.info("2/7 FCA HANDBOOK")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.fca import collect_fca
+         docs = collect_fca()
+         total_docs += len(docs)
+         logger.info(f"  => FCA: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  FCA failed: {e}", exc_info=True)
+
+     # 3. UK Legislation
+     logger.info("=" * 60)
+     logger.info("3/7 UK LEGISLATION")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.legislation import collect_legislation
+         docs = collect_legislation()
+         total_docs += len(docs)
+         logger.info(f"  => Legislation: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  Legislation failed: {e}", exc_info=True)
+
+     # 4. Investopedia
+     logger.info("=" * 60)
+     logger.info("4/7 INVESTOPEDIA")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.investopedia import collect_investopedia
+         docs = collect_investopedia()
+         total_docs += len(docs)
+         logger.info(f"  => Investopedia: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  Investopedia failed: {e}", exc_info=True)
+
+     # 5. HuggingFace
+     cfg.REQUEST_DELAY = 0.3
+     logger.info("=" * 60)
+     logger.info("5/7 HUGGINGFACE DATASETS")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.hf_datasets import collect_huggingface
+         docs = collect_huggingface()
+         total_docs += len(docs)
+         logger.info(f"  => HuggingFace: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  HuggingFace failed: {e}", exc_info=True)
+
+     # 6. RSS News
+     cfg.REQUEST_DELAY = 1.0
+     logger.info("=" * 60)
+     logger.info("6/7 RSS NEWS")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.rss_news import collect_rss
+         docs = collect_rss()
+         total_docs += len(docs)
+         logger.info(f"  => RSS: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  RSS failed: {e}", exc_info=True)
+
+     # 7. Education
+     logger.info("=" * 60)
+     logger.info("7/7 EDUCATION")
+     logger.info("=" * 60)
+     try:
+         from collect.sources.education import collect_education
+         docs = collect_education()
+         total_docs += len(docs)
+         logger.info(f"  => Education: {len(docs)} documents")
+     except Exception as e:
+         logger.error(f"  Education failed: {e}", exc_info=True)
+
+     # Convert to SFT
+     logger.info("=" * 60)
+     logger.info("CONVERTING → SFT + DPO format")
+     logger.info("=" * 60)
+     try:
+         from collect.convert_sft import convert_all_to_sft
+         sft_count, dpo_count = convert_all_to_sft()
+         logger.info(f"  => SFT pairs: {sft_count}")
+         logger.info(f"  => DPO pairs: {dpo_count}")
+     except Exception as e:
+         logger.error(f"  SFT conversion failed: {e}", exc_info=True)
+
+     elapsed = time.time() - start
+     logger.info("=" * 60)
+     logger.info(f"DONE — {total_docs:,} documents in {elapsed / 60:.1f} min")
+     logger.info("=" * 60)
+
+
+ if __name__ == "__main__":
+     run_fast()
scraper_base.py ADDED
@@ -0,0 +1,102 @@
+ """Base scraper with rate limiting, retries, and polite crawling."""
+
+ import time
+ import json
+ import hashlib
+ import logging
+ from pathlib import Path
+ from typing import Optional
+
+ import requests
+ from requests.adapters import HTTPAdapter
+ from urllib3.util.retry import Retry
+
+ from collect.config import (
+     HEADERS, REQUEST_DELAY, MAX_RETRIES, TIMEOUT, RAW_DIR,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class BaseScraper:
+     """Polite web scraper with rate limiting and caching."""
+
+     def __init__(self, source_name: str):
+         self.source_name = source_name
+         self.output_dir = RAW_DIR / source_name
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+         self.session = self._build_session()
+         self._last_request_time = 0.0
+         self.stats = {"fetched": 0, "cached": 0, "failed": 0, "total_chars": 0}
+
+     def _build_session(self) -> requests.Session:
+         session = requests.Session()
+         session.headers.update(HEADERS)
+         retry = Retry(
+             total=MAX_RETRIES,
+             backoff_factor=1.0,
+             status_forcelist=[429, 500, 502, 503, 504],
+             allowed_methods=["GET"],
+         )
+         adapter = HTTPAdapter(max_retries=retry)
+         session.mount("https://", adapter)
+         session.mount("http://", adapter)
+         return session
+
+     def _rate_limit(self):
+         elapsed = time.time() - self._last_request_time
+         if elapsed < REQUEST_DELAY:
+             time.sleep(REQUEST_DELAY - elapsed)
+         self._last_request_time = time.time()
+
+     def _cache_key(self, url: str) -> str:
+         return hashlib.sha256(url.encode()).hexdigest()[:16]
+
+     def _cache_path(self, url: str) -> Path:
+         return self.output_dir / f"{self._cache_key(url)}.json"
+
+     def fetch(self, url: str, force: bool = False) -> Optional[str]:
+         """Fetch URL content with caching and rate limiting."""
+         cache = self._cache_path(url)
+         if not force and cache.exists():
+             data = json.loads(cache.read_text())
+             self.stats["cached"] += 1
+             return data.get("content")
+
+         self._rate_limit()
+         try:
+             resp = self.session.get(url, timeout=TIMEOUT)
+             resp.raise_for_status()
+             content = resp.text
+             # Cache the result (timestamp in UTC, matching the "Z" suffix)
+             cache.write_text(json.dumps({
+                 "url": url,
+                 "status": resp.status_code,
+                 "content": content,
+                 "fetched_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+             }))
+             self.stats["fetched"] += 1
+             self.stats["total_chars"] += len(content)
+             return content
+         except Exception as e:
+             logger.warning(f"[{self.source_name}] Failed to fetch {url}: {e}")
+             self.stats["failed"] += 1
+             return None
+
+     def save_documents(self, documents: list[dict], filename: str = "documents.jsonl"):
+         """Save collected documents as JSONL."""
+         out = self.output_dir / filename
+         with open(out, "w") as f:
+             for doc in documents:
+                 f.write(json.dumps(doc, ensure_ascii=False) + "\n")
+         logger.info(f"[{self.source_name}] Saved {len(documents)} docs → {out}")
+         return out
+
+     def print_stats(self):
+         logger.info(
+             f"[{self.source_name}] Stats: "
+             f"fetched={self.stats['fetched']}, "
+             f"cached={self.stats['cached']}, "
+             f"failed={self.stats['failed']}, "
+             f"chars={self.stats['total_chars']:,}"
+         )
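+
+ # Minimal usage sketch (hypothetical subclass; collect() is not part of the
+ # base class):
+ #   class WikipediaScraper(BaseScraper):
+ #       def collect(self) -> list[dict]:
+ #           html = self.fetch("https://en.wikipedia.org/wiki/Insurance")
+ #           return [{"title": "Insurance", "content": html}] if html else []
+ #
+ #   scraper = WikipediaScraper("wikipedia")  # caches under RAW_DIR / "wikipedia"
+ #   scraper.save_documents(scraper.collect())
+ #   scraper.print_stats()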
scripts/setup.sh ADDED
@@ -0,0 +1,82 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ # ============================================================
+ # InsureOS Models — Environment Setup
+ # Target: 16GB GPU VM (Bytical Audio)
+ # ============================================================
+
+ echo "=== InsureOS Models Setup ==="
+ echo "GPU check:"
+ nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo "WARNING: No GPU detected"
+
+ # System packages
+ echo "=== Installing system dependencies ==="
+ sudo apt-get update -qq
+ sudo apt-get install -y -qq python3-pip python3-venv git curl wget
+
+ # Create venv
+ echo "=== Creating Python virtual environment ==="
+ python3 -m venv .venv
+ source .venv/bin/activate
+
+ # Core ML
+ echo "=== Installing PyTorch + CUDA ==="
+ pip install --upgrade pip wheel setuptools
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
+ # Training stack (version specifiers are quoted so the shell does not
+ # treat ">" as an output redirect)
+ echo "=== Installing training dependencies ==="
+ pip install \
+     "transformers>=4.50.0" \
+     "datasets>=3.0.0" \
+     "accelerate>=1.2.0" \
+     "peft>=0.14.0" \
+     "trl>=0.15.0" \
+     "bitsandbytes>=0.45.0" \
+     flash-attn --no-build-isolation \
+     sentencepiece \
+     protobuf \
+     wandb
+
+ # Specialized ML
+ echo "=== Installing specialized ML packages ==="
+ pip install \
+     "scikit-learn>=1.5.0" \
+     "xgboost>=2.1.0" \
+     "lightgbm>=4.5.0" \
+     "interpret>=0.6.0" \
+     "statsmodels>=0.14.0" \
+     "scipy>=1.14.0" \
+     "networkx>=3.4" \
+     "torch-geometric>=2.6.0" \
+     pyg-lib -f https://data.pyg.org/whl/torch-2.5.0+cu121.html
+
+ # Serving
+ echo "=== Installing serving dependencies ==="
+ pip install \
+     "fastapi>=0.115.0" \
+     "uvicorn>=0.32.0" \
+     "pydantic>=2.10.0"
+
+ # Evaluation
+ echo "=== Installing evaluation packages ==="
+ pip install \
+     rouge-score \
+     nltk \
+     bert-score \
+     seqeval
+
+ # Data generation
+ echo "=== Installing data generation packages ==="
+ pip install \
+     "faker>=33.0.0" \
+     "numpy>=1.26.0" \
+     "pandas>=2.2.0" \
+     "tqdm>=4.67.0"
+
+ echo ""
+ echo "=== Setup complete! ==="
+ echo "Activate with: source .venv/bin/activate"
+ echo "Generate data: python -m data.generate_all"
+ echo "Train all: bash scripts/train_all.sh"
scripts/train_all.sh ADDED
@@ -0,0 +1,108 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ # ============================================================
+ # InsureOS Models — Train All Models Sequentially
+ # Designed for 16GB GPU — runs one model at a time
+ # ============================================================
+
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ ROOT_DIR="$(dirname "$SCRIPT_DIR")"
+ cd "$ROOT_DIR"
+
+ source .venv/bin/activate
+
+ echo "============================================"
+ echo " InsureOS Models — Full Training Pipeline"
+ echo " $(date)"
+ echo "============================================"
+
+ # Phase 0: Generate data
+ echo ""
+ echo "=== PHASE 0: Generating synthetic training data ==="
+ python -m data.generate_all
+
+ # Phase 1: QLoRA fine-tuning (InsureLLM-8B)
+ # (flag names follow the argparse defined in qlora_finetune.py)
+ echo ""
+ echo "=== PHASE 1: QLoRA Fine-Tuning — Qwen3-8B ==="
+ python -m training.qlora_finetune \
+     --base-model Qwen/Qwen3-8B \
+     --data-path data/output/insurance_sft_10k.jsonl \
+     --output-dir outputs/insurellm-8b \
+     --epochs 3 \
+     --batch-size 2 \
+     --grad-accum 8 \
+     --lr 2e-4 \
+     --lora-r 64 \
+     --lora-alpha 128
+
+ # Phase 2: DPO preference optimization
+ echo ""
+ echo "=== PHASE 2: DPO Preference Optimization ==="
+ python -m training.dpo_train \
+     --base-model Qwen/Qwen3-8B \
+     --adapter-path outputs/insurellm-8b \
+     --dataset data/output/insurance_dpo_5k.jsonl \
+     --output-dir outputs/insurellm-8b-dpo \
+     --epochs 1 \
+     --batch-size 1 \
+     --gradient-accumulation 16 \
+     --learning-rate 5e-5 \
+     --beta 0.1
+
+ # Phase 3: Distillation to smaller model
+ echo ""
+ echo "=== PHASE 3: Knowledge Distillation — Qwen3-4B ==="
+ python -m training.distill \
+     --teacher-model Qwen/Qwen3-8B \
+     --teacher-adapter outputs/insurellm-8b-dpo \
+     --student-model Qwen/Qwen3-4B \
+     --dataset data/output/insurance_sft_10k.jsonl \
+     --output-dir outputs/insurellm-4b \
+     --epochs 2 \
+     --batch-size 2 \
+     --gradient-accumulation 8
+
+ # Phase 4: Fraud detection model
+ echo ""
+ echo "=== PHASE 4: Fraud Detection Model ==="
+ python -m training.fraud_model \
+     --dataset data/output/claims_tabular_50k.csv \
+     --output-dir outputs/fraudnet
+
+ # Phase 5: Pricing GLM
+ echo ""
+ echo "=== PHASE 5: Pricing GLM ==="
+ python -m training.pricing_glm \
+     --dataset data/output/claims_tabular_50k.csv \
+     --output-dir outputs/pricing-glm
+
+ # Phase 6: Document classifier
+ echo ""
+ echo "=== PHASE 6: Document Classifier ==="
+ python -m training.doc_classifier \
+     --dataset data/output/documents_10k.jsonl \
+     --output-dir outputs/doc-classifier \
+     --epochs 5 \
+     --batch-size 16
+
+ # Phase 7: Insurance NER
+ echo ""
+ echo "=== PHASE 7: Insurance NER ==="
+ python -m training.ner_model \
+     --dataset data/output/entities_8k.jsonl \
+     --output-dir outputs/insure-ner \
+     --epochs 5 \
+     --batch-size 16
+
+ # Phase 8: Evaluation (run_eval evaluates every available model by default)
+ echo ""
+ echo "=== PHASE 8: Running Evaluation Suite ==="
+ python -m evaluation.run_eval
+
+ echo ""
+ echo "============================================"
+ echo " Training Complete! $(date)"
+ echo " Models saved in outputs/"
+ echo "============================================"
+ ls -la outputs/
search/__init__.py ADDED
@@ -0,0 +1 @@
+ # InsureSearch — open-source search engine
search/api.py ADDED
@@ -0,0 +1,234 @@
+ """FastAPI REST API for InsureSearch — matching Azure AI Search API patterns.
+
+ Endpoints:
+     POST /search          — Hybrid search (vector + BM25 + reranker)
+     POST /search/vector   — Pure vector search
+     POST /search/keyword  — Pure BM25 keyword search
+     GET  /suggest         — Autocomplete suggestions
+     GET  /facets          — Get available filter facets
+     GET  /stats           — Index statistics
+     POST /index/build     — Trigger index rebuild
+     GET  /health          — Health check
+ """
13
+
14
+ import logging
15
+ import time
16
+ from typing import Optional
17
+
18
+ from fastapi import FastAPI, HTTPException, Query
19
+ from fastapi.middleware.cors import CORSMiddleware
20
+ from pydantic import BaseModel, Field
21
+
22
+ from search.config import (
23
+ API_DESCRIPTION, API_HOST, API_PORT, API_TITLE, API_VERSION,
24
+ DEFAULT_TOP_K, MAX_TOP_K,
25
+ )
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ # ── FastAPI app ────────────────────────────────────────────────────
30
+ app = FastAPI(
31
+ title=API_TITLE,
32
+ version=API_VERSION,
33
+ description=API_DESCRIPTION,
34
+ )
35
+
36
+ app.add_middleware(
37
+ CORSMiddleware,
38
+ allow_origins=["*"],
39
+ allow_credentials=True,
40
+ allow_methods=["*"],
41
+ allow_headers=["*"],
42
+ )
43
+
44
+ # ── Engine singleton ───────────────────────────────────────────────
45
+ _engine = None
46
+
47
+
48
+ def _get_engine():
49
+ global _engine
50
+ if _engine is None:
51
+ from search.hybrid_engine import HybridSearchEngine
52
+ _engine = HybridSearchEngine()
53
+ _engine.load()
54
+ return _engine
55
+
56
+
57
+ # ── Request/Response models ────────────────────────────────────────
58
+
59
+ class SearchRequest(BaseModel):
60
+ query: str = Field(..., min_length=1, max_length=1000,
61
+ description="Search query text")
62
+ top_k: int = Field(DEFAULT_TOP_K, ge=1, le=MAX_TOP_K,
63
+ description="Number of results to return")
64
+ filter_source: Optional[str] = Field(None,
65
+ description="Filter by source (e.g. 'wikipedia', 'fca_handbook')")
66
+ filter_category: Optional[str] = Field(None,
67
+ description="Filter by category")
68
+ use_reranker: bool = Field(True,
69
+ description="Apply cross-encoder reranking")
70
+
71
+
72
+ class SearchResultResponse(BaseModel):
73
+ chunk_id: str
74
+ doc_id: str
75
+ title: str
76
+ text: str
77
+ score: float
78
+ source: str
79
+ url: str
80
+ category: str
81
+ highlights: list[str]
82
+ vector_rank: Optional[int] = None
83
+ bm25_rank: Optional[int] = None
84
+ rerank_score: Optional[float] = None
85
+
86
+
87
+ class SearchResponse(BaseModel):
88
+ query: str
89
+ results: list[SearchResultResponse]
90
+ total_found: int
91
+ latency_ms: float
92
+ method: str
93
+ facets: dict
94
+
95
+
96
+ class StatsResponse(BaseModel):
97
+ bm25_chunks: int
98
+ bm25_terms: int
99
+ vector_stats: dict
100
+ facets: dict
101
+
102
+
103
+ # ── Endpoints ──────────────────────────────────────────────────────
104
+
105
+ @app.get("/health")
+ def health():
+     return {"status": "ok", "engine": API_TITLE, "version": API_VERSION}
+
+
+ @app.post("/search", response_model=SearchResponse)
+ def search(req: SearchRequest):
+     """Hybrid search (vector + BM25 + cross-encoder reranking)."""
+     engine = _get_engine()
+     result = engine.search(
+         query=req.query,
+         top_k=req.top_k,
+         method="hybrid",
+         filter_source=req.filter_source,
+         filter_category=req.filter_category,
+         use_reranker=req.use_reranker,
+     )
+     return _to_response(result)
+
+
+ @app.post("/search/vector", response_model=SearchResponse)
+ def search_vector(req: SearchRequest):
+     """Pure vector (semantic) search."""
+     engine = _get_engine()
+     result = engine.search(
+         query=req.query,
+         top_k=req.top_k,
+         method="vector",
+         filter_source=req.filter_source,
+         filter_category=req.filter_category,
+         use_reranker=req.use_reranker,
+     )
+     return _to_response(result)
+
+
+ @app.post("/search/keyword", response_model=SearchResponse)
+ def search_keyword(req: SearchRequest):
+     """Pure BM25 keyword search."""
+     engine = _get_engine()
+     result = engine.search(
+         query=req.query,
+         top_k=req.top_k,
+         method="bm25",
+         filter_source=req.filter_source,
+         filter_category=req.filter_category,
+         use_reranker=False,  # No reranker for pure keyword search
+     )
+     return _to_response(result)
+
+
+ @app.get("/suggest")
+ def suggest(
+     prefix: str = Query(..., min_length=2, max_length=100),
+     limit: int = Query(10, ge=1, le=50),
+ ):
+     """Autocomplete suggestions (like Azure AI Search Suggest)."""
+     engine = _get_engine()
+     suggestions = engine.suggest(prefix, limit)
+     return {"prefix": prefix, "suggestions": suggestions}
+
+
+ @app.get("/facets")
+ def facets():
+     """Get available filter facets with counts."""
+     engine = _get_engine()
+     return engine.get_facets()
+
+
+ @app.get("/stats", response_model=StatsResponse)
+ def stats():
+     """Get index statistics."""
+     engine = _get_engine()
+     from search.vector_store import get_collection_stats
+     return StatsResponse(
+         bm25_chunks=engine.bm25.doc_count,
+         bm25_terms=len(engine.bm25.inverted_index),
+         vector_stats=get_collection_stats(),
+         facets=engine.get_facets(),
+     )
+
+
+ @app.post("/index/build")
+ def build_index():
+     """Trigger full index rebuild from collected data."""
+     from search.indexer import build_index as do_build
+     do_build(force_rebuild=True)
+     # Drop the cached engine so the next request reloads the fresh index
+     global _engine
+     _engine = None
+     return {"status": "ok", "message": "Index rebuilt successfully"}
+
+
+ def _to_response(result) -> SearchResponse:
+     """Convert internal SearchResponse to API SearchResponse."""
+     return SearchResponse(
+         query=result.query,
+         results=[
+             SearchResultResponse(
+                 chunk_id=r.chunk_id,
+                 doc_id=r.doc_id,
+                 title=r.title,
+                 text=r.text[:1000],  # Cap text length in the response
+                 score=r.score,
+                 source=r.source,
+                 url=r.url or "",
+                 category=r.category,
+                 highlights=r.highlights,
+                 vector_rank=r.vector_rank,
+                 bm25_rank=r.bm25_rank,
+                 rerank_score=r.rerank_score,
+             )
+             for r in result.results
+         ],
+         total_found=result.total_found,
+         latency_ms=result.latency_ms,
+         method=result.method,
+         facets=result.facets,
+     )
+
+
+ def start():
+     """Start the API server."""
+     import uvicorn
+     logging.basicConfig(level=logging.INFO,
+                         format="%(asctime)s [%(levelname)s] %(message)s")
+     uvicorn.run(app, host=API_HOST, port=API_PORT)
+
+
+ if __name__ == "__main__":
+     start()
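
For reference, a minimal client sketch against the endpoints above. The request fields mirror the `SearchRequest` usage in the handlers; the base URL assumes the `API_HOST`/`API_PORT` defaults from `search/config.py`, and the query text and `policy_wording` facet value are illustrative only:

```python
# Minimal client sketch (assumes the server was started via start() on the
# default port 8900; query and facet values are illustrative).
import requests

BASE = "http://localhost:8900"

# Hybrid search: vector + BM25 + cross-encoder reranking
resp = requests.post(f"{BASE}/search", json={
    "query": "subsidence exclusions in UK home insurance",
    "top_k": 5,
    "filter_category": "policy_wording",  # hypothetical facet value
    "use_reranker": True,
})
for hit in resp.json()["results"]:
    print(f"{hit['score']:.3f}  {hit['title']}  (bm25 rank: {hit['bm25_rank']})")

# Autocomplete and facet counts
print(requests.get(f"{BASE}/suggest", params={"prefix": "subsid"}).json())
print(requests.get(f"{BASE}/facets").json())
```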
search/bm25.py ADDED
@@ -0,0 +1,232 @@
+ """BM25 keyword search index with persistence."""
+
+ import logging
+ import math
+ import pickle
+ import re
+ from collections import Counter, defaultdict
+ from pathlib import Path
+ from typing import Optional
+
+ from search.config import BM25_DIR
+ from search.models import Chunk
+
+ logger = logging.getLogger(__name__)
+
+
+ class BM25Index:
+     """Okapi BM25 ranking implementation with persistence.
+
+     A from-scratch BM25 implementation with no external dependency,
+     comparable in ranking quality to rank_bm25 or Lucene's BM25, with:
+     - Configurable k1 and b parameters
+     - General and insurance-domain stop words
+     - Light stemming-style normalization via regex tokenization
+     - Persistent storage
+     """
+
+     def __init__(self, k1: float = 1.5, b: float = 0.75):
+         self.k1 = k1
+         self.b = b
+         self.doc_count = 0
+         self.avg_doc_len = 0.0
+         # chunk_id -> tokenized doc
+         self.docs: dict[str, list[str]] = {}
+         # chunk_id -> Chunk metadata
+         self.chunk_meta: dict[str, dict] = {}
+         # term -> set of chunk_ids
+         self.inverted_index: dict[str, set[str]] = defaultdict(set)
+         # term -> document frequency
+         self.df: dict[str, int] = defaultdict(int)
+         # chunk_id -> doc length (in tokens)
+         self.doc_lengths: dict[str, int] = {}
+
+     # ── Stop words (general + insurance-domain) ────────────────────
+     STOP_WORDS = {
+         "a", "an", "the", "is", "it", "in", "on", "at", "to", "for",
+         "of", "and", "or", "but", "not", "with", "by", "from", "as",
+         "be", "was", "were", "been", "being", "have", "has", "had",
+         "do", "does", "did", "will", "would", "could", "should",
+         "may", "might", "shall", "can", "this", "that", "these",
+         "those", "i", "you", "he", "she", "we", "they", "me",
+         "him", "her", "us", "them", "my", "your", "his", "its",
+         "our", "their", "what", "which", "who", "whom", "how",
+         "when", "where", "why", "all", "each", "every", "both",
+         "few", "more", "most", "other", "some", "such", "no",
+         "nor", "only", "own", "same", "so", "than", "too", "very",
+         "just", "also", "if", "then", "else", "about", "up", "out",
+         "any", "are", "into", "over", "after", "before", "between",
+     }
+
+     def _tokenize(self, text: str) -> list[str]:
+         """Tokenize text with light normalization."""
+         text = text.lower()
+         # Keep alphanumeric runs and internal hyphens (e.g. "co-insurance")
+         tokens = re.findall(r'[a-z0-9](?:[a-z0-9-]*[a-z0-9])?', text)
+         # Remove stop words and single-character tokens
+         return [t for t in tokens if t not in self.STOP_WORDS and len(t) > 1]
+
+     def add_chunk(self, chunk: Chunk):
+         """Add a single chunk to the BM25 index."""
+         tokens = self._tokenize(chunk.text)
+         if not tokens:
+             return
+
+         self.docs[chunk.chunk_id] = tokens
+         self.doc_lengths[chunk.chunk_id] = len(tokens)
+         self.chunk_meta[chunk.chunk_id] = {
+             "doc_id": chunk.doc_id,
+             "title": chunk.title,
+             "source": chunk.source,
+             "category": chunk.category,
+             "text": chunk.text[:500],
+         }
+
+         # Update inverted index and document frequencies
+         unique_terms = set(tokens)
+         for term in unique_terms:
+             self.inverted_index[term].add(chunk.chunk_id)
+             self.df[term] += 1
+
+         self.doc_count += 1
+         # Update average document length (running average)
+         self.avg_doc_len = (
+             (self.avg_doc_len * (self.doc_count - 1) + len(tokens))
+             / self.doc_count
+         )
+
+     def add_chunks(self, chunks: list[Chunk]):
+         """Add multiple chunks to the index."""
+         for chunk in chunks:
+             self.add_chunk(chunk)
+         logger.info(f"BM25 index: {self.doc_count} chunks, "
+                     f"{len(self.inverted_index)} unique terms")
+
+     def _idf(self, term: str) -> float:
+         """Compute inverse document frequency for a term."""
+         if term not in self.df:
+             return 0.0
+         n = self.doc_count
+         df = self.df[term]
+         return math.log((n - df + 0.5) / (df + 0.5) + 1.0)
+
+     def search(self, query: str, top_k: int = 10,
+                filter_source: Optional[str] = None,
+                filter_category: Optional[str] = None) -> list[tuple[str, float]]:
+         """Search the BM25 index. Returns a list of (chunk_id, score)."""
+         query_tokens = self._tokenize(query)
+         if not query_tokens:
+             return []
+
+         # Find candidate chunks (union of all query term posting lists)
+         candidates = set()
+         for token in query_tokens:
+             candidates |= self.inverted_index.get(token, set())
+
+         if not candidates:
+             return []
+
+         # Apply metadata filters
+         if filter_source:
+             candidates = {
+                 c for c in candidates
+                 if self.chunk_meta.get(c, {}).get("source") == filter_source
+             }
+         if filter_category:
+             candidates = {
+                 c for c in candidates
+                 if self.chunk_meta.get(c, {}).get("category") == filter_category
+             }
+
+         # Score each candidate with the Okapi BM25 formula
+         scores: list[tuple[str, float]] = []
+         for chunk_id in candidates:
+             score = 0.0
+             doc_tokens = self.docs[chunk_id]
+             doc_len = self.doc_lengths[chunk_id]
+             tf_counter = Counter(doc_tokens)
+
+             for term in query_tokens:
+                 if term not in tf_counter:
+                     continue
+                 tf = tf_counter[term]
+                 idf = self._idf(term)
+                 numerator = tf * (self.k1 + 1)
+                 denominator = tf + self.k1 * (
+                     1 - self.b + self.b * doc_len / max(self.avg_doc_len, 1)
+                 )
+                 score += idf * numerator / denominator
+
+             if score > 0:
+                 scores.append((chunk_id, score))
+
+         # Sort by score descending
+         scores.sort(key=lambda x: x[1], reverse=True)
+         return scores[:top_k]
+
+     def get_suggestions(self, prefix: str, limit: int = 10) -> list[str]:
+         """Get autocomplete suggestions based on indexed terms."""
+         prefix = prefix.lower().strip()
+         if len(prefix) < 2:
+             return []
+         matches = [
+             term for term in self.inverted_index
+             if term.startswith(prefix) and self.df[term] >= 2
+         ]
+         # Sort by document frequency (more common = better suggestion)
+         matches.sort(key=lambda t: self.df[t], reverse=True)
+         return matches[:limit]
+
+     def get_facets(self) -> dict[str, dict[str, int]]:
+         """Get facet counts for filtering."""
+         source_counts: dict[str, int] = defaultdict(int)
+         category_counts: dict[str, int] = defaultdict(int)
+
+         for meta in self.chunk_meta.values():
+             source_counts[meta.get("source", "unknown")] += 1
+             category_counts[meta.get("category", "unknown")] += 1
+
+         return {
+             "sources": dict(source_counts),
+             "categories": dict(category_counts),
+         }
+
+     def save(self, path: Path = BM25_DIR / "bm25_index.pkl"):
+         """Persist the BM25 index to disk."""
+         data = {
+             "k1": self.k1,
+             "b": self.b,
+             "doc_count": self.doc_count,
+             "avg_doc_len": self.avg_doc_len,
+             "docs": self.docs,
+             "chunk_meta": self.chunk_meta,
+             "inverted_index": {k: list(v) for k, v in self.inverted_index.items()},
+             "df": dict(self.df),
+             "doc_lengths": self.doc_lengths,
+         }
+         with open(path, "wb") as f:
+             pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
+         size_mb = path.stat().st_size / 1024 / 1024
+         logger.info(f"BM25 index saved: {path} ({size_mb:.1f} MB)")
+
+     def load(self, path: Path = BM25_DIR / "bm25_index.pkl") -> bool:
+         """Load BM25 index from disk. Returns True if the file existed."""
+         if not path.exists():
+             return False
+         with open(path, "rb") as f:
+             data = pickle.load(f)
+         self.k1 = data["k1"]
+         self.b = data["b"]
+         self.doc_count = data["doc_count"]
+         self.avg_doc_len = data["avg_doc_len"]
+         self.docs = data["docs"]
+         self.chunk_meta = data["chunk_meta"]
+         # Restore as defaultdict(set) so add_chunk keeps working after a load
+         self.inverted_index = defaultdict(
+             set, {k: set(v) for k, v in data["inverted_index"].items()}
+         )
+         self.df = defaultdict(int, data["df"])
+         self.doc_lengths = data["doc_lengths"]
+         logger.info(f"BM25 index loaded: {self.doc_count} chunks, "
+                     f"{len(self.inverted_index)} terms")
+         return True
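
For reference, `_idf` and the scoring loop in `search` above implement the standard Okapi BM25 formula:

```latex
\mathrm{score}(D, Q) = \sum_{t \in Q} \mathrm{IDF}(t) \cdot
  \frac{tf(t, D)\,(k_1 + 1)}
       {tf(t, D) + k_1 \left( 1 - b + b \cdot \frac{|D|}{\mathrm{avgdl}} \right)},
\qquad
\mathrm{IDF}(t) = \ln\left( \frac{N - df(t) + 0.5}{df(t) + 0.5} + 1 \right)
```

where `N` is the number of indexed chunks, `df(t)` the number of chunks containing term `t`, `|D|` the chunk length in tokens, and `avgdl` the running average chunk length. The defaults `k1 = 1.5` and `b = 0.75` match the constructor, and the `+ 1` inside the log keeps the IDF non-negative for terms appearing in more than half the chunks (the same variant Lucene uses).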
search/config.py ADDED
@@ -0,0 +1,55 @@
+ """Configuration for the InsureSearch engine."""
+
+ from pathlib import Path
+
+ # ── Paths ──────────────────────────────────────────────────────────
+ BASE_DIR = Path(__file__).resolve().parent.parent
+ SEARCH_DIR = BASE_DIR / "search"
+ INDEX_DIR = SEARCH_DIR / "index_data"
+ QDRANT_DIR = INDEX_DIR / "qdrant_storage"
+ BM25_DIR = INDEX_DIR / "bm25_storage"
+ METADATA_DB = INDEX_DIR / "metadata.db"
+
+ INDEX_DIR.mkdir(parents=True, exist_ok=True)
+ QDRANT_DIR.mkdir(parents=True, exist_ok=True)
+ BM25_DIR.mkdir(parents=True, exist_ok=True)
+
+ # ── Embedding model ────────────────────────────────────────────────
+ # BAAI/bge-small-en-v1.5: 33M params, 384-dim embeddings, fast, strong quality
+ EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
+ EMBEDDING_DIM = 384
+ EMBEDDING_BATCH_SIZE = 64
+
+ # ── Reranker model ─────────────────────────────────────────────────
+ # Cross-encoder for second-stage reranking
+ RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+ RERANKER_TOP_K = 20  # rerank the top K candidates
+
+ # ── Qdrant ─────────────────────────────────────────────────────────
+ QDRANT_COLLECTION = "insurance_docs"
+
+ # ── Search defaults ────────────────────────────────────────────────
+ DEFAULT_TOP_K = 10
+ MAX_TOP_K = 100
+ CHUNK_SIZE = 512    # tokens per chunk
+ CHUNK_OVERLAP = 64  # token overlap between consecutive chunks
+
+ # ── Hybrid search weights ──────────────────────────────────────────
+ # RRF (Reciprocal Rank Fusion) constant
+ RRF_K = 60
+
+ # Weights for combining vector vs. BM25 scores (0.0 = all BM25, 1.0 = all vector)
+ VECTOR_WEIGHT = 0.6
+ BM25_WEIGHT = 0.4
+
+ # ── API ────────────────────────────────────────────────────────────
+ API_HOST = "0.0.0.0"
+ API_PORT = 8900
+ API_TITLE = "InsureSearch"
+ API_VERSION = "1.0.0"
+ API_DESCRIPTION = (
+     "Open-source hybrid search engine for insurance documents. "
+     "Combines dense vector search (BGE) and sparse keyword search (BM25) "
+     "with cross-encoder reranking. Designed to match or exceed Azure AI "
+     "Search for the insurance domain."
+ )
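
The hybrid engine that consumes `RRF_K`, `VECTOR_WEIGHT`, and `BM25_WEIGHT` is defined elsewhere in the repo, but the constants imply a weighted Reciprocal Rank Fusion over the vector and BM25 rankings. A minimal sketch under that assumption (the function name and shape are illustrative, not the repo's actual API):

```python
# Illustrative weighted RRF fusion using the constants above; the repo's
# actual hybrid engine may combine the rankings differently.
from collections import defaultdict

RRF_K = 60
VECTOR_WEIGHT = 0.6
BM25_WEIGHT = 0.4


def rrf_fuse(vector_hits: list[str], bm25_hits: list[str],
             top_k: int = 10) -> list[tuple[str, float]]:
    """Fuse two ranked lists of chunk_ids: each list contributes
    weight / (RRF_K + rank) for every chunk it ranks."""
    fused: dict[str, float] = defaultdict(float)
    for rank, chunk_id in enumerate(vector_hits, start=1):
        fused[chunk_id] += VECTOR_WEIGHT / (RRF_K + rank)
    for rank, chunk_id in enumerate(bm25_hits, start=1):
        fused[chunk_id] += BM25_WEIGHT / (RRF_K + rank)
    return sorted(fused.items(), key=lambda kv: kv[1], reverse=True)[:top_k]


# Chunks ranked highly by both retrievers accumulate the largest fused score:
print(rrf_fuse(["a", "b", "c"], ["b", "d", "a"]))
```

Rank fusion like this sidesteps the problem that BM25 scores and cosine similarities live on incompatible scales: only the ranks matter, and `RRF_K = 60` damps the influence of any single top-ranked hit.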