"""
InsureOS — Master Data Generation Orchestrator
Runs all synthetic data generators and produces the complete training dataset.
"""
|
|
| import time |
| import sys |
| from pathlib import Path |
|
|
| |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) |
|
|
| from data.gen_sft import generate_sft_dataset |
| from data.gen_dpo import generate_dpo_dataset |
| from data.gen_tabular import generate_tabular_dataset |
| from data.gen_documents import generate_document_dataset |
| from data.gen_ner import generate_ner_dataset |
|
|
|
|
| OUTPUT_DIR = "data/output" |
|
|
|
|
def main():
    """Run the full InsureOS synthetic data generation pipeline.

    Invokes the five dataset generators (SFT, DPO, tabular claims,
    document classification, NER) in sequence, writing their outputs
    under ``OUTPUT_DIR``, then prints a timing summary and a size
    listing of the generated files.

    Returns:
        None. All results are side effects (files on disk and stdout).
    """
    start = time.time()
    print("=" * 60)
    print(" InsureOS — Synthetic Data Generation Pipeline")
    print("=" * 60)

    # Create the output directory up front so each generator can write
    # to f"{OUTPUT_DIR}/..." without having to create it itself.
    output_path = Path(OUTPUT_DIR)
    output_path.mkdir(parents=True, exist_ok=True)

    print("\n[1/5] SFT Data (10K instruction-response pairs)")
    generate_sft_dataset(n=10000, output_path=f"{OUTPUT_DIR}/insurance_sft_10k.jsonl")

    print("\n[2/5] DPO Data (5K preference pairs)")
    generate_dpo_dataset(n=5000, output_path=f"{OUTPUT_DIR}/insurance_dpo_5k.jsonl")

    print("\n[3/5] Tabular Claims Data (50K records)")
    generate_tabular_dataset(n=50000, fraud_rate=0.08, output_dir=OUTPUT_DIR)

    print("\n[4/5] Document Classification Data (10K documents)")
    generate_document_dataset(n=10000, output_path=f"{OUTPUT_DIR}/insurance_docs_10k.jsonl")

    print("\n[5/5] NER Data (8K token-labelled examples)")
    generate_ner_dataset(n=8000, output_path=f"{OUTPUT_DIR}/insurance_ner_8k.jsonl")

    elapsed = time.time() - start
    print("\n" + "=" * 60)
    print(f" ✓ All data generated in {elapsed:.1f}s")
    print(f" Output directory: {OUTPUT_DIR}/")
    print("=" * 60)

    # Summarize what was produced. Skip subdirectories (the tabular
    # generator takes a directory and may create nested output) so
    # st_size is always a meaningful file size.
    if output_path.exists():
        print("\nGenerated files:")
        for f in sorted(output_path.iterdir()):
            if not f.is_file():
                continue
            size_mb = f.stat().st_size / (1024 * 1024)
            print(f" {f.name:50s} {size_mb:8.2f} MB")
|
|
|
|
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|