# insureos-models / data / generate_all.py
# Uploaded by piyushptiwari via huggingface_hub (commit 2cc32a5, verified)
"""
InsureOS — Master Data Generation Orchestrator
Runs all synthetic data generators and produces the complete training dataset.
"""
import time
import sys
from pathlib import Path
# Ensure project root on path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from data.gen_sft import generate_sft_dataset
from data.gen_dpo import generate_dpo_dataset
from data.gen_tabular import generate_tabular_dataset
from data.gen_documents import generate_document_dataset
from data.gen_ner import generate_ner_dataset
# Destination directory (relative to the working directory) for every
# generated dataset file produced by the pipeline below.
OUTPUT_DIR = "data/output"
def main(output_dir: str = OUTPUT_DIR) -> None:
    """Run all five synthetic data generators and report timing and outputs.

    Args:
        output_dir: Directory that receives the generated dataset files.
            Defaults to the module-level ``OUTPUT_DIR``.
    """
    start = time.time()
    print("=" * 60)
    print(" InsureOS — Synthetic Data Generation Pipeline")
    print("=" * 60)

    # Create the output directory up front so the generators can write into
    # it whether or not they handle directory creation themselves.
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # 1. SFT instruction-response pairs
    print("\n[1/5] SFT Data (10K instruction-response pairs)")
    generate_sft_dataset(n=10000, output_path=f"{output_dir}/insurance_sft_10k.jsonl")

    # 2. DPO preference pairs
    print("\n[2/5] DPO Data (5K preference pairs)")
    generate_dpo_dataset(n=5000, output_path=f"{output_dir}/insurance_dpo_5k.jsonl")

    # 3. Tabular claims data
    print("\n[3/5] Tabular Claims Data (50K records)")
    generate_tabular_dataset(n=50000, fraud_rate=0.08, output_dir=output_dir)

    # 4. Document classification
    print("\n[4/5] Document Classification Data (10K documents)")
    generate_document_dataset(n=10000, output_path=f"{output_dir}/insurance_docs_10k.jsonl")

    # 5. NER data
    print("\n[5/5] NER Data (8K token-labelled examples)")
    generate_ner_dataset(n=8000, output_path=f"{output_dir}/insurance_ner_8k.jsonl")

    elapsed = time.time() - start
    print("\n" + "=" * 60)
    print(f" ✓ All data generated in {elapsed:.1f}s")
    print(f" Output directory: {output_dir}/")
    print("=" * 60)

    # List generated files with their sizes. Skip subdirectories: st_size
    # for a directory is not a meaningful payload size.
    if out.exists():
        print("\nGenerated files:")
        for f in sorted(out.iterdir()):
            if not f.is_file():
                continue
            size_mb = f.stat().st_size / (1024 * 1024)
            print(f" {f.name:50s} {size_mb:8.2f} MB")
# Script entry point: run the full generation pipeline when executed
# directly (not when imported as a module).
if __name__ == "__main__":
    main()