"""Build and push Nomen-AI SFT/DPO datasets.""" import argparse, random, re from datasets import Dataset, DatasetDict from nomen_ai.control import ROOT_FAMILIES, THEMES, ControlVector, SYSTEM_PROMPT from nomen_ai.synth import make_example from nomen_ai.antidup import AntiDuplicationMatrix from nomen_ai.phonetics import count_syllables GENERIC_SUFFIXES=["ify","ly","ster","hub","ged","io","zy","able"] GENERIC_PREFIXES=["Tech","Smart","Get","My","Go","The","Best","Pro"] GENERIC_CORES=["brand","name","tube","vlog","media","studio","world","zone"] def quality_ok(name): return 3<=len(name)<=13 and re.search(r"[aeiouy]",name.lower()) and count_syllables(name)<=4 and not re.search(r"(.)\1\1",name.lower()) def derivative_name(rng): x=rng.random() if x<0.4: return rng.choice(GENERIC_PREFIXES)+rng.choice(GENERIC_CORES).capitalize() if x<0.7: return rng.choice(["Stream","Click","Pixel","Brand","Creat","Snap","Vibe"])+rng.choice(GENERIC_SUFFIXES) return rng.choice(GENERIC_PREFIXES)+rng.choice(GENERIC_SUFFIXES).capitalize() def sample_cv(rng): k=rng.choices([1,2,3],weights=[5,4,1])[0] return rng.sample(ROOT_FAMILIES,k), rng.choice(THEMES), rng.choice([0.1,0.3,0.5,0.7,0.9]) def build(n_sft,n_dpo,seed=42): rng=random.Random(seed); anti=AntiDuplicationMatrix(min_novelty=0.7); sft=[]; dpo=[] while len(sft)', sft[0]['messages'][2]['content']); print('dpo', dpo[0]['chosen'][0]['content'], 'vs', dpo[0]['rejected'][0]['content']) if args.push: sft_ds.push_to_hub(args.sft_repo); dpo_ds.push_to_hub(args.dpo_repo) if __name__=='__main__': main()