Nekochu committed on
Commit
560b5e0
·
1 Parent(s): e69e9ec

fix: use float32 not bfloat16 for CPU training (bf16 deadlocks on CPU)

Browse files
Files changed (1) hide show
  1. app.py +4 -4
app.py CHANGED
@@ -349,7 +349,7 @@ try:
349
  output_dir="{output_dir}/preprocessed_tensors",
350
  checkpoint_dir="{ACE_CHECKPOINT_DIR}",
351
  variant="turbo", max_duration=60.0,
352
- device="cpu", precision="bfloat16",
353
  )
354
  processed = result.get("processed", 0)
355
  failed = result.get("failed", 0)
@@ -366,8 +366,8 @@ try:
366
 
367
  model = load_decoder_for_training(
368
  checkpoint_dir="{ACE_CHECKPOINT_DIR}", variant="turbo",
369
- device="cpu", precision="bfloat16",
370
- ).bfloat16()
371
 
372
  trainer = FixedLoRATrainer(model,
373
  LoRAConfigV2(r={rank}, alpha={rank}, dropout=0.0),
@@ -376,7 +376,7 @@ try:
376
  dataset_dir="{output_dir}/preprocessed_tensors",
377
  output_dir="{output_dir}",
378
  max_epochs={epochs}, batch_size=1, learning_rate={lr},
379
- device="cpu", precision="bfloat16", seed=42,
380
  num_workers=0, pin_memory=False,
381
  ))
382
 
 
349
  output_dir="{output_dir}/preprocessed_tensors",
350
  checkpoint_dir="{ACE_CHECKPOINT_DIR}",
351
  variant="turbo", max_duration=60.0,
352
+ device="cpu", precision="float32",
353
  )
354
  processed = result.get("processed", 0)
355
  failed = result.get("failed", 0)
 
366
 
367
  model = load_decoder_for_training(
368
  checkpoint_dir="{ACE_CHECKPOINT_DIR}", variant="turbo",
369
+ device="cpu", precision="float32",
370
+ ).float()
371
 
372
  trainer = FixedLoRATrainer(model,
373
  LoRAConfigV2(r={rank}, alpha={rank}, dropout=0.0),
 
376
  dataset_dir="{output_dir}/preprocessed_tensors",
377
  output_dir="{output_dir}",
378
  max_epochs={epochs}, batch_size=1, learning_rate={lr},
379
+ device="cpu", precision="float32", seed=42,
380
  num_workers=0, pin_memory=False,
381
  ))
382