Nekochu committed on
Commit
e69e9ec
·
1 Parent(s): 88ca206

Redirect the training subprocess's stdout and stderr to log files (with unbuffered output) for debugging

Browse files
Files changed (1) hide show
  1. app.py +12 -7
app.py CHANGED
@@ -410,10 +410,11 @@ finally:
410
  with open(script_path, "w") as f:
411
  f.write(train_script)
412
 
 
413
  subprocess.Popen(
414
- ["python3", script_path],
415
- stdout=open("/dev/null", "w"),
416
- stderr=open("/dev/null", "w"),
417
  start_new_session=True,
418
  )
419
 
@@ -423,10 +424,14 @@ finally:
423
  f"Inference will be unavailable until training completes (ace-server stopped).")
424
 
425
  def check_train_log():
426
- if not os.path.exists(TRAIN_LOG):
427
- return "No training log found."
428
- with open(TRAIN_LOG) as f:
429
- return f.read() or "Log is empty."
 
 
 
 
430
 
431
  # -- Build UI --
432
  CSS = """
 
410
  with open(script_path, "w") as f:
411
  f.write(train_script)
412
 
413
+ train_stderr = os.path.join(output_dir, "train_stderr.log")
414
  subprocess.Popen(
415
+ ["python3", "-u", script_path],
416
+ stdout=open(TRAIN_LOG, "a"),
417
+ stderr=open(train_stderr, "w"),
418
  start_new_session=True,
419
  )
420
 
 
424
  f"Inference will be unavailable until training completes (ace-server stopped).")
425
 
426
  def check_train_log():
427
+ parts = []
428
+ if os.path.exists(TRAIN_LOG):
429
+ parts.append(open(TRAIN_LOG).read())
430
+ stderr_log = os.path.join(ADAPTER_DIR, "test-lora", "train_stderr.log")
431
+ if os.path.exists(stderr_log) and os.path.getsize(stderr_log) > 0:
432
+ stderr = open(stderr_log).read()[-2000:]
433
+ parts.append(f"\n--- stderr (last 2000 chars) ---\n{stderr}")
434
+ return "\n".join(parts) if parts else "No training log found."
435
 
436
  # -- Build UI --
437
  CSS = """