Dmgautomata committed on
Commit
e168122
·
verified ·
1 Parent(s): ed597d0

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -45
app.py CHANGED
@@ -630,31 +630,50 @@ Hinweis: Du musst eingeloggt sein und die Terms akzeptieren!"""
630
  # Training mit Progress-Updates
631
  import torch.optim as optim
632
 
633
- def training_callback(epoch, total_epochs, loss=None):
634
- progress_value = 0.7 + (epoch / total_epochs) * 0.25
635
- desc = f"Epoch {epoch+1}/{total_epochs}"
636
- if loss is not None:
637
- desc += f" - Loss: {loss:.4f}"
 
 
 
 
 
 
 
 
638
  progress(progress_value, desc=desc)
 
 
 
 
639
 
640
  # Starte Training
641
- model.fit(
642
- network,
643
- criterion,
644
- optim.Adam(network.parameters(), lr=main_config.LEARNING_RATE),
645
- main_config.EPOCHS,
646
- max_epochs=float("inf"),
647
- pretrained_path=main_config.PRETRAINED_PATH,
648
- keep=True,
649
- backprop_freq=main_config.BATCH_STEP,
650
- device_ids=main_config.DEVICE_IDS,
651
- eval_metrics=eval_metrics,
652
- keep_epoch=main_config.KEEP_EPOCH,
653
- keep_optimizer=main_config.KEEP_OPTIMIZER,
654
- config=None,
655
- upload=False,
656
- flush_cache_after_step=main_config.FLUSH_CACHE_AFTER_STEP,
657
- )
 
 
 
 
 
 
 
658
 
659
  progress(0.95, desc="💾 Speichere Modell...")
660
 
@@ -662,34 +681,77 @@ Hinweis: Du musst eingeloggt sein und die Terms akzeptieren!"""
662
  model_path = Path("/tmp/models")
663
  model_path.mkdir(exist_ok=True)
664
 
 
 
 
 
665
  # Finde bestes Modell
666
  runs_dir = netlistify_dir / "runs" / "FormalDatasetWindowedLinePair"
667
- if runs_dir.exists():
668
- latest_run = max(runs_dir.iterdir(), key=lambda x: x.stat().st_mtime)
669
- best_model = latest_run / "best_train.pth"
670
- if best_model.exists():
671
- shutil.copy2(best_model, model_path / "best_model.pth")
 
 
 
 
 
 
 
 
 
 
 
 
672
 
673
  progress(1.0, desc="✅ Training abgeschlossen!")
674
 
675
- return f"""
676
- Training erfolgreich abgeschlossen!
677
-
678
- 📊 **Training-Details:**
679
- - GPU: {gpu_name} ({gpu_memory:.1f} GB)
680
- - Epochs: {epochs}
681
- - Batch Size: {batch_size}
682
- - Learning Rate: {learning_rate}
683
- - Dataset-Größe: {len(img_files)} Bilder
684
-
685
- 💾 **Modell gespeichert:**
686
- - Pfad: {model_path}
687
- - Bestes Modell: best_model.pth
688
-
689
- 📁 **Nächste Schritte:**
690
- 1. Lade das trainierte Modell herunter
691
- 2. Verwende es für Inference in deiner Anwendung
692
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
693
 
694
  except Exception as e:
695
  import traceback
 
630
  # Training mit Progress-Updates
631
  import torch.optim as optim
632
 
633
+ # Tracking-Variablen für Training
634
+ training_completed = False
635
+ actual_epochs_completed = 0
636
+ training_error = None
637
+
638
+ def training_epoch_end_callback():
639
+ """Callback der nach jeder Epoch aufgerufen wird."""
640
+ nonlocal actual_epochs_completed, training_completed
641
+ # Hole aktuelle Epoch aus Model-Objekt
642
+ current_epoch = getattr(model, 'ep', actual_epochs_completed)
643
+ actual_epochs_completed = current_epoch
644
+ progress_value = 0.7 + (current_epoch / epochs) * 0.25
645
+ desc = f"🔥 Epoch {current_epoch}/{epochs}"
646
  progress(progress_value, desc=desc)
647
+
648
+ # Prüfe ob letzte Epoch erreicht wurde
649
+ if current_epoch >= epochs:
650
+ training_completed = True
651
 
652
  # Starte Training
653
+ try:
654
+ model.fit(
655
+ network,
656
+ criterion,
657
+ optim.Adam(network.parameters(), lr=main_config.LEARNING_RATE),
658
+ epochs,
659
+ max_epochs=float("inf"),
660
+ pretrained_path=main_config.PRETRAINED_PATH,
661
+ keep=True,
662
+ backprop_freq=main_config.BATCH_STEP,
663
+ device_ids=main_config.DEVICE_IDS,
664
+ eval_metrics=eval_metrics,
665
+ keep_epoch=main_config.KEEP_EPOCH,
666
+ keep_optimizer=main_config.KEEP_OPTIMIZER,
667
+ config=None,
668
+ upload=False,
669
+ flush_cache_after_step=main_config.FLUSH_CACHE_AFTER_STEP,
670
+ training_epoch_end=training_epoch_end_callback,
671
+ )
672
+ training_completed = True
673
+ except Exception as e:
674
+ training_error = str(e)
675
+ import traceback
676
+ training_error += f"\n\n{traceback.format_exc()}"
677
 
678
  progress(0.95, desc="💾 Speichere Modell...")
679
 
 
681
  model_path = Path("/tmp/models")
682
  model_path.mkdir(exist_ok=True)
683
 
684
+ # Prüfe ob Training erfolgreich war
685
+ model_saved = False
686
+ best_model_path = None
687
+
688
  # Finde bestes Modell
689
  runs_dir = netlistify_dir / "runs" / "FormalDatasetWindowedLinePair"
690
+ if runs_dir.exists() and runs_dir.is_dir():
691
+ try:
692
+ run_dirs = [d for d in runs_dir.iterdir() if d.is_dir()]
693
+ if run_dirs:
694
+ latest_run = max(run_dirs, key=lambda x: x.stat().st_mtime)
695
+ best_model = latest_run / "best_train.pth"
696
+ if best_model.exists():
697
+ best_model_path = model_path / "best_model.pth"
698
+ shutil.copy2(best_model, best_model_path)
699
+ model_saved = True
700
+
701
+ # Prüfe auch latest.pth
702
+ latest_model = latest_run / "latest.pth"
703
+ if latest_model.exists():
704
+ shutil.copy2(latest_model, model_path / "latest_model.pth")
705
+ except Exception as e:
706
+ pass
707
 
708
  progress(1.0, desc="✅ Training abgeschlossen!")
709
 
710
+ # Erstelle Status-Report
711
+ status_lines = []
712
+
713
+ if training_error:
714
+ status_lines.append("❌ **Training mit Fehler beendet:**")
715
+ status_lines.append(f"```\n{training_error}\n```")
716
+ elif training_completed:
717
+ status_lines.append("✅ **Training erfolgreich abgeschlossen!**")
718
+ else:
719
+ status_lines.append("⚠️ **Training-Status unklar**")
720
+
721
+ status_lines.append("")
722
+ status_lines.append("📊 **Training-Details:**")
723
+ status_lines.append(f"- GPU: {gpu_name} ({gpu_memory:.1f} GB)")
724
+ status_lines.append(f"- Geplante Epochs: {epochs}")
725
+ status_lines.append(f"- Abgeschlossene Epochs: {actual_epochs_completed}")
726
+ status_lines.append(f"- Batch Size: {batch_size}")
727
+ status_lines.append(f"- Learning Rate: {learning_rate}")
728
+ status_lines.append(f"- Dataset-Größe: {len(img_files)} Bilder")
729
+ status_lines.append("")
730
+
731
+ if model_saved:
732
+ status_lines.append("💾 **Modell gespeichert:**")
733
+ status_lines.append(f"- Pfad: {model_path}")
734
+ status_lines.append(f"- Bestes Modell: best_model.pth")
735
+ if best_model_path and best_model_path.exists():
736
+ file_size = best_model_path.stat().st_size / (1024 * 1024) # MB
737
+ status_lines.append(f"- Dateigröße: {file_size:.2f} MB")
738
+ else:
739
+ status_lines.append("⚠️ **Modell nicht gefunden:**")
740
+ status_lines.append(f"- Erwarteter Pfad: {runs_dir}")
741
+ status_lines.append("- Prüfe Logs für Details")
742
+
743
+ status_lines.append("")
744
+
745
+ if training_completed and model_saved:
746
+ status_lines.append("📁 **Nächste Schritte:**")
747
+ status_lines.append("1. Lade das trainierte Modell herunter")
748
+ status_lines.append("2. Verwende es für Inference in deiner Anwendung")
749
+ elif not training_completed:
750
+ status_lines.append("⚠️ **Hinweis:** Training wurde möglicherweise nicht vollständig abgeschlossen.")
751
+ status_lines.append("- Prüfe die Logs für weitere Details")
752
+ status_lines.append("- Versuche Training erneut zu starten")
753
+
754
+ return "\n".join(status_lines)
755
 
756
  except Exception as e:
757
  import traceback