Upload app.py
Browse files
app.py
CHANGED
|
@@ -630,31 +630,50 @@ Hinweis: Du musst eingeloggt sein und die Terms akzeptieren!"""
|
|
| 630 |
# Training mit Progress-Updates
|
| 631 |
import torch.optim as optim
|
| 632 |
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
progress(progress_value, desc=desc)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
|
| 640 |
# Starte Training
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
|
| 659 |
progress(0.95, desc="💾 Speichere Modell...")
|
| 660 |
|
|
@@ -662,34 +681,77 @@ Hinweis: Du musst eingeloggt sein und die Terms akzeptieren!"""
|
|
| 662 |
model_path = Path("/tmp/models")
|
| 663 |
model_path.mkdir(exist_ok=True)
|
| 664 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 665 |
# Finde bestes Modell
|
| 666 |
runs_dir = netlistify_dir / "runs" / "FormalDatasetWindowedLinePair"
|
| 667 |
-
if runs_dir.exists():
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
|
| 673 |
progress(1.0, desc="✅ Training abgeschlossen!")
|
| 674 |
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 693 |
|
| 694 |
except Exception as e:
|
| 695 |
import traceback
|
|
|
|
| 630 |
# Training mit Progress-Updates
|
| 631 |
import torch.optim as optim
|
| 632 |
|
| 633 |
+
# Tracking-Variablen für Training
|
| 634 |
+
training_completed = False
|
| 635 |
+
actual_epochs_completed = 0
|
| 636 |
+
training_error = None
|
| 637 |
+
|
| 638 |
+
def training_epoch_end_callback():
|
| 639 |
+
"""Callback der nach jeder Epoch aufgerufen wird."""
|
| 640 |
+
nonlocal actual_epochs_completed, training_completed
|
| 641 |
+
# Hole aktuelle Epoch aus Model-Objekt
|
| 642 |
+
current_epoch = getattr(model, 'ep', actual_epochs_completed)
|
| 643 |
+
actual_epochs_completed = current_epoch
|
| 644 |
+
progress_value = 0.7 + (current_epoch / epochs) * 0.25
|
| 645 |
+
desc = f"🔥 Epoch {current_epoch}/{epochs}"
|
| 646 |
progress(progress_value, desc=desc)
|
| 647 |
+
|
| 648 |
+
# Prüfe ob letzte Epoch erreicht wurde
|
| 649 |
+
if current_epoch >= epochs:
|
| 650 |
+
training_completed = True
|
| 651 |
|
| 652 |
# Starte Training
|
| 653 |
+
try:
|
| 654 |
+
model.fit(
|
| 655 |
+
network,
|
| 656 |
+
criterion,
|
| 657 |
+
optim.Adam(network.parameters(), lr=main_config.LEARNING_RATE),
|
| 658 |
+
epochs,
|
| 659 |
+
max_epochs=float("inf"),
|
| 660 |
+
pretrained_path=main_config.PRETRAINED_PATH,
|
| 661 |
+
keep=True,
|
| 662 |
+
backprop_freq=main_config.BATCH_STEP,
|
| 663 |
+
device_ids=main_config.DEVICE_IDS,
|
| 664 |
+
eval_metrics=eval_metrics,
|
| 665 |
+
keep_epoch=main_config.KEEP_EPOCH,
|
| 666 |
+
keep_optimizer=main_config.KEEP_OPTIMIZER,
|
| 667 |
+
config=None,
|
| 668 |
+
upload=False,
|
| 669 |
+
flush_cache_after_step=main_config.FLUSH_CACHE_AFTER_STEP,
|
| 670 |
+
training_epoch_end=training_epoch_end_callback,
|
| 671 |
+
)
|
| 672 |
+
training_completed = True
|
| 673 |
+
except Exception as e:
|
| 674 |
+
training_error = str(e)
|
| 675 |
+
import traceback
|
| 676 |
+
training_error += f"\n\n{traceback.format_exc()}"
|
| 677 |
|
| 678 |
progress(0.95, desc="💾 Speichere Modell...")
|
| 679 |
|
|
|
|
| 681 |
model_path = Path("/tmp/models")
|
| 682 |
model_path.mkdir(exist_ok=True)
|
| 683 |
|
| 684 |
+
# Prüfe ob Training erfolgreich war
|
| 685 |
+
model_saved = False
|
| 686 |
+
best_model_path = None
|
| 687 |
+
|
| 688 |
# Finde bestes Modell
|
| 689 |
runs_dir = netlistify_dir / "runs" / "FormalDatasetWindowedLinePair"
|
| 690 |
+
if runs_dir.exists() and runs_dir.is_dir():
|
| 691 |
+
try:
|
| 692 |
+
run_dirs = [d for d in runs_dir.iterdir() if d.is_dir()]
|
| 693 |
+
if run_dirs:
|
| 694 |
+
latest_run = max(run_dirs, key=lambda x: x.stat().st_mtime)
|
| 695 |
+
best_model = latest_run / "best_train.pth"
|
| 696 |
+
if best_model.exists():
|
| 697 |
+
best_model_path = model_path / "best_model.pth"
|
| 698 |
+
shutil.copy2(best_model, best_model_path)
|
| 699 |
+
model_saved = True
|
| 700 |
+
|
| 701 |
+
# Prüfe auch latest.pth
|
| 702 |
+
latest_model = latest_run / "latest.pth"
|
| 703 |
+
if latest_model.exists():
|
| 704 |
+
shutil.copy2(latest_model, model_path / "latest_model.pth")
|
| 705 |
+
except Exception as e:
|
| 706 |
+
pass
|
| 707 |
|
| 708 |
progress(1.0, desc="✅ Training abgeschlossen!")
|
| 709 |
|
| 710 |
+
# Erstelle Status-Report
|
| 711 |
+
status_lines = []
|
| 712 |
+
|
| 713 |
+
if training_error:
|
| 714 |
+
status_lines.append("❌ **Training mit Fehler beendet:**")
|
| 715 |
+
status_lines.append(f"```\n{training_error}\n```")
|
| 716 |
+
elif training_completed:
|
| 717 |
+
status_lines.append("✅ **Training erfolgreich abgeschlossen!**")
|
| 718 |
+
else:
|
| 719 |
+
status_lines.append("⚠️ **Training-Status unklar**")
|
| 720 |
+
|
| 721 |
+
status_lines.append("")
|
| 722 |
+
status_lines.append("📊 **Training-Details:**")
|
| 723 |
+
status_lines.append(f"- GPU: {gpu_name} ({gpu_memory:.1f} GB)")
|
| 724 |
+
status_lines.append(f"- Geplante Epochs: {epochs}")
|
| 725 |
+
status_lines.append(f"- Abgeschlossene Epochs: {actual_epochs_completed}")
|
| 726 |
+
status_lines.append(f"- Batch Size: {batch_size}")
|
| 727 |
+
status_lines.append(f"- Learning Rate: {learning_rate}")
|
| 728 |
+
status_lines.append(f"- Dataset-Größe: {len(img_files)} Bilder")
|
| 729 |
+
status_lines.append("")
|
| 730 |
+
|
| 731 |
+
if model_saved:
|
| 732 |
+
status_lines.append("💾 **Modell gespeichert:**")
|
| 733 |
+
status_lines.append(f"- Pfad: {model_path}")
|
| 734 |
+
status_lines.append(f"- Bestes Modell: best_model.pth")
|
| 735 |
+
if best_model_path and best_model_path.exists():
|
| 736 |
+
file_size = best_model_path.stat().st_size / (1024 * 1024) # MB
|
| 737 |
+
status_lines.append(f"- Dateigröße: {file_size:.2f} MB")
|
| 738 |
+
else:
|
| 739 |
+
status_lines.append("⚠️ **Modell nicht gefunden:**")
|
| 740 |
+
status_lines.append(f"- Erwarteter Pfad: {runs_dir}")
|
| 741 |
+
status_lines.append("- Prüfe Logs für Details")
|
| 742 |
+
|
| 743 |
+
status_lines.append("")
|
| 744 |
+
|
| 745 |
+
if training_completed and model_saved:
|
| 746 |
+
status_lines.append("📁 **Nächste Schritte:**")
|
| 747 |
+
status_lines.append("1. Lade das trainierte Modell herunter")
|
| 748 |
+
status_lines.append("2. Verwende es für Inference in deiner Anwendung")
|
| 749 |
+
elif not training_completed:
|
| 750 |
+
status_lines.append("⚠️ **Hinweis:** Training wurde möglicherweise nicht vollständig abgeschlossen.")
|
| 751 |
+
status_lines.append("- Prüfe die Logs für weitere Details")
|
| 752 |
+
status_lines.append("- Versuche Training erneut zu starten")
|
| 753 |
+
|
| 754 |
+
return "\n".join(status_lines)
|
| 755 |
|
| 756 |
except Exception as e:
|
| 757 |
import traceback
|