Spanicin committed on
Commit
476d0fb
Β·
verified Β·
1 Parent(s): 0a1421d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -11
app.py CHANGED
@@ -582,12 +582,20 @@ def train_model(data_path, epochs, batch_size, learning_rate, image_size, save_n
582
  global MODEL, TEXT_ENCODER, DIFFUSION, DEVICE, CONFIG
583
 
584
  try:
 
 
 
 
 
 
585
  # Setup
586
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
587
  CONFIG = {
588
- "base_channels": 64,
589
- "channel_mults": (1, 2, 4),
590
- "context_dim": 256,
591
  "image_size": image_size,
592
  "timesteps": 1000
593
  }
@@ -623,13 +631,14 @@ def train_model(data_path, epochs, batch_size, learning_rate, image_size, save_n
623
  logs = [f"πŸš€ Training started on {DEVICE}"]
624
  logs.append(f"πŸ“Š Model parameters: {num_params:,}")
625
  logs.append(f"πŸ“ Training samples: {len(train_dataset)}")
 
 
626
  logs.append("-" * 40)
627
 
628
- total_steps = epochs * len(train_loader)
629
- current_step = 0
630
-
631
- for epoch in range(epochs):
632
  epoch_loss = 0
 
 
633
  for images, texts in train_loader:
634
  images = images.to(DEVICE)
635
  context = TEXT_ENCODER(texts, DEVICE)
@@ -641,10 +650,28 @@ def train_model(data_path, epochs, batch_size, learning_rate, image_size, save_n
641
  optimizer.step()
642
 
643
  epoch_loss += loss.item()
644
- current_step += 1
 
 
 
 
 
 
 
 
645
 
646
- avg_loss = epoch_loss / len(train_loader)
647
- logs.append(f"Epoch {epoch+1}/{epochs}: loss = {avg_loss:.4f}")
 
 
 
 
 
 
 
 
 
 
648
 
649
  # Save model
650
  MODEL.eval()
@@ -662,7 +689,8 @@ def train_model(data_path, epochs, batch_size, learning_rate, image_size, save_n
662
  return "\n".join(logs)
663
 
664
  except Exception as e:
665
- return f"❌ Training failed: {str(e)}"
 
666
 
667
 
668
  def load_checkpoint(checkpoint_file):
 
582
  global MODEL, TEXT_ENCODER, DIFFUSION, DEVICE, CONFIG
583
 
584
  try:
585
+ # Clear GPU memory
586
+ import gc
587
+ gc.collect()
588
+ if torch.cuda.is_available():
589
+ torch.cuda.empty_cache()
590
+
591
  # Setup
592
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
593
+
594
+ # Use smaller model for T4 GPU
595
  CONFIG = {
596
+ "base_channels": 48, # Reduced from 64
597
+ "channel_mults": (1, 2, 4), # Keep same
598
+ "context_dim": 192, # Reduced from 256
599
  "image_size": image_size,
600
  "timesteps": 1000
601
  }
 
631
  logs = [f"πŸš€ Training started on {DEVICE}"]
632
  logs.append(f"πŸ“Š Model parameters: {num_params:,}")
633
  logs.append(f"πŸ“ Training samples: {len(train_dataset)}")
634
+ logs.append(f"πŸ–ΌοΈ Image size: {image_size}x{image_size}")
635
+ logs.append(f"πŸ“¦ Batch size: {batch_size}")
636
  logs.append("-" * 40)
637
 
638
+ for epoch in range(int(epochs)):
 
 
 
639
  epoch_loss = 0
640
+ batch_count = 0
641
+
642
  for images, texts in train_loader:
643
  images = images.to(DEVICE)
644
  context = TEXT_ENCODER(texts, DEVICE)
 
650
  optimizer.step()
651
 
652
  epoch_loss += loss.item()
653
+ batch_count += 1
654
+
655
+ # Clear cache periodically
656
+ if batch_count % 50 == 0:
657
+ if torch.cuda.is_available():
658
+ torch.cuda.empty_cache()
659
+
660
+ avg_loss = epoch_loss / max(batch_count, 1)
661
+ logs.append(f"Epoch {epoch+1}/{int(epochs)}: loss = {avg_loss:.4f}")
662
 
663
+ # Save checkpoint every 10 epochs
664
+ if (epoch + 1) % 10 == 0:
665
+ print(f"Epoch {epoch+1}: loss = {avg_loss:.4f}")
666
+ checkpoint_path = f"checkpoints/{save_name}_epoch{epoch+1}.pt"
667
+ os.makedirs("checkpoints", exist_ok=True)
668
+ torch.save({
669
+ "model_state_dict": MODEL.state_dict(),
670
+ "text_encoder_state_dict": TEXT_ENCODER.state_dict(),
671
+ "config": CONFIG,
672
+ "epoch": epoch + 1
673
+ }, checkpoint_path)
674
+ logs.append(f"πŸ’Ύ Checkpoint saved: {checkpoint_path}")
675
 
676
  # Save model
677
  MODEL.eval()
 
689
  return "\n".join(logs)
690
 
691
  except Exception as e:
692
+ import traceback
693
+ return f"❌ Training failed: {str(e)}\n{traceback.format_exc()}"
694
 
695
 
696
  def load_checkpoint(checkpoint_file):