luohoa97 committed on
Commit
be03d5f
·
verified ·
1 Parent(s): b93d3aa

Deploy BitNet-Transformer Trainer

Browse files
Files changed (1) hide show
  1. scripts/train_ai_model.py +25 -11
scripts/train_ai_model.py CHANGED
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
28
 
29
  # Hyperparameters
30
  EPOCHS = 100
31
- BATCH_SIZE = 64 # Reduced for Transformer memory
32
  LR = 0.0003
33
  HIDDEN_DIM = 512
34
  LAYERS = 8
@@ -42,6 +42,11 @@ HF_TOKEN = os.getenv("HF_TOKEN")
42
  def train():
43
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
44
  logger.info(f"Using device: {device}")
 
 
 
 
 
45
 
46
  # 1. Load Dataset
47
  if not os.path.exists("data/trading_dataset.pt"):
@@ -65,8 +70,8 @@ def train():
65
  val_size = len(dataset) - train_size
66
  train_ds, val_ds = random_split(dataset, [train_size, val_size])
67
 
68
- train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
69
- val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
70
 
71
  # 3. Create Model
72
  input_dim = X.shape[2]
@@ -93,14 +98,22 @@ def train():
93
  for batch_X, batch_y in train_loader:
94
  batch_X, batch_y = batch_X.to(device), batch_y.to(device)
95
  optimizer.zero_grad()
96
- outputs = model(batch_X)
97
- loss = criterion(outputs, batch_y)
98
- loss.backward()
99
 
100
- # Gradient clipping for stability with quantized weights
101
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
 
 
102
 
103
- optimizer.step()
 
 
 
 
 
 
 
 
 
104
 
105
  train_loss += loss.item()
106
  _, predicted = outputs.max(1)
@@ -115,8 +128,9 @@ def train():
115
  with torch.no_grad():
116
  for batch_X, batch_y in val_loader:
117
  batch_X, batch_y = batch_X.to(device), batch_y.to(device)
118
- outputs = model(batch_X)
119
- loss = criterion(outputs, batch_y)
 
120
  val_loss += loss.item()
121
  _, predicted = outputs.max(1)
122
  val_total += batch_y.size(0)
 
28
 
29
  # Hyperparameters
30
  EPOCHS = 100
31
+ BATCH_SIZE = 128 # Higher for T4 GPU
32
  LR = 0.0003
33
  HIDDEN_DIM = 512
34
  LAYERS = 8
 
42
  def train():
43
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
44
  logger.info(f"Using device: {device}")
45
+
46
+ # Use BFloat16 if supported (Ampere+ GPUs like A100/H100), otherwise FP16
47
+ use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
48
+ dtype = torch.bfloat16 if use_bf16 else torch.float16
49
+ scaler = torch.cuda.amp.GradScaler(enabled=(not use_bf16)) # Scaler only needed for FP16
50
 
51
  # 1. Load Dataset
52
  if not os.path.exists("data/trading_dataset.pt"):
 
70
  val_size = len(dataset) - train_size
71
  train_ds, val_ds = random_split(dataset, [train_size, val_size])
72
 
73
+ train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
74
+ val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, pin_memory=True)
75
 
76
  # 3. Create Model
77
  input_dim = X.shape[2]
 
98
  for batch_X, batch_y in train_loader:
99
  batch_X, batch_y = batch_X.to(device), batch_y.to(device)
100
  optimizer.zero_grad()
 
 
 
101
 
102
+ # Using Mixed Precision (AMP)
103
+ with torch.cuda.amp.autocast(dtype=dtype):
104
+ outputs = model(batch_X)
105
+ loss = criterion(outputs, batch_y)
106
 
107
+ if not use_bf16:
108
+ scaler.scale(loss).backward()
109
+ scaler.unscale_(optimizer)
110
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
111
+ scaler.step(optimizer)
112
+ scaler.update()
113
+ else:
114
+ loss.backward()
115
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
116
+ optimizer.step()
117
 
118
  train_loss += loss.item()
119
  _, predicted = outputs.max(1)
 
128
  with torch.no_grad():
129
  for batch_X, batch_y in val_loader:
130
  batch_X, batch_y = batch_X.to(device), batch_y.to(device)
131
+ with torch.cuda.amp.autocast(dtype=dtype):
132
+ outputs = model(batch_X)
133
+ loss = criterion(outputs, batch_y)
134
  val_loss += loss.item()
135
  _, predicted = outputs.max(1)
136
  val_total += batch_y.size(0)