Spaces:

ShynBui
/

train_for_fun

Sleeping

App Files Files Community

ShynBui committed on Sep 11, 2024

Commit

a913549

verified ·

1 Parent(s): 4f12561

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -14

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import gradio as gr
 import pandas as pd
 import os
 import spaces
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(device)
@@ -23,7 +24,7 @@ def load_data(file):
     global global_data
     df = pd.read_csv(file)
     inputs = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors="pt")  # Mã hóa văn bản
-    labels = torch.tensor(df['lable'].tolist()).long()  #
     global_data = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
     print(global_data)
@@ -33,7 +34,7 @@ def get_dataloader(start, end, batch_size=8):
     subset = torch.utils.data.Subset(global_data, range(start, end))
     return DataLoader(subset, batch_size=batch_size)
-@spaces.GPU(duration=120)
 def train_batch(dataloader):
     model.train()
     start_time = time.time()
@@ -49,41 +50,45 @@ def train_batch(dataloader):
         optimizer.step()
         elapsed_time = time.time() - start_time
-        if elapsed_time > 10:  # Dừng trước 60 giây để lưu checkpoint
-            print("save checkpoint")
             torch.save(model.state_dict(), "./checkpoint/model.pt")
             return False, "Checkpoint saved. Training paused."
     return True, "Batch training completed."
 def train_step(file=None):
     if file:
         load_data(file)
     start_idx = 0
     batch_size = 8
     total_samples = len(global_data)
     while start_idx < total_samples:
-        print(start_idx)
-        end_idx = min(start_idx + (batch_size * 10), total_samples)  # Chia nhỏ dữ liệu để xử lý nhanh
         dataloader = get_dataloader(start_idx, end_idx, batch_size)
-        start_time = time.time()
-        success, message = train_batch(dataloader)
-        elapsed_time = time.time() - start_time
-        if elapsed_time >= 10:  # Kết thúc trước khi hết 60 giây để lưu checkpoint
-            torch.save(model.state_dict(), "./checkpoint/model.pt")
-            return f"{message}. Training paused after {elapsed_time:.2f}s."
         start_idx = end_idx
     torch.save(model.state_dict(), "./checkpoint/model.pt")
     return "Training completed and model saved."
 if __name__ == "__main__":
     iface = gr.Interface(
         fn=train_step,

 import pandas as pd
 import os
 import spaces
+from spaces.zero.gradio import HTMLError
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(device)
     global global_data
     df = pd.read_csv(file)
     inputs = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors="pt")  # Mã hóa văn bản
+    labels = torch.tensor(df['label'].tolist()).long()  # Đảm bảo tên cột là 'label'
     global_data = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
     print(global_data)
     subset = torch.utils.data.Subset(global_data, range(start, end))
     return DataLoader(subset, batch_size=batch_size)
+@spaces.GPU(duration=5)
 def train_batch(dataloader):
     model.train()
     start_time = time.time()
         optimizer.step()
         elapsed_time = time.time() - start_time
+        if elapsed_time > 50:  # Dừng trước 59 giây để đảm bảo không vượt hạn ngạch
             torch.save(model.state_dict(), "./checkpoint/model.pt")
             return False, "Checkpoint saved. Training paused."
     return True, "Batch training completed."
 def train_step(file=None):
     """Train over the loaded dataset in chunks and save the model when done.

     If ``file`` is given, it is first passed to ``load_data`` to (re)build the
     module-level ``global_data`` dataset; otherwise whatever ``global_data``
     currently holds is used.

     The dataset is walked in windows of ``batch_size * 10`` samples. Each
     window is handed to ``train_batch``. When ``train_batch`` reports that it
     paused (``success`` is false), its message is returned immediately. When
     the call raises ``HTMLError`` (treated here as a GPU-quota error), the
     same window is retried after a 10-second pause.

     Args:
         file: Optional CSV path/file object forwarded to ``load_data``.

     Returns:
         A status string: either the pause message from ``train_batch`` or
         ``"Training completed and model saved."``.
     """
     if file:
         load_data(file)
     print(global_data)
     start_idx = 0
     batch_size = 8
     total_samples = len(global_data)
     counting = 0
     while start_idx < total_samples:
         print("Step:", counting)
         # Progress is processed/total. The loop condition guarantees
         # total_samples > 0 here, so this never divides by zero (the previous
         # total/start form crashed on the very first iteration).
         print("Percent:", start_idx / total_samples * 100, "%")
         counting += 1
         end_idx = min(start_idx + (batch_size * 10), total_samples)  # 10 batches per loop
         dataloader = get_dataloader(start_idx, end_idx, batch_size)
         try:
             success, message = train_batch(dataloader)
         except HTMLError:
             # NOTE(review): this retries the same window indefinitely while
             # the error persists; confirm that an unbounded retry is intended.
             print("Exceeded GPU quota, retrying in 10 seconds...")
             time.sleep(10)
             continue
         if not success:
             return message
         start_idx = end_idx
         time.sleep(2)  # Rest 2 seconds between training sessions
     torch.save(model.state_dict(), "./checkpoint/model.pt")
     return "Training completed and model saved."
 if __name__ == "__main__":
     iface = gr.Interface(
         fn=train_step,