import math
import os
import sys

import torch
from tqdm import tqdm

from .config import *
from .model import GPTLanguageModel

# Make the sibling DataLoader package importable from this file's parent directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..")
from DataLoader import create_dataloader

# Gradient accumulation: simulate a large effective batch (measured in tokens)
# by accumulating gradients over several micro-batches before each optimizer step.
total_batch_size = 524288  # desired effective batch size, in tokens
assert total_batch_size % (BATCH_SIZE * BLOCK_SIZE) == 0, \
    "make sure total_batch_size is divisible by BATCH_SIZE * BLOCK_SIZE"
grad_accumulation_steps = total_batch_size // (BATCH_SIZE * BLOCK_SIZE)
print(f"grad_accumulation_steps: {grad_accumulation_steps}")
print(f"total_batch_size: {total_batch_size}")

# Learning-rate schedule: linear warmup to max_lr, then cosine decay to min_lr.
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 715
max_steps = 19073

def get_lr(it):
    # Linear warmup from ~0 to max_lr over the first warmup_steps steps.
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    # Past max_steps, hold at the floor.
    if it > max_steps:
        return min_lr
    # In between, cosine-decay from max_lr down to min_lr.
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)
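
# Optional sanity checks on the schedule. These follow directly from the
# constants above, cost nothing at import time, and fail fast if the schedule
# is ever edited inconsistently; they are safe to delete.
assert abs(get_lr(warmup_steps - 1) - max_lr) < 1e-12  # warmup ends exactly at max_lr
assert get_lr(max_steps + 1) == min_lr                 # LR floors out past max_steps
assert get_lr(0) < get_lr(warmup_steps // 2) < max_lr  # warmup increases monotonically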
def train(folder_path, tokenizer, model=None, optimizer=None, vocab_size=10000,
          platform='none', checkpoint=None, is_tokenized_data=False):
    # Allow TF32 matmuls on Ampere+ GPUs: a large speedup at negligible precision cost.
    torch.set_float32_matmul_precision('high')

    if model is None:
        model = GPTLanguageModel(vocab_size=vocab_size)
        print("Model initialised")
    if checkpoint is not None:
        print("Loading checkpoint...")
        model.load(checkpoint)
        print("Model loaded from checkpoint", checkpoint)

    if platform == 'kaggle':
        # Kaggle provides two GPUs; replicate the model with DataParallel.
        model = torch.nn.DataParallel(model, device_ids=[0, 1])
        model = model.to(DEVICE)
        optimizer = model.module.configure_optimizers(
            weight_decay=0.1, learning_rate=LEARNING_RATE, device=DEVICE)
    else:
        model = model.to(DEVICE)
        # torch.compile speeds up training; note the compiled module's state_dict
        # keys gain a "_orig_mod." prefix, which matters when reloading the
        # checkpoints saved below.
        model = torch.compile(model)
        optimizer = model.configure_optimizers(
            weight_decay=0.1, learning_rate=LEARNING_RATE, device=DEVICE)

    count = 0     # micro-batches processed (global across files and epochs)
    step_num = 0  # optimizer steps taken; this drives the LR schedule
    loss_accum = 0.0
    optimizer.zero_grad(set_to_none=True)

    for epoch in range(MAX_ITERS):
        print(f"Epoch {epoch}")
        epoch_loss = None  # track the most recent loss for the epoch
        files = sorted(os.listdir(folder_path))  # fixed file order across runs
        for file_name in files:
            file_path = os.path.join(folder_path, file_name)
            print(f"Loading file: {file_path}")
            loader = create_dataloader(tokenizer, file_path, BATCH_SIZE, BLOCK_SIZE,
                                       BLOCK_SIZE, tokenized_data=is_tokenized_data,
                                       filename=file_name)

            batch_progress_bar = tqdm(loader, desc=f"Epoch {epoch+1} Batch Progress",
                                      unit="batch", ncols=100)
            for xb, yb in batch_progress_bar:
                if xb is None:
                    break  # no more batches in this file

                xb = xb.to(DEVICE)
                yb = yb.to(DEVICE)
                # Optionally run the forward pass under bfloat16 autocast:
                # with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                logits, loss = model(xb, yb)
                if platform == 'kaggle':
                    loss = loss.mean()  # DataParallel returns one loss per GPU
                # Scale so gradients average (rather than sum) over the window.
                loss = loss / grad_accumulation_steps
                loss.backward()
                loss_accum += loss.detach()
                count += 1

                # Step the optimizer once per full accumulation window.
                if count % grad_accumulation_steps == 0:
                    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    lr = get_lr(step_num)  # schedule is keyed to optimizer steps
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)
                    step_num += 1
                    if torch.cuda.is_available():
                        torch.cuda.synchronize()  # keep the reported loss/LR in sync
                    epoch_loss = loss_accum.item()
                    batch_progress_bar.set_postfix(loss=epoch_loss, lr=f"{lr:.2e}")
                    loss_accum = 0.0

                if count % 5000 == 0:
                    if platform == 'kaggle':
                        torch.save(model.module.state_dict(),
                                   f"model_weights_checkpoint_{count}.pth")
                    else:
                        torch.save(model.state_dict(),
                                   f"model_weights_checkpoint_{count}.pth")
                    print(f"Model weights saved at checkpoint {count}")

            # Save model weights after each file; the tag is the last two
            # characters of the file stem (assumes names like "chunk_01.txt").
            file_tag = file_path[-6:-4]
            if platform == 'kaggle':
                torch.save(model.module.state_dict(),
                           f"model_weights_epoch_{epoch}_{file_tag}.pth")
            else:
                torch.save(model.state_dict(),
                           f"model_weights_epoch_{epoch}_{file_tag}.pth")
            print(f"Model weights saved at epoch {epoch}")

        # Print the loss at the end of the epoch.
        if epoch_loss is not None:
            print(f"Epoch {epoch}, Loss: {epoch_loss}")
        else:
            print(f"Epoch {epoch}, No data available for loss calculation.")
        torch.cuda.empty_cache()

    return model, optimizer
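
# Hypothetical entry point, included only as a usage sketch: the tokenizer,
# folder path, and vocab size below are placeholders. create_dataloader is
# assumed to accept whatever tokenizer object you pass to train(); tiktoken's
# GPT-2 encoding stands in here purely for illustration. Because this module
# uses relative imports, run it as a module (python -m <package>.train).
if __name__ == "__main__":
    import tiktoken

    tokenizer = tiktoken.get_encoding("gpt2")
    model, optimizer = train(
        folder_path="data/train",      # placeholder directory of training shards
        tokenizer=tokenizer,
        vocab_size=tokenizer.n_vocab,  # 50257 for the GPT-2 encoding
        platform='none',               # or 'kaggle' for 2-GPU DataParallel
        is_tokenized_data=False,
    )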