yashwan2003 commited on
Commit
5ea6408
·
verified ·
1 Parent(s): 59057a0

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. best_model.pt +1 -1
  2. main.py +65 -63
best_model.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2c40048d0ec6ab5898a45298e3e974d786054b4becafa92f44763a912d51341
3
  size 726511201
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faa1c740cfc878fcbbf4495d80e1648e108c990029ead1b5145014d06e2c8e0a
3
  size 726511201
main.py CHANGED
@@ -689,7 +689,7 @@ def greedy_decode(
689
  model = model.to(device)
690
 
691
  # Encode source sentence
692
- src_ids = source_tokenizer.encode(src_sentence).ids
693
  src_ids = src_ids[:max_length]
694
  src_padded = src_ids + [vocab_info['src_pad_idx']] * (max_length - len(src_ids))
695
  src = torch.tensor([src_padded], dtype=torch.long).to(device)
@@ -749,30 +749,6 @@ def greedy_decode(
749
 
750
 
751
 
752
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
753
- print(f"Using device: {device}")
754
-
755
-
756
- with open(f'dataset/train.en', 'r') as file:
757
- english_texts = file.readlines()
758
- with open(f'dataset/train.kn', 'r') as file:
759
- kannada_texts = file.readlines()
760
-
761
- english_texts= [sentence.rstrip('\n').lower() for sentence in english_texts]
762
- kannada_texts = [sentence.rstrip('\n') for sentence in kannada_texts]
763
-
764
-
765
- # Prepare data with OPTIMIZED settings for faster training
766
- # CRITICAL: Reduced vocab sizes from 64k/40k to 16k/12k for 3-5x speedup
767
- # The final linear layer size is vocab_size × d_model, so smaller vocab = much faster
768
- train_loader, val_loader, src_tok, tgt_tok, vocab_info = prepare_data(
769
- english_texts,
770
- kannada_texts,
771
- source_vocab_size=50000, # Reduced from 64000 - still captures most words
772
- target_vocab_size=32000, # Reduced from 40000 - 3x faster computation
773
- max_length=75, # Reduced from 100 - fewer tokens to process
774
- batch_size=500 # Reduced from 300 - better GPU utilization
775
- )
776
 
777
  class TransformerTrainer:
778
  """
@@ -993,47 +969,73 @@ class TransformerTrainer:
993
 
994
  return self.train_losses, self.val_losses
995
 
 
 
 
996
 
997
 
998
- # Initialize model with optimized size
999
- # PERFORMANCE: Smaller model = faster training, often better generalization
1000
- model = Transformer(
1001
- d_model=384, # Reduced from 512 for faster computation
1002
- ffn_hidden=1536, # Reduced from 2048 (4x d_model ratio maintained)
1003
- num_heads=6, # Reduced from 8 (d_model must be divisible by num_heads)
1004
- drop_prob=0.1,
1005
- num_layers=4, # Reduced from 6 - still effective for translation
1006
- max_sequence_length=75, # Match the max_length from data prep
1007
- src_vocab_size=vocab_info['source_vocab_size'],
1008
- tgt_vocab_size=vocab_info['target_vocab_size']
1009
- )
1010
 
1011
- # Initialize trainer with performance optimizations
1012
- trainer = TransformerTrainer(
1013
- model=model,
1014
- train_loader=train_loader,
1015
- val_loader=val_loader,
1016
- vocab_info=vocab_info,
1017
- device=device,
1018
- learning_rate=0.0001,
1019
- use_amp=True, # Enable mixed precision for 2-3x speedup
1020
- gradient_accumulation_steps=1 # Increase if you get OOM errors
1021
- )
1022
 
1023
- # Train
1024
- train_losses, val_losses = trainer.train(num_epochs=50, save_path='best_model.pt')
1025
 
1026
- # Inference example
1027
- test_sentence = "Hello, how are you?"
1028
- translation = greedy_decode(
1029
- model,
1030
- test_sentence,
1031
- src_tok,
1032
- tgt_tok,
1033
- vocab_info,
1034
- device=device # Explicitly pass device
1035
- )
1036
- print(f"Source: {test_sentence}")
1037
- print(f"Translation: {translation}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1038
 
1039
- print("Training pipeline ready with fixed device handling!")
 
689
  model = model.to(device)
690
 
691
  # Encode source sentence
692
+ src_ids = source_tokenizer.encode(src_sentence)
693
  src_ids = src_ids[:max_length]
694
  src_padded = src_ids + [vocab_info['src_pad_idx']] * (max_length - len(src_ids))
695
  src = torch.tensor([src_padded], dtype=torch.long).to(device)
 
749
 
750
 
751
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
752
 
753
  class TransformerTrainer:
754
  """
 
969
 
970
  return self.train_losses, self.val_losses
971
 
972
+ if __name__ == "__main__":
973
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
974
+ print(f"Using device: {device}")
975
 
976
 
977
+ with open(f'dataset/train.en', 'r') as file:
978
+ english_texts = file.readlines()
979
+ with open(f'dataset/train.kn', 'r') as file:
980
+ kannada_texts = file.readlines()
 
 
 
 
 
 
 
 
981
 
982
+ english_texts= [sentence.rstrip('\n').lower() for sentence in english_texts]
983
+ kannada_texts = [sentence.rstrip('\n') for sentence in kannada_texts]
 
 
 
 
 
 
 
 
 
984
 
 
 
985
 
986
+ # Prepare data with OPTIMIZED settings for faster training
987
+ # CRITICAL: Reduced vocab sizes from 64k/40k to 50k/32k for a speedup
988
+ # The final linear layer size is vocab_size × d_model, so smaller vocab = much faster
989
+ train_loader, val_loader, src_tok, tgt_tok, vocab_info = prepare_data(
990
+ english_texts,
991
+ kannada_texts,
992
+ source_vocab_size=50000, # Reduced from 64000 - still captures most words
993
+ target_vocab_size=32000, # Reduced from 40000 - 3x faster computation
994
+ max_length=75, # Reduced from 100 - fewer tokens to process
995
+ batch_size=500 # Increased from 300 - better GPU utilization
996
+ )
997
+
998
+
999
+
1000
+ # Initialize model with optimized size
1001
+ # PERFORMANCE: Smaller model = faster training, often better generalization
1002
+ model = Transformer(
1003
+ d_model=384, # Reduced from 512 for faster computation
1004
+ ffn_hidden=1536, # Reduced from 2048 (4x d_model ratio maintained)
1005
+ num_heads=6, # Reduced from 8 (d_model must be divisible by num_heads)
1006
+ drop_prob=0.1,
1007
+ num_layers=4, # Reduced from 6 - still effective for translation
1008
+ max_sequence_length=75, # Match the max_length from data prep
1009
+ src_vocab_size=vocab_info['source_vocab_size'],
1010
+ tgt_vocab_size=vocab_info['target_vocab_size']
1011
+ )
1012
+
1013
+ # Initialize trainer with performance optimizations
1014
+ trainer = TransformerTrainer(
1015
+ model=model,
1016
+ train_loader=train_loader,
1017
+ val_loader=val_loader,
1018
+ vocab_info=vocab_info,
1019
+ device=device,
1020
+ learning_rate=0.0001,
1021
+ use_amp=True, # Enable mixed precision for 2-3x speedup
1022
+ gradient_accumulation_steps=1 # Increase if you get OOM errors
1023
+ )
1024
+
1025
+ # Train
1026
+ train_losses, val_losses = trainer.train(num_epochs=50, save_path='best_model.pt')
1027
+
1028
+ # Inference example
1029
+ test_sentence = "Hello, how are you?"
1030
+ translation = greedy_decode(
1031
+ model,
1032
+ test_sentence,
1033
+ src_tok,
1034
+ tgt_tok,
1035
+ vocab_info,
1036
+ device=device # Explicitly pass device
1037
+ )
1038
+ print(f"Source: {test_sentence}")
1039
+ print(f"Translation: {translation}")
1040
 
1041
+ print("Training pipeline ready with fixed device handling!")