Upload folder using huggingface_hub
Browse files
- best_model.pt +1 -1
- main.py +65 -63
best_model.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 726511201
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:faa1c740cfc878fcbbf4495d80e1648e108c990029ead1b5145014d06e2c8e0a
|
| 3 |
size 726511201
|
main.py
CHANGED
|
@@ -689,7 +689,7 @@ def greedy_decode(
|
|
| 689 |
model = model.to(device)
|
| 690 |
|
| 691 |
# Encode source sentence
|
| 692 |
-
src_ids = source_tokenizer.encode(src_sentence)
|
| 693 |
src_ids = src_ids[:max_length]
|
| 694 |
src_padded = src_ids + [vocab_info['src_pad_idx']] * (max_length - len(src_ids))
|
| 695 |
src = torch.tensor([src_padded], dtype=torch.long).to(device)
|
|
@@ -749,30 +749,6 @@ def greedy_decode(
|
|
| 749 |
|
| 750 |
|
| 751 |
|
| 752 |
-
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 753 |
-
print(f"Using device: {device}")
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
with open(f'dataset/train.en', 'r') as file:
|
| 757 |
-
english_texts = file.readlines()
|
| 758 |
-
with open(f'dataset/train.kn', 'r') as file:
|
| 759 |
-
kannada_texts = file.readlines()
|
| 760 |
-
|
| 761 |
-
english_texts= [sentence.rstrip('\n').lower() for sentence in english_texts]
|
| 762 |
-
kannada_texts = [sentence.rstrip('\n') for sentence in kannada_texts]
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
# Prepare data with OPTIMIZED settings for faster training
|
| 766 |
-
# CRITICAL: Vocab sizes reduced from 64k/40k to 50k/32k (the values passed below) for faster training
|
| 767 |
-
# The final linear layer size is vocab_size × d_model, so smaller vocab = much faster
|
| 768 |
-
train_loader, val_loader, src_tok, tgt_tok, vocab_info = prepare_data(
|
| 769 |
-
english_texts,
|
| 770 |
-
kannada_texts,
|
| 771 |
-
source_vocab_size=50000, # Reduced from 64000 - still captures most words
|
| 772 |
-
target_vocab_size=32000, # Reduced from 40000 - 3x faster computation
|
| 773 |
-
max_length=75, # Reduced from 100 - fewer tokens to process
|
| 774 |
-
batch_size=500 # Increased from 300 - better GPU utilization
|
| 775 |
-
)
|
| 776 |
|
| 777 |
class TransformerTrainer:
|
| 778 |
"""
|
|
@@ -993,47 +969,73 @@ class TransformerTrainer:
|
|
| 993 |
|
| 994 |
return self.train_losses, self.val_losses
|
| 995 |
|
|
|
|
|
|
|
|
|
|
| 996 |
|
| 997 |
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
ffn_hidden=1536, # Reduced from 2048 (4x d_model ratio maintained)
|
| 1003 |
-
num_heads=6, # Reduced from 8 (d_model must be divisible by num_heads)
|
| 1004 |
-
drop_prob=0.1,
|
| 1005 |
-
num_layers=4, # Reduced from 6 - still effective for translation
|
| 1006 |
-
max_sequence_length=75, # Match the max_length from data prep
|
| 1007 |
-
src_vocab_size=vocab_info['source_vocab_size'],
|
| 1008 |
-
tgt_vocab_size=vocab_info['target_vocab_size']
|
| 1009 |
-
)
|
| 1010 |
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
model=model,
|
| 1014 |
-
train_loader=train_loader,
|
| 1015 |
-
val_loader=val_loader,
|
| 1016 |
-
vocab_info=vocab_info,
|
| 1017 |
-
device=device,
|
| 1018 |
-
learning_rate=0.0001,
|
| 1019 |
-
use_amp=True, # Enable mixed precision for 2-3x speedup
|
| 1020 |
-
gradient_accumulation_steps=1 # Increase if you get OOM errors
|
| 1021 |
-
)
|
| 1022 |
|
| 1023 |
-
# Train
|
| 1024 |
-
train_losses, val_losses = trainer.train(num_epochs=50, save_path='best_model.pt')
|
| 1025 |
|
| 1026 |
-
#
|
| 1027 |
-
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
-
|
| 1033 |
-
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1038 |
|
| 1039 |
-
print("Training pipeline ready with fixed device handling!")
|
|
|
|
| 689 |
model = model.to(device)
|
| 690 |
|
| 691 |
# Encode source sentence
|
| 692 |
+
src_ids = source_tokenizer.encode(src_sentence)
|
| 693 |
src_ids = src_ids[:max_length]
|
| 694 |
src_padded = src_ids + [vocab_info['src_pad_idx']] * (max_length - len(src_ids))
|
| 695 |
src = torch.tensor([src_padded], dtype=torch.long).to(device)
|
|
|
|
| 749 |
|
| 750 |
|
| 751 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 752 |
|
| 753 |
class TransformerTrainer:
|
| 754 |
"""
|
|
|
|
| 969 |
|
| 970 |
return self.train_losses, self.val_losses
|
| 971 |
|
| 972 |
+
if __name__ == "__main__":
|
| 973 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 974 |
+
print(f"Using device: {device}")
|
| 975 |
|
| 976 |
|
| 977 |
+
with open(f'dataset/train.en', 'r') as file:
|
| 978 |
+
english_texts = file.readlines()
|
| 979 |
+
with open(f'dataset/train.kn', 'r') as file:
|
| 980 |
+
kannada_texts = file.readlines()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 981 |
|
| 982 |
+
english_texts= [sentence.rstrip('\n').lower() for sentence in english_texts]
|
| 983 |
+
kannada_texts = [sentence.rstrip('\n') for sentence in kannada_texts]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 984 |
|
|
|
|
|
|
|
| 985 |
|
| 986 |
+
# Prepare data with OPTIMIZED settings for faster training
|
| 987 |
+
# CRITICAL: Vocab sizes reduced from 64k/40k to 50k/32k (the values passed below) for faster training
|
| 988 |
+
# The final linear layer size is vocab_size × d_model, so smaller vocab = much faster
|
| 989 |
+
train_loader, val_loader, src_tok, tgt_tok, vocab_info = prepare_data(
|
| 990 |
+
english_texts,
|
| 991 |
+
kannada_texts,
|
| 992 |
+
source_vocab_size=50000, # Reduced from 64000 - still captures most words
|
| 993 |
+
target_vocab_size=32000, # Reduced from 40000 - 3x faster computation
|
| 994 |
+
max_length=75, # Reduced from 100 - fewer tokens to process
|
| 995 |
+
batch_size=500 # Increased from 300 - better GPU utilization
|
| 996 |
+
)
|
| 997 |
+
|
| 998 |
+
|
| 999 |
+
|
| 1000 |
+
# Initialize model with optimized size
|
| 1001 |
+
# PERFORMANCE: Smaller model = faster training, often better generalization
|
| 1002 |
+
model = Transformer(
|
| 1003 |
+
d_model=384, # Reduced from 512 for faster computation
|
| 1004 |
+
ffn_hidden=1536, # Reduced from 2048 (4x d_model ratio maintained)
|
| 1005 |
+
num_heads=6, # Reduced from 8 (d_model must be divisible by num_heads)
|
| 1006 |
+
drop_prob=0.1,
|
| 1007 |
+
num_layers=4, # Reduced from 6 - still effective for translation
|
| 1008 |
+
max_sequence_length=75, # Match the max_length from data prep
|
| 1009 |
+
src_vocab_size=vocab_info['source_vocab_size'],
|
| 1010 |
+
tgt_vocab_size=vocab_info['target_vocab_size']
|
| 1011 |
+
)
|
| 1012 |
+
|
| 1013 |
+
# Initialize trainer with performance optimizations
|
| 1014 |
+
trainer = TransformerTrainer(
|
| 1015 |
+
model=model,
|
| 1016 |
+
train_loader=train_loader,
|
| 1017 |
+
val_loader=val_loader,
|
| 1018 |
+
vocab_info=vocab_info,
|
| 1019 |
+
device=device,
|
| 1020 |
+
learning_rate=0.0001,
|
| 1021 |
+
use_amp=True, # Enable mixed precision for 2-3x speedup
|
| 1022 |
+
gradient_accumulation_steps=1 # Increase if you get OOM errors
|
| 1023 |
+
)
|
| 1024 |
+
|
| 1025 |
+
# Train
|
| 1026 |
+
train_losses, val_losses = trainer.train(num_epochs=50, save_path='best_model.pt')
|
| 1027 |
+
|
| 1028 |
+
# Inference example
|
| 1029 |
+
test_sentence = "Hello, how are you?"
|
| 1030 |
+
translation = greedy_decode(
|
| 1031 |
+
model,
|
| 1032 |
+
test_sentence,
|
| 1033 |
+
src_tok,
|
| 1034 |
+
tgt_tok,
|
| 1035 |
+
vocab_info,
|
| 1036 |
+
device=device # Explicitly pass device
|
| 1037 |
+
)
|
| 1038 |
+
print(f"Source: {test_sentence}")
|
| 1039 |
+
print(f"Translation: {translation}")
|
| 1040 |
|
| 1041 |
+
print("Training pipeline ready with fixed device handling!")
|