Spaces:
Configuration error
Configuration error
Commit ·
a76bfd1
1
Parent(s): 9a56158
model training added
Browse files- config/config.yaml +6 -0
- main.py +12 -0
- params.yaml +13 -1
- research/model_training.ipynb +0 -0
- src/benglasummarization/components/model_training.py +145 -0
- src/benglasummarization/config/configuration.py +25 -2
- src/benglasummarization/entity/config_entity.py +17 -0
- src/benglasummarization/pipeline/stage_04_model_Training.py +12 -0
config/config.yaml
CHANGED
|
@@ -16,5 +16,11 @@ train_tokenize:
|
|
| 16 |
input_file_dir : artifacts/ban_tokenization/combined_text.txt
|
| 17 |
save_file : artifacts/train_tokenization
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
|
|
|
| 16 |
input_file_dir : artifacts/ban_tokenization/combined_text.txt
|
| 17 |
save_file : artifacts/train_tokenization
|
| 18 |
|
| 19 |
+
model_training:
|
| 20 |
+
root_dir : artifacts/model_training
|
| 21 |
+
data_dir : artifacts/data_ingestion/BanSum.csv
|
| 22 |
+
ben_tokenizer_dir : artifacts/train_tokenization/cbengali_tokenizer.model
|
| 23 |
+
save_trained_model_dir : artifacts/model_training
|
| 24 |
+
|
| 25 |
|
| 26 |
|
main.py
CHANGED
|
@@ -2,6 +2,9 @@ from src.benglasummarization.logging import logger
|
|
| 2 |
from src.benglasummarization.pipeline.stage01_data_ingestion import DataIngestionPipeline
|
| 3 |
from src.benglasummarization.pipeline.stage_02_prepare_ben_tok import BenTokenizationPreparePipeLine
|
| 4 |
from src.benglasummarization.pipeline.stage_03_train_ban_token import TrainTokenizePipeLine
|
|
|
|
|
|
|
|
|
|
| 5 |
STAGE_NAME = 'Data Ingestion Stage'
|
| 6 |
|
| 7 |
try:
|
|
@@ -35,3 +38,12 @@ except Exception as e:
|
|
| 35 |
logger.exception(e)
|
| 36 |
raise e
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from src.benglasummarization.pipeline.stage01_data_ingestion import DataIngestionPipeline
|
| 3 |
from src.benglasummarization.pipeline.stage_02_prepare_ben_tok import BenTokenizationPreparePipeLine
|
| 4 |
from src.benglasummarization.pipeline.stage_03_train_ban_token import TrainTokenizePipeLine
|
| 5 |
+
from src.benglasummarization.pipeline.stage_04_model_Training import ModelTrainingPipeline
|
| 6 |
+
|
| 7 |
+
|
| 8 |
STAGE_NAME = 'Data Ingestion Stage'
|
| 9 |
|
| 10 |
try:
|
|
|
|
| 38 |
logger.exception(e)
|
| 39 |
raise e
|
| 40 |
|
| 41 |
+
STAGE_NAME = 'Model Training PipeLine Stage'
|
| 42 |
+
try:
|
| 43 |
+
logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
|
| 44 |
+
train_model = ModelTrainingPipeline()
|
| 45 |
+
train_model.main()
|
| 46 |
+
logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
|
| 47 |
+
except Exception as e:
|
| 48 |
+
logger.exception(e)
|
| 49 |
+
raise e
|
params.yaml
CHANGED
|
@@ -4,4 +4,16 @@ pre_tokenize:
|
|
| 4 |
train_tokenize:
|
| 5 |
model_prefix : 'cbengali_tokenizer'
|
| 6 |
model_type : 'unigram'
|
| 7 |
-
vocab_size : 91902
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
train_tokenize:
|
| 5 |
model_prefix : 'cbengali_tokenizer'
|
| 6 |
model_type : 'unigram'
|
| 7 |
+
vocab_size : 91902
|
| 8 |
+
|
| 9 |
+
training_model:
|
| 10 |
+
max_input_length : 256
|
| 11 |
+
max_output_length : 125
|
| 12 |
+
model_name : 'google/pegasus-large'
|
| 13 |
+
batch_size : 1
|
| 14 |
+
num_epochs : 1
|
| 15 |
+
learning_rate : 1e-4
|
| 16 |
+
accumulator_steps : 4
|
| 17 |
+
max_grad_norm : 1.0
|
| 18 |
+
early_stopping_patience : 3
|
| 19 |
+
patience_counter : 0
|
research/model_training.ipynb
ADDED
|
File without changes
|
src/benglasummarization/components/model_training.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch.utils.data import Dataset
|
| 2 |
+
from transformers import PegasusTokenizer
|
| 3 |
+
import os
|
| 4 |
+
import torch
|
| 5 |
+
from torch.utils.data import DataLoader, random_split
|
| 6 |
+
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from sklearn.model_selection import train_test_split
|
| 10 |
+
from src.benglasummarization.logging import logger
|
| 11 |
+
from src.benglasummarization.entity.config_entity import ModelTrainingConfig
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BengaliSummaryDataset(Dataset):
|
| 15 |
+
def __init__(self, texts, summaries, tokenizer: PegasusTokenizer, config: ModelTrainingConfig):
|
| 16 |
+
self.config = config
|
| 17 |
+
self.texts = texts
|
| 18 |
+
self.summaries = summaries
|
| 19 |
+
self.tokenizer = tokenizer
|
| 20 |
+
|
| 21 |
+
def __len__(self):
|
| 22 |
+
return len(self.texts)
|
| 23 |
+
|
| 24 |
+
def __getitem__(self, idx):
|
| 25 |
+
text = self.texts[idx]
|
| 26 |
+
summary = self.summaries[idx]
|
| 27 |
+
|
| 28 |
+
inputs = self.tokenizer(
|
| 29 |
+
text,
|
| 30 |
+
truncation=True,
|
| 31 |
+
padding="max_length",
|
| 32 |
+
max_length=self.config.max_input_length,
|
| 33 |
+
return_tensors="pt"
|
| 34 |
+
)
|
| 35 |
+
labels = self.tokenizer(
|
| 36 |
+
summary,
|
| 37 |
+
truncation=True,
|
| 38 |
+
padding="max_length",
|
| 39 |
+
max_length=self.config.max_output_length,
|
| 40 |
+
return_tensors="pt"
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
input_ids = inputs['input_ids'].squeeze()
|
| 44 |
+
attention_mask = inputs['attention_mask'].squeeze()
|
| 45 |
+
labels = labels['input_ids'].squeeze()
|
| 46 |
+
|
| 47 |
+
# Replace padding token id's with -100 to ignore them during loss computation
|
| 48 |
+
labels[labels == self.tokenizer.pad_token_id] = -100
|
| 49 |
+
|
| 50 |
+
return {
|
| 51 |
+
"input_ids": input_ids,
|
| 52 |
+
"attention_mask": attention_mask,
|
| 53 |
+
"labels": labels
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class ModelTraining:
|
| 58 |
+
def __init__(self, config: ModelTrainingConfig):
|
| 59 |
+
self.config = config
|
| 60 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 61 |
+
|
| 62 |
+
def load_data(self):
|
| 63 |
+
df = pd.read_csv(self.config.data_dir)
|
| 64 |
+
df = df.head(1000)
|
| 65 |
+
texts = df['main'].tolist()
|
| 66 |
+
summaries = df['sum3'].tolist()
|
| 67 |
+
return train_test_split(texts, summaries, test_size=0.1, random_state=42)
|
| 68 |
+
|
| 69 |
+
def create_datasets(self, train_texts, train_summaries, val_texts, val_summaries):
|
| 70 |
+
tokenizer = PegasusTokenizer.from_pretrained(self.config.ben_tokenizer_dir)
|
| 71 |
+
train_dataset = BengaliSummaryDataset(train_texts, train_summaries, tokenizer, self.config)
|
| 72 |
+
val_dataset = BengaliSummaryDataset(val_texts, val_summaries, tokenizer, self.config)
|
| 73 |
+
return train_dataset, val_dataset, tokenizer
|
| 74 |
+
|
| 75 |
+
def train(self):
|
| 76 |
+
# Load and split data
|
| 77 |
+
train_texts, val_texts, train_summaries, val_summaries = self.load_data()
|
| 78 |
+
|
| 79 |
+
# Create datasets and tokenizer
|
| 80 |
+
train_dataset, val_dataset, tokenizer = self.create_datasets(train_texts, train_summaries, val_texts, val_summaries)
|
| 81 |
+
|
| 82 |
+
# Create data loaders
|
| 83 |
+
train_dataloader = DataLoader(train_dataset, batch_size=self.config.batch_size, shuffle=True)
|
| 84 |
+
val_dataloader = DataLoader(val_dataset, batch_size=self.config.batch_size)
|
| 85 |
+
|
| 86 |
+
# Initialize model
|
| 87 |
+
model = PegasusForConditionalGeneration.from_pretrained(self.config.model_name).to(self.device)
|
| 88 |
+
|
| 89 |
+
# Optimizer and scheduler
|
| 90 |
+
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
|
| 91 |
+
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.5, total_iters=len(train_dataloader) * self.config.num_epochs)
|
| 92 |
+
|
| 93 |
+
# Training loop
|
| 94 |
+
best_val_loss = float('inf')
|
| 95 |
+
for epoch in range(self.config.num_epochs):
|
| 96 |
+
model.train()
|
| 97 |
+
total_loss = 0
|
| 98 |
+
progress_bar = tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}")
|
| 99 |
+
|
| 100 |
+
for step, batch in enumerate(train_dataloader):
|
| 101 |
+
input_ids = batch['input_ids'].to(self.device)
|
| 102 |
+
attention_mask = batch['attention_mask'].to(self.device)
|
| 103 |
+
labels = batch['labels'].to(self.device)
|
| 104 |
+
|
| 105 |
+
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
|
| 106 |
+
loss = outputs.loss
|
| 107 |
+
loss = loss / self.config.accumulator_steps
|
| 108 |
+
loss.backward()
|
| 109 |
+
|
| 110 |
+
total_loss += loss.item()
|
| 111 |
+
|
| 112 |
+
if (step + 1) % self.config.accumulator_steps == 0 or step == len(train_dataloader) - 1:
|
| 113 |
+
torch.nn.utils.clip_grad_norm_(model.parameters(), self.config.max_grad_norm)
|
| 114 |
+
optimizer.step()
|
| 115 |
+
scheduler.step()
|
| 116 |
+
optimizer.zero_grad()
|
| 117 |
+
|
| 118 |
+
progress_bar.update(1)
|
| 119 |
+
progress_bar.set_postfix({'loss': total_loss / (step + 1)})
|
| 120 |
+
|
| 121 |
+
progress_bar.close()
|
| 122 |
+
|
| 123 |
+
# Validation
|
| 124 |
+
model.eval()
|
| 125 |
+
val_loss = 0
|
| 126 |
+
with torch.no_grad():
|
| 127 |
+
for batch in tqdm(val_dataloader, desc="Validation"):
|
| 128 |
+
input_ids = batch['input_ids'].to(self.device)
|
| 129 |
+
attention_mask = batch['attention_mask'].to(self.device)
|
| 130 |
+
labels = batch['labels'].to(self.device)
|
| 131 |
+
|
| 132 |
+
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
|
| 133 |
+
val_loss += outputs.loss.item()
|
| 134 |
+
|
| 135 |
+
val_loss /= len(val_dataloader)
|
| 136 |
+
print(f"Epoch {epoch + 1} - Validation Loss: {val_loss:.4f}")
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
logger.info(f"Training Completed")
|
| 141 |
+
save_path = os.path.join(self.config.save_trained_model_dir)
|
| 142 |
+
model.save_pretrained(save_path)
|
| 143 |
+
tokenizer.save_pretrained(save_path)
|
| 144 |
+
logger.info(f'Model Saved to {self.config.save_trained_model_dir}')
|
| 145 |
+
|
src/benglasummarization/config/configuration.py
CHANGED
|
@@ -2,7 +2,7 @@ from src.benglasummarization.constants import *
|
|
| 2 |
from src.benglasummarization.utils.common import read_yaml, create_directories
|
| 3 |
from benglasummarization.entity.config_entity import DataIngestionConfig
|
| 4 |
from src.benglasummarization.entity.config_entity import BanTokenizationConfig
|
| 5 |
-
from src.benglasummarization.entity.config_entity import BanTokenTrainConfig
|
| 6 |
class ConfigurationManager:
|
| 7 |
def __init__(
|
| 8 |
self,
|
|
@@ -56,4 +56,27 @@ class ConfigurationManager:
|
|
| 56 |
model_type= params.model_type,
|
| 57 |
vocab_size= params.vocab_size
|
| 58 |
)
|
| 59 |
-
return train_token_config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from src.benglasummarization.utils.common import read_yaml, create_directories
|
| 3 |
from benglasummarization.entity.config_entity import DataIngestionConfig
|
| 4 |
from src.benglasummarization.entity.config_entity import BanTokenizationConfig
|
| 5 |
+
from src.benglasummarization.entity.config_entity import BanTokenTrainConfig, ModelTrainingConfig
|
| 6 |
class ConfigurationManager:
|
| 7 |
def __init__(
|
| 8 |
self,
|
|
|
|
| 56 |
model_type= params.model_type,
|
| 57 |
vocab_size= params.vocab_size
|
| 58 |
)
|
| 59 |
+
return train_token_config
|
| 60 |
+
|
| 61 |
+
def get_model_trainer_config(self) -> ModelTrainingConfig:
|
| 62 |
+
config = self.config.model_training
|
| 63 |
+
param = self.params.training_model
|
| 64 |
+
create_directories([config.root_dir])
|
| 65 |
+
model_trainer_config = ModelTrainingConfig(
|
| 66 |
+
root_dir= config.root_dir,
|
| 67 |
+
data_dir= config.data_dir,
|
| 68 |
+
ben_tokenizer_dir= config.ben_tokenizer_dir,
|
| 69 |
+
save_trained_model_dir= config.save_trained_model_dir,
|
| 70 |
+
max_input_length = param.max_input_length,
|
| 71 |
+
max_output_length = param.max_output_length,
|
| 72 |
+
batch_size = param.batch_size,
|
| 73 |
+
num_epochs = param.num_epochs,
|
| 74 |
+
accumulator_steps = param.accumulator_steps,
|
| 75 |
+
max_grad_norm = param.max_grad_norm,
|
| 76 |
+
early_stopping_patience = param.early_stopping_patience,
|
| 77 |
+
patience_counter = param.patience_counter,
|
| 78 |
+
model_name = param.model_name,
|
| 79 |
+
learning_rate = param.learning_rate
|
| 80 |
+
|
| 81 |
+
)
|
| 82 |
+
return model_trainer_config
|
src/benglasummarization/entity/config_entity.py
CHANGED
|
@@ -24,4 +24,21 @@ class BanTokenTrainConfig:
|
|
| 24 |
model_prefix : str
|
| 25 |
model_type : str
|
| 26 |
vocab_size : int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
|
|
|
| 24 |
model_prefix : str
|
| 25 |
model_type : str
|
| 26 |
vocab_size : int
|
| 27 |
+
|
| 28 |
+
@dataclass(frozen=True)
|
| 29 |
+
class ModelTrainingConfig:
|
| 30 |
+
root_dir : Path
|
| 31 |
+
data_dir : Path
|
| 32 |
+
ben_tokenizer_dir : Path
|
| 33 |
+
save_trained_model_dir : Path
|
| 34 |
+
max_input_length : int
|
| 35 |
+
max_output_length : int
|
| 36 |
+
batch_size : int
|
| 37 |
+
num_epochs : int
|
| 38 |
+
accumulator_steps : int
|
| 39 |
+
max_grad_norm : float
|
| 40 |
+
early_stopping_patience : int
|
| 41 |
+
patience_counter : int
|
| 42 |
+
model_name : str
|
| 43 |
+
learning_rate : float
|
| 44 |
|
src/benglasummarization/pipeline/stage_04_model_Training.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.benglasummarization.components.model_training import ModelTraining
|
| 2 |
+
from src.benglasummarization.config.configuration import ConfigurationManager
|
| 3 |
+
|
| 4 |
+
class ModelTrainingPipeline:
|
| 5 |
+
def __init__(self):
|
| 6 |
+
pass
|
| 7 |
+
|
| 8 |
+
def main(self):
|
| 9 |
+
config_manager = ConfigurationManager()
|
| 10 |
+
model_training_config = config_manager.get_model_trainer_config()
|
| 11 |
+
model_trainer = ModelTraining(config=model_training_config)
|
| 12 |
+
model_trainer.train()
|