File size: 16,560 Bytes
ad0be11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 |
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b58b67f3",
"metadata": {},
"outputs": [],
"source": [
"# Import model and training classes from Hugging Face Transformers\n",
"from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments\n",
"# Import dataset utilities from the datasets library\n",
"from datasets import load_dataset, Dataset\n",
"# Import pandas for CSV/DF handling\n",
"import pandas as pd\n",
"# Import torch for device checks and tensors\n",
"import torch\n",
"# Import evaluate to load evaluation metrics\n",
"import evaluate\n",
"# Import numpy for numeric array manipulation\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "349f59d3",
"metadata": {},
"outputs": [],
"source": [
"# Location of the parallel-corpus CSV. The normalization helper further down\n",
"# works with columns named 'src'/'tgt' or 'Src_lang'/'Tgt_lang'.\n",
"data_path = \"./your/main/corpus.csv\"\n",
"# Load the whole corpus into a pandas DataFrame\n",
"df = pd.read_csv(filepath_or_buffer=data_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9232b43",
"metadata": {},
"outputs": [],
"source": [
"# Peek at the top three rows to sanity-check what was read from disk\n",
"df.head(n=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88f11bdd",
"metadata": {},
"outputs": [],
"source": [
"# Normalize the corpus frame to the 'src'/'tgt' string columns used by the rest of the notebook\n",
"def ensure_text_columns(df):\n",
"    \"\"\"Return a new DataFrame with string-typed 'src' and 'tgt' columns.\n",
"\n",
"    Frames using 'Src_lang'/'Tgt_lang' are renamed to 'src'/'tgt'. Rows where\n",
"    either side is missing are dropped, so NaN never becomes the literal text\n",
"    \"nan\". The input frame itself is left unmodified.\n",
"\n",
"    Raises:\n",
"        KeyError: if 'src'/'tgt' are absent after the optional rename.\n",
"    \"\"\"\n",
"    if 'Src_lang' in df.columns and 'Tgt_lang' in df.columns:\n",
"        df = df.rename(columns={\"Src_lang\": \"src\", \"Tgt_lang\": \"tgt\"})\n",
"\n",
"    missing = [name for name in ('src', 'tgt') if name not in df.columns]\n",
"    if missing:\n",
"        raise KeyError(f\"Expected columns 'src' and 'tgt'; missing {missing} (found {list(df.columns)})\")\n",
"\n",
"    # Drop incomplete pairs and renumber rows; both calls return new frames\n",
"    complete = df.dropna(subset=['src', 'tgt']).reset_index(drop=True)\n",
"    # astype(str) keeps non-string cell values from reaching the tokenizer\n",
"    return complete.assign(src=complete['src'].astype(str), tgt=complete['tgt'].astype(str))\n",
"\n",
"\n",
"# Apply the helper to the loaded DataFrame\n",
"df = ensure_text_columns(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c3fdfa7",
"metadata": {},
"outputs": [],
"source": [
"# Show the first three rows again, now that the columns have been normalized\n",
"df.head(n=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3aaec3a8",
"metadata": {},
"outputs": [],
"source": [
"# Language markers that preprocess() prepends to each source/target sentence.\n",
"# Both values are placeholders.\n",
"prefix_src, prefix_tgt = \"src_lang_code\", \"tgt_lang_code\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06bbfc98",
"metadata": {},
"outputs": [],
"source": [
"# Build the nested \"translation\" record that the tokenization step reads\n",
"def preprocess(example):\n",
"    \"\"\"Return {\"translation\": {\"src\": ..., \"tgt\": ...}} with the language prefixes prepended.\n",
"\n",
"    Reads the notebook-level prefix_src / prefix_tgt; change those to change\n",
"    the translation direction.\n",
"    \"\"\"\n",
"    tagged_source = f\"{prefix_src} {example['src']}\"\n",
"    tagged_target = f\"{prefix_tgt} {example['tgt']}\"\n",
"    return {\"translation\": {\"src\": tagged_source, \"tgt\": tagged_target}}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad52beb7",
"metadata": {},
"outputs": [],
"source": [
"# Convert the normalized DataFrame to a Hugging Face Dataset.\n",
"# preserve_index=False keeps the pandas index from being carried over as an extra column.\n",
"dataset = Dataset.from_pandas(df, preserve_index=False)\n",
"# Add the prefixed \"translation\" record to every example\n",
"dataset = dataset.map(preprocess)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd30423f",
"metadata": {},
"outputs": [],
"source": [
"# Hold out 10% of the examples; the fixed seed keeps the split reproducible\n",
"split_dataset = dataset.train_test_split(seed=42, test_size=0.1)\n",
"# The held-out part is stored under the \"test\" key and is used as the validation set\n",
"train_data, val_data = split_dataset[\"train\"], split_dataset[\"test\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "951b1f86",
"metadata": {},
"outputs": [],
"source": [
"# Persist both splits as CSV files for later use\n",
"train_data.to_csv(\"train_set.csv\", index=False)\n",
"val_data.to_csv(\"val_set.csv\", index=False)\n",
"# Confirmation message followed by the number of examples in each split, one item per line\n",
"print(\n",
"    \"Train and validation data saved successfully:\",\n",
"    f\"Train size: {len(train_data)}\",\n",
"    f\"Validation size: {len(val_data)}\",\n",
"    sep=\"\\n\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3cbc12e7",
"metadata": {},
"outputs": [],
"source": [
"# Load the MBART-50 tokenizer (fast implementation)\n",
"tokenizer = MBart50TokenizerFast.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n",
"# Register the custom target-language code as a special token. prefix_tgt comes from the\n",
"# language-prefix cell, so the registered token always matches what preprocess() prepends.\n",
"tokenizer.add_special_tokens({'additional_special_tokens': [prefix_tgt]})\n",
"# Map the code to the vocabulary id the tokenizer assigned to it; a plain count of the\n",
"# known language codes would point at an unrelated token.\n",
"tokenizer.lang_code_to_id[prefix_tgt] = tokenizer.convert_tokens_to_ids(prefix_tgt)\n",
"# Rebuild the reverse lookup (id -> language code)\n",
"tokenizer.id_to_lang_code = {v: k for k, v in tokenizer.lang_code_to_id.items()}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc507095",
"metadata": {},
"outputs": [],
"source": [
"# Instantiate the pretrained mBART-50 many-to-many checkpoint\n",
"model = MBartForConditionalGeneration.from_pretrained(\n",
"    \"facebook/mbart-large-50-many-to-many-mmt\"\n",
")\n",
"# Make the embedding table match the tokenizer's current size (it may have gained tokens)\n",
"model.resize_token_embeddings(new_num_tokens=len(tokenizer))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ebd78be",
"metadata": {},
"outputs": [],
"source": [
"# --- Step 4: Tokenize data ---\n",
"# Maximum tokenization length for inputs/targets\n",
"max_length = 128\n",
"\n",
"\n",
"def tokenize_function(examples):\n",
"    \"\"\"Tokenize one example's source/target pair and attach loss-ready labels.\n",
"\n",
"    Expects a single (non-batched) example whose \"translation\" entry holds the\n",
"    \"src\" and \"tgt\" strings, which is how Dataset.map calls it without batched=True.\n",
"\n",
"    Returns the tokenized source with an added \"labels\" list. Padding positions\n",
"    in the labels are set to -100 so they are ignored by the loss instead of\n",
"    being trained on as real tokens.\n",
"    \"\"\"\n",
"    pair = examples[\"translation\"]\n",
"    # Source and target are both padded/truncated to max_length\n",
"    inputs = tokenizer(pair[\"src\"], padding=\"max_length\", truncation=True, max_length=max_length)\n",
"    targets = tokenizer(pair[\"tgt\"], padding=\"max_length\", truncation=True, max_length=max_length)\n",
"    pad_id = tokenizer.pad_token_id\n",
"    # compute_metrics maps -100 back to the pad id before decoding\n",
"    inputs[\"labels\"] = [tok if tok != pad_id else -100 for tok in targets[\"input_ids\"]]\n",
"    return inputs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bb922c07",
"metadata": {},
"outputs": [],
"source": [
"# Run tokenize_function over both splits, one example per call\n",
"train_dataset = train_data.map(function=tokenize_function)\n",
"val_dataset = val_data.map(function=tokenize_function)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "213966dc",
"metadata": {},
"outputs": [],
"source": [
"# Repeated imports (both are already imported in the first cell); harmless in a notebook\n",
"import evaluate\n",
"import numpy as np\n",
"\n",
"# Load metric implementations once to reuse inside compute_metrics\n",
"bleu_metric = evaluate.load(\"bleu\")\n",
"meteor_metric = evaluate.load(\"meteor\")\n",
"ter_metric = evaluate.load(\"ter\")\n",
"chrf_metric = evaluate.load(\"chrf\")\n",
"\n",
"\n",
"def compute_metrics(eval_preds):\n",
"    \"\"\"Decode predictions/labels and score them with ChrF, BLEU, METEOR and TER.\n",
"\n",
"    Args:\n",
"        eval_preds: (predictions, labels) pair of token-id arrays. The\n",
"            predictions entry may itself be a tuple, in which case its first\n",
"            element is used.\n",
"\n",
"    Returns:\n",
"        dict with the keys \"ChrF\", \"BLEU\", \"METEOR\" and \"TER\".\n",
"    \"\"\"\n",
"    preds, labels = eval_preds\n",
"    if isinstance(preds, tuple):\n",
"        preds = preds[0]\n",
"\n",
"    # -100 marks ignored/padded positions and cannot be decoded; swap in the pad id.\n",
"    # This applies to predictions as well as labels.\n",
"    pad_id = tokenizer.pad_token_id\n",
"    preds = np.where(preds != -100, preds, pad_id)\n",
"    labels = np.where(labels != -100, labels, pad_id)\n",
"\n",
"    decoded_preds = [text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)]\n",
"    # Each prediction gets a list of references (here always a single one)\n",
"    decoded_labels = [[text.strip()] for text in tokenizer.batch_decode(labels, skip_special_tokens=True)]\n",
"\n",
"    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
"    meteor = meteor_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
"    ter = ter_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
"    chrf = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
"\n",
"    # BLEU implementations may return different keys; try common ones\n",
"    bleu_score = bleu.get(\"score\", bleu.get(\"bleu\"))\n",
"\n",
"    return {\n",
"        \"ChrF\": chrf[\"score\"],  # MAIN METRIC\n",
"        \"BLEU\": bleu_score,\n",
"        \"METEOR\": meteor[\"meteor\"],\n",
"        \"TER\": ter[\"score\"],\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "824252e3",
"metadata": {},
"outputs": [],
"source": [
"# Configure Seq2Seq training arguments for the Hugging Face Trainer\n",
"training_args = Seq2SeqTrainingArguments(\n",
"    output_dir=\"./your/model/checkpoints\",  # directory for checkpoints and outputs\n",
"    per_device_train_batch_size=8,  # batch size per device for training\n",
"    per_device_eval_batch_size=8,  # batch size per device for evaluation\n",
"    gradient_accumulation_steps=4,  # effective batch size = 8*4 = 32 per device\n",
"    learning_rate=3e-5,  # initial learning rate\n",
"    weight_decay=0.01,  # weight decay for optimizer\n",
"    num_train_epochs=3,  # number of training epochs\n",
"    warmup_steps=1000,  # number of warmup steps for scheduler\n",
"    lr_scheduler_type=\"cosine\",  # learning rate scheduler type\n",
"    fp16=torch.cuda.is_available(),  # enable fp16 if CUDA is available\n",
"    # `eval_strategy` is the current name of this option; the older\n",
"    # `evaluation_strategy` spelling is deprecated\n",
"    eval_strategy=\"steps\",  # evaluate every eval_steps steps\n",
"    eval_steps=2000,  # evaluation interval in steps\n",
"    save_strategy=\"steps\",  # save checkpoints every save_steps steps\n",
"    save_steps=2000,  # checkpoint saving interval (kept equal to eval_steps)\n",
"    load_best_model_at_end=True,  # reload the best checkpoint when training ends\n",
"    metric_for_best_model=\"ChrF\",  # key returned by compute_metrics\n",
"    greater_is_better=True,  # higher metric value is better\n",
"    save_total_limit=5,  # limit number of saved checkpoints\n",
"    predict_with_generate=True,  # use generate() for predictions during eval\n",
"    generation_max_length=128,  # max length when generating predictions\n",
"    generation_num_beams=4,  # number of beams for generation\n",
"    logging_dir=\"./logs\",  # tensorboard/logging dir\n",
"    logging_steps=200,  # logging interval\n",
"    seed=42,  # random seed for reproducibility\n",
"    report_to=\"none\",  # disable reporting to external services\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc6cb1bd",
"metadata": {},
"outputs": [],
"source": [
"# Batch collator for seq2seq models\n",
"from transformers import DataCollatorForSeq2Seq\n",
"\n",
"# padding=\"longest\" pads every batch up to its longest sequence\n",
"data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=\"longest\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9508fc22",
"metadata": {},
"outputs": [],
"source": [
"# Assemble the Seq2SeqTrainer from the pieces built in the previous cells\n",
"trainer = Seq2SeqTrainer(\n",
"    # what is trained, and how\n",
"    model=model,\n",
"    args=training_args,\n",
"    # data: training split, validation split, and batch padding\n",
"    train_dataset=train_dataset,\n",
"    eval_dataset=val_dataset,\n",
"    data_collator=data_collator,\n",
"    # tokenizer used with the model, and the metrics callback run at evaluation time\n",
"    processing_class=tokenizer,\n",
"    compute_metrics=compute_metrics,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5aea8ad",
"metadata": {},
"outputs": [],
"source": [
"# Run fine-tuning as configured in training_args; the call's return value is displayed as the cell output\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "462c0da0",
"metadata": {},
"outputs": [],
"source": [
"# Evaluate on the trainer's eval_dataset (val_dataset) and print the metrics it returns\n",
"eval_results = trainer.evaluate()\n",
"print(eval_results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59cfe87b",
"metadata": {},
"outputs": [],
"source": [
"# Display eval_results as the cell's output value (the previous cell printed the same object as plain text)\n",
"eval_results"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d629dc4b",
"metadata": {},
"outputs": [],
"source": [
"# Write the fine-tuned model and its tokenizer into the same directory,\n",
"# so the directory can be loaded as a single unit later\n",
"model.save_pretrained(save_directory=\"./your/model/name\")\n",
"tokenizer.save_pretrained(save_directory=\"./your/model/name\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0641551e",
"metadata": {},
"outputs": [],
"source": [
"# Load pipeline utilities for quick inference\n",
"import torch\n",
"from transformers import pipeline\n",
"\n",
"# Use the first GPU with half precision when CUDA is present; otherwise run on CPU in float32\n",
"use_cuda = torch.cuda.is_available()\n",
"# The result gets its own name so the imported `pipeline` factory is not shadowed\n",
"# and this cell can be re-run safely.\n",
"translator = pipeline(\n",
"    task=\"translation\",  # pipeline task\n",
"    model=\"./your/model/name\",  # path to saved model\n",
"    device=0 if use_cuda else -1,  # 0 = first GPU, -1 = CPU\n",
"    torch_dtype=torch.float16 if use_cuda else torch.float32,\n",
"    src_lang=\"src_lang_code\",  # source language code token\n",
"    tgt_lang=\"tgt_lang_code\",  # target language code token\n",
")\n",
"# Run the pipeline on a sample sentence and print the translation\n",
"print(translator(\"I like singing\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ptorch",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|