# File size: 4,972 Bytes
# ee1d4aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os

# Root folder of the dataset; COCO 2017 is assumed to be extracted here.
# Point this somewhere else if your data lives in a different location.
_BASE_DATA_FOLDER = "data/images"

# Logs, the saved vocabulary and temporary files go here.
# Created up front so later writes do not fail on a missing directory.
_OUTPUT_DIR = "output"
os.makedirs(_OUTPUT_DIR, exist_ok=True)

# The final best model checkpoint is saved here for easy access.
_MODELS_DIR = "models"
os.makedirs(_MODELS_DIR, exist_ok=True)


# --- Training configuration ---
TRAINING_CONFIG = {
    # Dataset layout: image folder names and caption annotation files.
    "data_folder": _BASE_DATA_FOLDER,
    "train_image_folder": "train2017",
    "val_image_folder": "val2017",
    "train_caption_file": os.path.join(
        _BASE_DATA_FOLDER, "annotations", "captions_train2017.json"
    ),
    "val_caption_file": os.path.join(
        _BASE_DATA_FOLDER, "annotations", "captions_val2017.json"
    ),

    # Optional subset sizes for faster development runs.
    # None means "use the full dataset".
    "vocab_subset_size": None,
    # e.g. 200000 for a large subset
    "train_subset_size": None,
    # e.g. 10000 for a subset
    "val_subset_size": None,

    # Model hyperparameters.
    "embed_dim": 256,
    "attention_dim": 256,
    "decoder_dim": 256,
    "dropout": 0.5,
    # False freezes the ResNet weights during training.
    "fine_tune_encoder": True,

    # Training parameters.
    "batch_size": 64,
    # Tune to your CPU cores and RAM (e.g. 2, 4, 8).
    "num_workers": 4,
    "learning_rate": 4e-4,
    # Lower learning rate for the encoder when fine_tune_encoder is True.
    "encoder_learning_rate": 1e-5,
    "lr_reduce_factor": 0.5,
    "lr_patience": 5,
    # Total number of epochs to run.
    "num_epochs": 20,
    # Print the loss every N steps.
    "log_step": 100,
    # Gradient clipping value.
    "grad_clip": 5.0,

    # Maximum caption length, counting <START> and <END>.
    "max_caption_length": 30,
    # Beam size for validation inference during training.
    "val_beam_size": 3,
    # None generates captions for every validation batch; an int limits
    # generation to that many batches.
    "val_inference_batches": None,

    # Destination for training logs and the vocabulary.
    "output_dir": _OUTPUT_DIR,
    # Destination for the best model checkpoint.
    "models_dir": _MODELS_DIR,
}


# --- Evaluation configuration ---
# Evaluation runs on the validation set, as is common practice.
EVALUATION_CONFIG = {
    "data_folder": _BASE_DATA_FOLDER,
    # The validation images double as the test set for final metrics.
    "test_image_folder": "val2017",
    "test_caption_file": os.path.join(
        _BASE_DATA_FOLDER, "annotations", "captions_val2017.json"
    ),
    # An int evaluates a subset for faster testing; None uses the full
    # validation set.
    "test_subset_size": None,
    # Must stay at 1 for accurate beam search evaluation.
    "eval_batch_size": 1,
    # Beam size for caption generation during evaluation.
    "beam_size": 5,
    "max_caption_length": 30,
    "num_workers": 4,
    # Directory that receives the evaluation result JSON files.
    "eval_output_dir": os.path.join(_OUTPUT_DIR, "evaluation_results"),
    "output_dir": _OUTPUT_DIR,

    # Placeholder checkpoint path. It is replaced dynamically after
    # training, or can be set by hand for a pre-trained model.
    "model_path": os.path.join(_MODELS_DIR, "best_model_bleu0.1058.pth"),
}


# --- Configuration for Inference Example ---
INFERENCE_CONFIG = {
    'beam_size': 5,
    'max_caption_length': 30,
    # Placeholder for model path. This will be updated dynamically after training or
    # can be set manually if you have a pre-trained model.
    # The file name is joined directly onto _MODELS_DIR (no extra 'models/'
    # prefix) so the placeholder matches EVALUATION_CONFIG['model_path'] and
    # does not resolve to 'models/models/...'.
    'model_path': os.path.join(_MODELS_DIR, 'best_model_bleu0.1058.pth'), # Placeholder, update after training

    # Path to an example image for quick inference demonstration.
    # NOTE(review): this file sits directly under the base data folder, not
    # inside the val2017 image folder - confirm it exists before running.
    'example_image_path': os.path.join(_BASE_DATA_FOLDER, 'new_one.jpg')
}


# --- Utility Functions for updating config with latest trained model ---
def update_config_with_latest_model(config_dict, models_dir=None):
    """
    Point ``config_dict['model_path']`` at the best saved checkpoint.

    Checkpoints are files named ``best_model_bleu<score>.pth``. The one whose
    ``<score>`` parses to the highest float is selected (highest BLEU score in
    the file name, not the most recently written file).

    Args:
        config_dict: Configuration dictionary to update in place. It is left
            untouched when no usable checkpoint is found.
        models_dir: Directory to scan. Defaults to the module-level
            ``_MODELS_DIR`` when omitted or None.

    Returns:
        None. The result is reported through ``config_dict`` and a printed
        status message.
    """
    prefix, suffix = 'best_model_bleu', '.pth'
    if models_dir is None:
        models_dir = _MODELS_DIR

    try:
        file_names = os.listdir(models_dir)
    except FileNotFoundError:
        # A missing directory simply means there is no checkpoint yet.
        file_names = []

    scored_models = []
    for file_name in file_names:
        if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
            continue
        try:
            score = float(file_name[len(prefix):-len(suffix)])
        except ValueError:
            # Skip names such as 'best_model_bleu_final.pth' instead of
            # crashing at import time on an unparsable score.
            continue
        scored_models.append((score, file_name))

    if not scored_models:
        print(f"Warning: No best model found in '{models_dir}'. Inference/Evaluation might fail.")
        return

    # Compare on the score only so ties keep the first file encountered.
    _, latest_model_name = max(scored_models, key=lambda item: item[0])
    latest_model_path = os.path.join(models_dir, latest_model_name)
    config_dict['model_path'] = latest_model_path
    print(f"Updated config with latest model: {latest_model_path}")

# At import time, point the evaluation config and then the inference config
# at the best saved checkpoint, if one is available.
for _config in (EVALUATION_CONFIG, INFERENCE_CONFIG):
    update_config_with_latest_model(_config)
del _config  # keep the module namespace free of the loop variable