Spaces:
Sleeping
Sleeping
File size: 5,082 Bytes
290f366 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | """
CaptionIQ — Central Configuration
All hyperparameters, paths, and constants in one place.
"""
import os
# ----------------------------------------------
# Paths
# ----------------------------------------------
# BASE_DIR is the parent of the directory that contains this file.
_THIS_FILE = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(os.path.dirname(_THIS_FILE))

# Top-level working directories, all rooted at BASE_DIR.
DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models")
OUTPUTS_DIR = os.path.join(BASE_DIR, "outputs")

# Raw dataset locations (Flickr8K).
FLICKR_IMAGES_DIR = os.path.join(DATA_DIR, "Flickr8k_Dataset")
FLICKR_TEXT_DIR = os.path.join(DATA_DIR, "Flickr8k_text")

# Processed data: cleaned captions, fitted tokenizer, and split lists.
CAPTIONS_FILE = os.path.join(DATA_DIR, "captions_clean.txt")
TOKENIZER_FILE = os.path.join(DATA_DIR, "tokenizer.pkl")
TRAIN_IMAGES_FILE = os.path.join(DATA_DIR, "train_images.txt")
VAL_IMAGES_FILE = os.path.join(DATA_DIR, "val_images.txt")
TEST_IMAGES_FILE = os.path.join(DATA_DIR, "test_images.txt")

# Image feature files, one per VGG variant.
VGG16_FEATURES_FILE = os.path.join(DATA_DIR, "vgg16_features.pkl")
VGG19_FEATURES_FILE = os.path.join(DATA_DIR, "vgg19_features.pkl")

# Model checkpoints, one per VGG variant.
VGG16_MODEL_FILE = os.path.join(MODELS_DIR, "model_vgg16.h5")
VGG19_MODEL_FILE = os.path.join(MODELS_DIR, "model_vgg19.h5")

# Outputs: loss plots and the BLEU results file.
VGG16_LOSS_PLOT = os.path.join(OUTPUTS_DIR, "loss_vgg16.png")
VGG19_LOSS_PLOT = os.path.join(OUTPUTS_DIR, "loss_vgg19.png")
BLEU_RESULTS_FILE = os.path.join(OUTPUTS_DIR, "bleu_results.json")
# ----------------------------------------------
# Image parameters
# ----------------------------------------------
# VGG input requirement.
IMAGE_SIZE = 224

# ----------------------------------------------
# Model hyperparameters
# ----------------------------------------------
# Word embedding dimension.
EMBED_DIM = 256
# LSTM hidden units.
LSTM_UNITS = 512
# Dropout rate (reduced from 0.5).
DROPOUT_RATE = 0.3
# VGG spatial feature channels.
FEATURE_DIM = 512
# 7x7 spatial positions from block5_pool.
FEATURE_LOCATIONS = 7 * 7
# Bahdanau attention hidden dimension.
ATTENTION_DIM = 256

# ----------------------------------------------
# Vocabulary
# ----------------------------------------------
# Words appearing fewer than this many times are dropped.
MIN_WORD_FREQ = 5
# ----------------------------------------------
# Training hyperparameters
# ----------------------------------------------
# 30 is enough; early stopping handles the rest.
EPOCHS = 30
# 2x bigger batch = 2x fewer steps = 2x faster.
BATCH_SIZE = 64
LEARNING_RATE = 1e-3

# Callback settings.
# EarlyStopping patience, in epochs.
EARLY_STOP_PATIENCE = 6
# ReduceLROnPlateau patience, in epochs.
LR_PATIENCE = 3
# Learning-rate reduction factor.
LR_FACTOR = 0.5
# ----------------------------------------------
# Inference
# ----------------------------------------------
BEAM_WIDTH = 5
# Maximum caption length, in tokens.
MAX_LENGTH = 34

# ----------------------------------------------
# Special tokens
# ----------------------------------------------
START_TOKEN = "startseq"
END_TOKEN = "endseq"

# ----------------------------------------------
# Dataset splits
# ----------------------------------------------
TRAIN_SIZE = 6000
VAL_SIZE = 1000
TEST_SIZE = 1000
# ----------------------------------------------
# Create directories if they don't exist
# ----------------------------------------------
# This runs at import time, so importing this module has a filesystem
# side effect. exist_ok=True makes repeated imports harmless when the
# directories are already present.
for d in [DATA_DIR, MODELS_DIR, OUTPUTS_DIR]:
    os.makedirs(d, exist_ok=True)
|