# Spaces: pavanpraneeth / CaptionIQ (status: Sleeping) | File size: 4,398 Bytes | revision 290f366

"""
CaptionIQ — Image Feature Extraction
Extract spatial feature maps from VGG16 and VGG19 (`block5_pool`).
Save features as pickle files for training.
"""

import os
import pickle
import argparse
import numpy as np
from tqdm import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input as vgg16_preprocess
from tensorflow.keras.applications.vgg19 import VGG19, preprocess_input as vgg19_preprocess
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model

import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.config import (
    FLICKR_IMAGES_DIR, IMAGE_SIZE,
    VGG16_FEATURES_FILE, VGG19_FEATURES_FILE,
)


def build_feature_extractor(backbone: str = "vgg16") -> tuple:
    """
    Create a truncated, ImageNet-pretrained VGG network for feature extraction.

    The returned model shares the input of the chosen VGG network and stops
    at its `block5_pool` layer, so it emits the spatial feature map
    (7x7x512) rather than the fc2 vector (4096).

    Args:
        backbone: "vgg16" or "vgg19"

    Returns:
        (model, preprocess_fn) tuple, where preprocess_fn is the
        `preprocess_input` function that belongs to the chosen backbone

    Raises:
        ValueError: if backbone is neither "vgg16" nor "vgg19"
    """
    # Reject unsupported names up front, before any network is instantiated.
    if backbone not in ("vgg16", "vgg19"):
        raise ValueError(f"Unknown backbone: {backbone}. Use 'vgg16' or 'vgg19'.")

    if backbone == "vgg16":
        network_cls, preprocess_fn = VGG16, vgg16_preprocess
    else:
        network_cls, preprocess_fn = VGG19, vgg19_preprocess

    pretrained = network_cls(weights="imagenet")

    # Cut the network at block5_pool to keep the spatial layout of the features.
    pool_output = pretrained.get_layer("block5_pool").output
    model = Model(inputs=pretrained.input, outputs=pool_output)

    print(f"\n{backbone.upper()} feature extractor loaded")
    print(f"  Output shape: {model.output_shape}  (spatial features)")
    return model, preprocess_fn


def extract_features(model, preprocess_fn, images_dir: str) -> dict:
    """
    Extract features for all images in a directory.

    Only entries of `images_dir` whose name ends in .jpg, .jpeg or .png
    (case-insensitive) are considered. Files are processed in sorted name
    order so that the resulting dict (and any pickle written from it) has a
    reproducible key order across runs and machines.

    An image that fails at any step (loading, preprocessing, prediction) is
    reported with a warning and skipped; it does not abort the run.

    Args:
        model: feature extractor whose `predict` returns one (h, w, c)
            feature map per input image
        preprocess_fn: backbone-specific preprocessing applied to the
            batched image array before prediction
        images_dir: directory containing the image files

    Returns:
        dict mapping filename → numpy array of shape (h * w, c), i.e.
        (49, 512) for a 7x7x512 feature map
    """
    features = {}
    # os.listdir yields entries in arbitrary order; sort for determinism.
    image_files = sorted(
        f for f in os.listdir(images_dir)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )

    print(f"Extracting features for {len(image_files)} images...")
    for fname in tqdm(image_files, desc="Extracting"):
        filepath = os.path.join(images_dir, fname)
        try:
            # Load and preprocess image
            image = load_img(filepath, target_size=(IMAGE_SIZE, IMAGE_SIZE))
            image = img_to_array(image)
            image = np.expand_dims(image, axis=0)  # add the batch dimension
            image = preprocess_fn(image)

            # Extract spatial feature map and flatten the spatial grid
            feature = model.predict(image, verbose=0)[0]  # (7, 7, 512)
            h, w, c = feature.shape
            features[fname] = feature.reshape(h * w, c)  # (49, 512)
        except Exception as e:
            # Deliberately best-effort: one unreadable image must not discard
            # the features already extracted for the others.
            print(f"  Warning: Failed to process {fname}: {e}")

    print(f"Extracted features for {len(features)} images")
    return features


def save_features(features: dict, filepath: str):
    """Save features dict to pickle file.

    The parent directory of `filepath` is created if it does not exist yet,
    so a completed extraction run is not lost to a missing output folder.

    Args:
        features: dict to serialize (filename → feature array)
        filepath: destination path of the pickle file; overwritten if present
    """
    # dirname is "" for a bare filename, which means "current directory".
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, "wb") as f:
        pickle.dump(features, f)
    size_mb = os.path.getsize(filepath) / (1024 * 1024)
    print(f"Features saved to: {filepath} ({size_mb:.1f} MB)")


def main():
    """
    Extract features using VGG16 and/or VGG19.

    Command-line entry point. The `--backbone` option selects "vgg16",
    "vgg19" (default) or "both". For each selected backbone the images in
    FLICKR_IMAGES_DIR are run through the feature extractor and the result
    is pickled to VGG16_FEATURES_FILE or VGG19_FEATURES_FILE respectively.

    If FLICKR_IMAGES_DIR is not an existing directory, an error is printed
    and the function returns without extracting anything.
    """
    parser = argparse.ArgumentParser(description="Extract VGG features from images")
    parser.add_argument(
        "--backbone", type=str, default="vgg19",
        choices=["vgg16", "vgg19", "both"],
        help="Which backbone to use for extraction (default: vgg19)"
    )
    args = parser.parse_args()

    # isdir rather than exists: a regular file at this path would pass an
    # existence check and only fail later when the directory is listed.
    if not os.path.isdir(FLICKR_IMAGES_DIR):
        print(f"Error: Image directory not found: {FLICKR_IMAGES_DIR}")
        print("Please run preprocess.py first to download the dataset.")
        return

    backbones = ["vgg16", "vgg19"] if args.backbone == "both" else [args.backbone]

    for backbone in backbones:
        print("\n" + "=" * 60)
        print(f"  Extracting {backbone.upper()} features")
        print("=" * 60)

        model, preprocess_fn = build_feature_extractor(backbone)
        features = extract_features(model, preprocess_fn, FLICKR_IMAGES_DIR)

        output_file = VGG16_FEATURES_FILE if backbone == "vgg16" else VGG19_FEATURES_FILE
        save_features(features, output_file)

        # Drop the references before the next backbone is processed so the
        # previous model and feature dict can be garbage-collected instead
        # of staying alive alongside the new ones.
        del model, features

    print("\n✓ Feature extraction complete!")


# Run the extraction CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()