AOUNZakaria committed on
Commit
32d4a86
·
1 Parent(s): 400b4a4

Deploy image captioner

Browse files
README.md CHANGED
@@ -1,11 +1,41 @@
1
  ---
2
- title: ImageCaptionner
3
- emoji: 🚀
4
- colorFrom: indigo
5
  colorTo: purple
6
  sdk: docker
 
 
7
  pinned: false
8
- license: apache-2.0
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Image Caption Generator
3
+ emoji: 🖼️
4
+ colorFrom: blue
5
  colorTo: purple
6
  sdk: docker
7
+ sdk_version: latest
8
+ app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # Image Caption Generator
14
+
15
+ Generate captions for images using an optimized EfficientNet-B3 model.
16
+
17
+ ## Features
18
+
19
+ - ✅ EfficientNet-B3 model for high-quality captions
20
+ - ✅ Optimized quantized model (~245MB)
21
+ - ✅ Fast inference
22
+ - ✅ Simple web interface
23
+
24
+ ## How to Use
25
+
26
+ 1. Upload an image (PNG, JPG, JPEG)
27
+ 2. Click "Generate Caption"
28
+ 3. Get your caption!
29
+
30
+ ## Model
31
+
32
+ - **Architecture:** EfficientNet-B3
33
+ - **Optimization:** INT8 Quantization
34
+ - **Size:** ~245MB
35
+
36
+ ## Technical Details
37
+
38
+ - Built with PyTorch and Transformers
39
+ - Uses GPT-2 tokenizer
40
+ - Optimized for production deployment
41
+
app/__init__.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Image Caption Generator - Flask Application
Production-ready application with model caching and security.
"""

import logging
import os
from pathlib import Path

from flask import Flask

# Configure logging once for the whole package; sub-modules use
# logging.getLogger(__name__) and inherit this setup.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def _should_load_models():
    """Return True when models must be downloaded/loaded at startup.

    True in production, or whenever LOAD_MODELS is unset or "true".
    (Previously this condition was duplicated and evaluated twice.)
    """
    return (os.environ.get("FLASK_ENV") == "production"
            or os.environ.get("LOAD_MODELS", "true").lower() == "true")


def _download_via_script(base_dir):
    """Fallback download path: run the bundled download script.

    The script reads EFFICIENTNET_MODEL_URL; base_dir is put on sys.path so
    `scripts.download_model` is importable from any working directory.
    """
    import sys
    sys.path.insert(0, str(base_dir))
    from scripts.download_model import download_efficientnet_model
    download_efficientnet_model()


def _download_model_if_needed(base_dir):
    """Fetch the quantized EfficientNet checkpoint before model loading.

    Tries the Hugging Face Hub first (HF_MODEL_REPO), then falls back to the
    direct-URL download script. All failures are logged, never raised, so the
    app can still start with a pre-existing local model.
    """
    try:
        model_repo = os.environ.get("HF_MODEL_REPO")
        if model_repo:
            try:
                from huggingface_hub import hf_hub_download
                logger.info(f"Downloading model from HF Hub: {model_repo}")
                model_path = hf_hub_download(
                    repo_id=model_repo,
                    filename="efficientnet_efficient_best_model_quantized.pth",
                    cache_dir=str(base_dir / "models" / "optimized_models")
                )
                logger.info(f"Model downloaded from HF Hub: {model_path}")
            except Exception as e:
                logger.warning(f"Could not download from HF Hub: {e}. Trying download URL...")
                _download_via_script(base_dir)
        else:
            # No HF repo configured - go straight to the URL-based script.
            _download_via_script(base_dir)
    except Exception as e:
        logger.warning(f"Could not download model: {e}. Will try to use existing model if available.")


def create_app(config=None):
    """
    Application factory pattern.
    Creates and configures the Flask application.

    Args:
        config: unused placeholder kept for interface compatibility.

    Returns:
        A configured Flask app with routes registered and (when enabled)
        the EfficientNet captioning model loaded into the process cache.

    Raises:
        ValueError: in production when SESSION_SECRET is missing/default.
    """
    # Project root is two levels above app/__init__.py.
    base_dir = Path(__file__).resolve().parent.parent

    app = Flask(__name__,
                template_folder=str(base_dir / 'templates'),
                static_folder=str(base_dir / 'static'))

    # Secret key: refuse to run in production without a real secret.
    app.secret_key = os.environ.get("SESSION_SECRET")
    if not app.secret_key or app.secret_key == "default-secret-key":
        if os.environ.get("FLASK_ENV") == "production":
            raise ValueError("SESSION_SECRET must be set in production environment!")
        logger.warning("Using default secret key. Set SESSION_SECRET in production!")
        app.secret_key = "default-secret-key"

    # Upload configuration (MAX_CONTENT_LENGTH makes Flask reject oversized
    # request bodies before they reach our handlers).
    app.config['UPLOAD_FOLDER'] = os.environ.get('UPLOAD_FOLDER', 'uploads')
    app.config['MAX_CONTENT_LENGTH'] = int(os.environ.get('MAX_FILE_SIZE', 10 * 1024 * 1024))
    app.config['ALLOWED_EXTENSIONS'] = {'png', 'jpg', 'jpeg'}

    # Ensure the uploads directory exists (idempotent).
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

    # Register blueprints/routes.
    from app.routes import bp
    app.register_blueprint(bp)

    # Download (if needed) and load the EfficientNet model at startup.
    if _should_load_models():
        _download_model_if_needed(base_dir)
        logger.info("Initializing models...")
        try:
            from app.utils.model_cache import model_cache
            # Only load the EfficientNet model; ResNet is loaded lazily.
            model_cache.load_efficientnet_model_only(use_optimized=True)
            logger.info("EfficientNet model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load models: {e}", exc_info=True)
            # Don't raise here - let the app start and handle errors gracefully.

    return app


# For backward compatibility: module-level instance used by WSGI servers
# (e.g. Gunicorn's "app:app" entry point).
app = create_app()
app/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (3.47 kB). View file
 
app/__pycache__/config.cpython-314.pyc ADDED
Binary file (1.69 kB). View file
 
app/__pycache__/routes.cpython-314.pyc ADDED
Binary file (9.07 kB). View file
 
app/config.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Application configuration.

Plain module-level constants; every value can be overridden through an
environment variable of the same name (except ALLOWED_EXTENSIONS and the
model paths, which are fixed relative to the project layout).
"""

import os
from pathlib import Path

# Base directory: project root, two levels above app/config.py.
BASE_DIR = Path(__file__).resolve().parent.parent

# Flask configuration
SECRET_KEY = os.environ.get("SESSION_SECRET", "dev-secret-key-change-in-production")
FLASK_ENV = os.environ.get("FLASK_ENV", "development")
# Debug mode everywhere except production.
DEBUG = FLASK_ENV != "production"

# Upload configuration
UPLOAD_FOLDER = os.environ.get("UPLOAD_FOLDER", str(BASE_DIR / "uploads"))
MAX_FILE_SIZE = int(os.environ.get("MAX_FILE_SIZE", 10 * 1024 * 1024))  # 10MB
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg'}

# Model paths (checkpoints live under <project>/models)
MODELS_DIR = BASE_DIR / "models"
OPTIMIZED_MODELS_DIR = MODELS_DIR / "optimized_models"
RESNET_MODEL_PATH = MODELS_DIR / "resnet_best_model.pth"
EFFICIENTNET_MODEL_PATH = MODELS_DIR / "efficient_best_model.pth"
VOCAB_PATH = MODELS_DIR / "vocab.pkl"

# Model configuration: both flags default to enabled ("true").
USE_OPTIMIZED_MODELS = os.environ.get("USE_OPTIMIZED_MODELS", "true").lower() == "true"
LOAD_MODELS_ON_STARTUP = os.environ.get("LOAD_MODELS", "true").lower() == "true"
app/routes.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Application routes.

Defines the `main` blueprint: index page, health/readiness probes and the
image-upload captioning endpoint.
"""

import os
import logging
import time
from datetime import datetime
from flask import Blueprint, render_template, request, jsonify
from werkzeug.utils import secure_filename
from torchvision import transforms
from PIL import Image
import torch

from app.utils.model_cache import model_cache
from app.config import MAX_FILE_SIZE, ALLOWED_EXTENSIONS

# Import training functions (handle both old and new locations)
try:
    from training.resnet_train import visualize_attention
    from training.efficient_train import generate_caption
except ImportError:
    # Fallback for backward compatibility (before reorganization)
    from resnet_train import visualize_attention
    from efficient_train import generate_caption

logger = logging.getLogger(__name__)

bp = Blueprint('main', __name__)

# Image transformation for EfficientNet: resize/center-crop to 224x224,
# then normalize with the ImageNet mean/std used by torchvision backbones
# (assumes it matches the preprocessing used during training - TODO confirm).
efficientnet_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
38
+
39
+
40
def allowed_file(filename):
    """Return True when *filename* carries an allowed image extension."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
43
+
44
+
45
def validate_file_type(file_path):
    """Validate that the file at *file_path* is a real image, not just a
    file with an image extension.

    Uses PIL's verify(), which checks the file's integrity without fully
    decoding it. Returns True for a valid image, False otherwise.
    """
    try:
        # Context manager guarantees the underlying file handle is closed;
        # the previous bare Image.open() leaked the descriptor until GC.
        with Image.open(file_path) as img:
            img.verify()
        return True
    except Exception:
        # Any parse/IO failure means "not a usable image".
        return False
53
+
54
+
55
@bp.before_request
def before_request():
    """Stamp the incoming request with its arrival time (read back in
    after_request to log the total handling duration)."""
    setattr(request, 'start_time', time.time())
59
+
60
+
61
@bp.after_request
def after_request(response):
    """Add security headers and log the request's duration.

    Args:
        response: the outgoing Flask response.

    Returns:
        The same response, with security headers attached.
    """
    # Security headers
    response.headers['X-Content-Type-Options'] = 'nosniff'
    response.headers['X-Frame-Options'] = 'DENY'
    response.headers['X-XSS-Protection'] = '1; mode=block'

    # before_request may not have run (e.g. the request was rejected before
    # reaching the blueprint), so don't assume start_time exists - the old
    # unconditional read raised AttributeError in that case.
    started = getattr(request, 'start_time', None)
    if started is not None:
        duration = time.time() - started
        logger.info(f"{request.method} {request.path} - {response.status_code} - {duration:.3f}s")
    else:
        logger.info(f"{request.method} {request.path} - {response.status_code}")

    return response
74
+
75
+
76
@bp.route('/')
def index():
    """Render and return the application's main page."""
    template_name = 'index.html'
    return render_template(template_name)
80
+
81
+
82
@bp.route('/health')
def health_check():
    """Health check endpoint for load balancers.

    Always answers 200 (liveness, not readiness) and reports which models
    are currently resident in the process cache.
    """
    # Local import: the module only imports `datetime` from datetime.
    from datetime import timezone

    return jsonify({
        'status': 'healthy',
        # Timezone-aware UTC: datetime.utcnow() is naive and deprecated
        # since Python 3.12; the ISO string now carries a +00:00 offset.
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'models_loaded': {
            'resnet': model_cache.is_resnet_loaded(),
            'efficientnet': model_cache.is_efficientnet_loaded()
        }
    }), 200
93
+
94
+
95
@bp.route('/ready')
def readiness_check():
    """Readiness probe: 200 once at least one captioning model is loaded,
    503 otherwise (so orchestrators keep traffic away until startup ends)."""
    any_model_ready = model_cache.is_resnet_loaded() or model_cache.is_efficientnet_loaded()
    if any_model_ready:
        return jsonify({'status': 'ready'}), 200
    return jsonify({'status': 'not ready', 'reason': 'models not loaded'}), 503
101
+
102
+
103
@bp.route('/upload', methods=['POST'])
def upload_file():
    """Handle an image upload and generate a caption for it.

    Expects a multipart form with an 'image' file field and an optional
    'model' field ('efficientnet' default, anything else selects ResNet).

    Returns JSON: {success, caption, model, inference_time} on success, or
    {error} with 400 (bad input), 503 (model unavailable) or 500 (failure).
    """
    if 'image' not in request.files:
        logger.warning("Upload request missing 'image' field")
        return jsonify({'error': 'No file part'}), 400

    file = request.files['image']
    model_choice = request.form.get('model', 'efficientnet')  # Default to EfficientNet

    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if not file or not allowed_file(file.filename):
        return jsonify({'error': 'Invalid file type. Only PNG, JPG, JPEG allowed.'}), 400

    # Get upload folder from current app (set in __init__.py)
    from flask import current_app
    upload_folder = current_app.config['UPLOAD_FOLDER']

    # secure_filename() can return '' for fully-unsafe names (e.g. "../../"),
    # in which case filepath would have been the upload folder itself; fall
    # back to a generated temp name.
    filename = secure_filename(file.filename) or f"upload_{int(time.time() * 1000)}.img"
    filepath = os.path.join(upload_folder, filename)

    try:
        file.save(filepath)

        # Validate actual on-disk size (MAX_CONTENT_LENGTH guards the request
        # body, this double-checks against the configured limit).
        file_size = os.path.getsize(filepath)
        if file_size > MAX_FILE_SIZE:
            return jsonify({'error': f'File too large. Maximum size: {MAX_FILE_SIZE / 1024 / 1024}MB'}), 400

        # Validate file is actually an image (not just its extension).
        if not validate_file_type(filepath):
            return jsonify({'error': 'Invalid image file'}), 400

        start_time = time.time()

        if model_choice == 'efficientnet':
            if not model_cache.is_efficientnet_loaded():
                # NOTE: previously this early return leaked the uploaded file;
                # the `finally` block below now cleans up on every exit path.
                return jsonify({'error': 'EfficientNet model not available'}), 503

            model, tokenizer = model_cache.get_efficientnet_model()

            # Load, force RGB and preprocess for the encoder.
            with Image.open(filepath) as img:
                image = img.convert('RGB')
            image_tensor = efficientnet_transform(image).to(model_cache._device)

            with torch.no_grad():
                caption = generate_caption(
                    model,
                    image_tensor,
                    tokenizer,
                    model_cache._device,
                    max_length=64
                )
        else:  # resnet50
            if not model_cache.is_resnet_loaded():
                return jsonify({'error': 'ResNet model not available'}), 503

            encoder, decoder, vocab = model_cache.get_resnet_models()

            with torch.no_grad():
                caption = visualize_attention(filepath, encoder, decoder, model_cache._device)

        inference_time = time.time() - start_time
        logger.info(f"Caption generated in {inference_time:.3f}s using {model_choice}")

        return jsonify({
            'success': True,
            'caption': caption,
            'model': model_choice,
            'inference_time': round(inference_time, 3)
        })

    except Exception as e:
        logger.error(f"Error generating caption: {e}", exc_info=True)
        return jsonify({'error': 'Failed to generate caption. Please try again.'}), 500
    finally:
        # Single cleanup point: the upload is temporary on every path
        # (success, validation failure, 503 and unexpected errors alike).
        if os.path.exists(filepath):
            os.remove(filepath)
app/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """Utilities package."""
2
+
app/utils/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (198 Bytes). View file
 
app/utils/__pycache__/model_cache.cpython-314.pyc ADDED
Binary file (16.1 kB). View file
 
app/utils/model_cache.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Model Caching Module for Production
Loads models once at startup and reuses them for all requests.
This eliminates the overhead of loading models per-request.
"""

import torch
import os
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

# Get base directory (project root, three levels above app/utils/model_cache.py)
BASE_DIR = Path(__file__).resolve().parent.parent.parent
MODELS_DIR = BASE_DIR / "models"


class ModelCache:
    """Singleton class to cache loaded models in memory.

    A single module-level instance (`model_cache`, bottom of file) holds the
    ResNet encoder/decoder pair and the EfficientNet captioning model plus
    its tokenizer, so they are loaded at most once per process.

    NOTE(review): there is no locking here - assumes single-threaded loading
    at startup (matches the single-worker Gunicorn deployment); confirm before
    loading lazily from concurrent requests.
    """

    def __init__(self):
        # Lazily-populated model handles; None means "not loaded yet".
        self._resnet_encoder = None
        self._resnet_decoder = None
        self._resnet_vocab = None
        self._efficientnet_model = None
        self._efficientnet_tokenizer = None
        # Prefer GPU when available; all checkpoints are mapped onto this.
        self._device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Guard flag used by the bulk loaders to make them idempotent.
        self._models_loaded = False

        logger.info(f"ModelCache initialized on device: {self._device}")

    def load_all_models(self,
                        resnet_path=None,
                        efficientnet_path=None,
                        use_optimized=True):
        """
        Load all models at startup.

        Args:
            resnet_path: Path to ResNet checkpoint (default: models/resnet_best_model.pth)
            efficientnet_path: Path to EfficientNet checkpoint (default: models/efficient_best_model.pth)
            use_optimized: If True, try to load optimized models first
        """
        if self._models_loaded:
            logger.warning("Models already loaded, skipping")
            return

        # Set default paths
        if resnet_path is None:
            resnet_path = str(MODELS_DIR / "resnet_best_model.pth")
        if efficientnet_path is None:
            efficientnet_path = str(MODELS_DIR / "efficient_best_model.pth")

        # Try optimized models first if requested
        if use_optimized:
            # Check multiple possible locations for optimized models
            # (covers both old and reorganized directory layouts).
            optimized_resnet_paths = [
                str(MODELS_DIR / "optimized_models" / "resnet_resnet_best_model_quantized.pth"),
                str(BASE_DIR / "optimized_models" / "resnet_resnet_best_model_quantized.pth"),
                resnet_path.replace('.pth', '_quantized.pth'),
                resnet_path.replace('resnet_best_model.pth', 'resnet_resnet_best_model_quantized.pth'),
            ]

            optimized_efficient_paths = [
                str(MODELS_DIR / "optimized_models" / "efficientnet_efficient_best_model_quantized.pth"),
                str(BASE_DIR / "optimized_models" / "efficientnet_efficient_best_model_quantized.pth"),
                efficientnet_path.replace('.pth', '_quantized.pth'),
                efficientnet_path.replace('efficient_best_model.pth', 'efficientnet_efficient_best_model_quantized.pth'),
            ]

            # Find optimized ResNet model - first existing candidate wins.
            for opt_path in optimized_resnet_paths:
                if os.path.exists(opt_path):
                    resnet_path = opt_path
                    logger.info(f"Using optimized ResNet model: {resnet_path}")
                    break

            # Find optimized EfficientNet model
            for opt_path in optimized_efficient_paths:
                if os.path.exists(opt_path):
                    efficientnet_path = opt_path
                    logger.info(f"Using optimized EfficientNet model: {efficientnet_path}")
                    break

        # Load EfficientNet only (ResNet skipped - despite the method name,
        # only the EfficientNet model is actually loaded here).
        try:
            self.load_efficientnet_model(efficientnet_path)
            logger.info("EfficientNet model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load EfficientNet model: {e}", exc_info=True)

        self._models_loaded = True

    def load_efficientnet_model_only(self, use_optimized=True):
        """
        Load only EfficientNet model (skip ResNet).
        Useful when only EfficientNet is needed.
        """
        if self._models_loaded:
            logger.warning("Models already loaded, skipping")
            return

        efficientnet_path = str(MODELS_DIR / "efficient_best_model.pth")

        # Try optimized model first if requested
        if use_optimized:
            optimized_efficient_paths = [
                str(MODELS_DIR / "optimized_models" / "efficientnet_efficient_best_model_quantized.pth"),
                str(BASE_DIR / "optimized_models" / "efficientnet_efficient_best_model_quantized.pth"),
                efficientnet_path.replace('.pth', '_quantized.pth'),
                efficientnet_path.replace('efficient_best_model.pth', 'efficientnet_efficient_best_model_quantized.pth'),
            ]

            # Find optimized EfficientNet model - first existing candidate wins.
            for opt_path in optimized_efficient_paths:
                if os.path.exists(opt_path):
                    efficientnet_path = opt_path
                    logger.info(f"Using optimized EfficientNet model: {efficientnet_path}")
                    break

        # Load EfficientNet; failure is logged, not raised, so the app can
        # start degraded and report via the /ready endpoint.
        try:
            self.load_efficientnet_model(efficientnet_path)
            logger.info("EfficientNet model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load EfficientNet model: {e}", exc_info=True)

        self._models_loaded = True

    def load_resnet_models(self, checkpoint_path=None):
        """Load ResNet encoder and decoder models.

        Returns the cached (encoder, decoder, vocab) triple, loading it from
        *checkpoint_path* on first call. The checkpoint is expected to hold
        'encoder' and 'decoder' state dicts and optionally a 'vocab' entry.
        """
        if self._resnet_encoder is not None:
            return self._resnet_encoder, self._resnet_decoder, self._resnet_vocab

        if checkpoint_path is None:
            checkpoint_path = str(MODELS_DIR / "resnet_best_model.pth")

        # Resolve path - try multiple locations
        checkpoint_path = self._resolve_model_path(checkpoint_path)

        logger.info(f"Loading ResNet models from {checkpoint_path}")

        # Import from training module (handles both old and new locations)
        # Need to do this BEFORE loading checkpoint to avoid pickle issues
        try:
            from training.resnet_train import EncoderCNN, DecoderRNN
            # Add to sys.modules to help with pickle loading
            # (old checkpoints may reference the bare 'resnet_train' module).
            import sys
            if 'resnet_train' not in sys.modules:
                sys.modules['resnet_train'] = sys.modules['training.resnet_train']
        except ImportError:
            try:
                # Fallback for backward compatibility
                import sys
                sys.path.insert(0, str(BASE_DIR))
                from resnet_train import EncoderCNN, DecoderRNN
            except ImportError:
                logger.error("Could not import ResNet model classes. Make sure resnet_train.py exists in training/ or root.")
                raise

        # Load checkpoint with proper module mapping
        import sys
        import importlib.util

        # Map old module names for pickle compatibility
        if 'resnet_train' not in sys.modules:
            try:
                spec = importlib.util.spec_from_file_location("resnet_train", str(BASE_DIR / "training" / "resnet_train.py"))
                if spec and spec.loader:
                    resnet_module = importlib.util.module_from_spec(spec)
                    sys.modules['resnet_train'] = resnet_module
                    spec.loader.exec_module(resnet_module)
            except Exception:
                # Best-effort alias only; torch.load below will surface any
                # unresolved pickle references with a clearer error.
                pass

        # weights_only=False: checkpoint may contain pickled vocab objects.
        checkpoint = torch.load(checkpoint_path, map_location=self._device, weights_only=False)

        # Initialize models
        self._resnet_encoder = EncoderCNN().to(self._device)
        self._resnet_decoder = DecoderRNN().to(self._device)

        # Load weights
        self._resnet_encoder.load_state_dict(checkpoint['encoder'])
        self._resnet_decoder.load_state_dict(checkpoint['decoder'])

        # Set to eval mode
        self._resnet_encoder.eval()
        self._resnet_decoder.eval()

        # Store vocabulary (may be absent; .get returns None then)
        self._resnet_vocab = checkpoint.get('vocab')

        # Warm up models (first inference is slower)
        logger.info("Warming up ResNet models...")
        dummy_input = torch.randn(1, 3, 224, 224).to(self._device)
        with torch.no_grad():
            _ = self._resnet_encoder(dummy_input)
        logger.info("ResNet models warmed up")

        return self._resnet_encoder, self._resnet_decoder, self._resnet_vocab

    def load_efficientnet_model(self, checkpoint_path=None):
        """Load EfficientNet model.

        Returns the cached (model, tokenizer) pair, loading on first call.
        Handles both plain and INT8-dynamically-quantized checkpoints, with
        a fallback to the non-quantized checkpoint on state-dict mismatch.
        """
        if self._efficientnet_model is not None:
            return self._efficientnet_model, self._efficientnet_tokenizer

        if checkpoint_path is None:
            checkpoint_path = str(MODELS_DIR / "efficient_best_model.pth")

        # Resolve path - try multiple locations
        checkpoint_path = self._resolve_model_path(checkpoint_path)

        logger.info(f"Loading EfficientNet model from {checkpoint_path}")

        # Import from training module (handles both old and new locations)
        try:
            from training.efficient_train import Encoder, Decoder, ImageCaptioningModel
        except ImportError:
            try:
                # Fallback for backward compatibility
                import sys
                sys.path.insert(0, str(BASE_DIR))
                from efficient_train import Encoder, Decoder, ImageCaptioningModel
            except ImportError:
                logger.error("Could not import EfficientNet model classes. Make sure efficient_train.py exists in training/ or root.")
                raise

        from transformers import AutoTokenizer

        # Initialize tokenizer: GPT-2 with <start>/<end> added, padding with EOS.
        # Must match the tokenizer configuration used during training.
        tokenizer = AutoTokenizer.from_pretrained('gpt2')
        tokenizer.pad_token = tokenizer.eos_token
        special_tokens = {'additional_special_tokens': ['<start>', '<end>']}
        tokenizer.add_special_tokens(special_tokens)
        self._efficientnet_tokenizer = tokenizer

        # Initialize model (hyperparameters assumed to match training - TODO confirm)
        encoder = Encoder(model_name='efficientnet_b3', embed_dim=512)
        decoder = Decoder(
            vocab_size=len(tokenizer),
            embed_dim=512,
            num_layers=8,
            num_heads=8,
            max_seq_length=64
        )
        self._efficientnet_model = ImageCaptioningModel(encoder, decoder).to(self._device)

        # Load weights (weights_only=False: checkpoint may hold extra objects)
        checkpoint = torch.load(checkpoint_path, map_location=self._device, weights_only=False)

        # Check if this is a quantized model (has _packed_params keys)
        is_quantized = any('_packed_params' in key for key in checkpoint.get('model_state', checkpoint).keys())

        if is_quantized:
            # For quantized models, we need to prepare the model for quantization first
            # so its module structure matches the quantized state dict.
            logger.info("Detected quantized model, preparing model for quantization...")
            try:
                # Prepare model for quantization (dynamic INT8 on Linear layers)
                import torch.quantization as quant
                self._efficientnet_model = quant.quantize_dynamic(
                    self._efficientnet_model, {torch.nn.Linear}, dtype=torch.qint8
                )
                logger.info("Model prepared for quantization")
            except Exception as e:
                logger.warning(f"Could not prepare model for quantization: {e}. Trying to load anyway...")

        if 'model_state' in checkpoint:
            try:
                self._efficientnet_model.load_state_dict(checkpoint['model_state'], strict=False)
            except Exception as e:
                logger.warning(f"Could not load quantized state dict: {e}. Trying regular model...")
                # Try loading non-quantized model instead
                regular_path = checkpoint_path.replace('_quantized.pth', '.pth').replace('efficientnet_efficient_best_model', 'efficient_best_model')
                if os.path.exists(regular_path) and regular_path != checkpoint_path:
                    logger.info(f"Trying regular model: {regular_path}")
                    checkpoint = torch.load(regular_path, map_location=self._device, weights_only=False)
                    if 'model_state' in checkpoint:
                        self._efficientnet_model.load_state_dict(checkpoint['model_state'])
                    else:
                        self._efficientnet_model.load_state_dict(checkpoint)
        else:
            # Fallback: try loading directly (checkpoint is the raw state dict)
            try:
                self._efficientnet_model.load_state_dict(checkpoint, strict=False)
            except Exception:
                logger.warning("Could not load state dict. Model may not work correctly.")

        self._efficientnet_model.eval()

        # Warm up: run one dummy forward pass through the encoder since the
        # first inference is typically slower.
        logger.info("Warming up EfficientNet model...")
        dummy_input = torch.randn(1, 3, 224, 224).to(self._device)
        with torch.no_grad():
            _ = self._efficientnet_model.encoder(dummy_input)
        logger.info("EfficientNet model warmed up")

        return self._efficientnet_model, self._efficientnet_tokenizer

    def _resolve_model_path(self, checkpoint_path):
        """Resolve model path, trying multiple locations.

        Order: the given path, then MODELS_DIR, then the project root.
        Returns the first existing path, or the original (so the subsequent
        load fails with a clear file-not-found error).
        """
        # If path exists, use it
        if os.path.exists(checkpoint_path):
            return checkpoint_path

        # Try in models directory
        alt_path = str(MODELS_DIR / os.path.basename(checkpoint_path))
        if os.path.exists(alt_path):
            logger.info(f"Found model at: {alt_path}")
            return alt_path

        # Try in root directory (backward compatibility)
        alt_path = str(BASE_DIR / os.path.basename(checkpoint_path))
        if os.path.exists(alt_path):
            logger.info(f"Found model at: {alt_path}")
            return alt_path

        # Return original path (will fail with clear error)
        return checkpoint_path

    def get_resnet_models(self):
        """Get cached ResNet models; raises RuntimeError if not loaded."""
        if self._resnet_encoder is None:
            raise RuntimeError("ResNet models not loaded. Call load_resnet_models() first.")
        return self._resnet_encoder, self._resnet_decoder, self._resnet_vocab

    def get_efficientnet_model(self):
        """Get cached EfficientNet model; raises RuntimeError if not loaded."""
        if self._efficientnet_model is None:
            raise RuntimeError("EfficientNet model not loaded. Call load_efficientnet_model() first.")
        return self._efficientnet_model, self._efficientnet_tokenizer

    def is_resnet_loaded(self):
        """Check if ResNet models are loaded."""
        return self._resnet_encoder is not None

    def is_efficientnet_loaded(self):
        """Check if EfficientNet model is loaded."""
        return self._efficientnet_model is not None


# Singleton instance shared by the whole process (imported by app/routes.py
# and app/__init__.py).
model_cache = ModelCache()
hf_space_Dockerfile ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dockerfile for Hugging Face Spaces
# Based on: https://huggingface.co/docs/hub/spaces-sdks-docker

FROM python:3.10-slim

# Create user (HF Spaces requirement: run as non-root UID 1000)
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install system dependencies (temporarily back to root for apt)
USER root
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Drop back to the unprivileged user for everything else
USER user

# Copy and install Python dependencies (copied first so this layer is
# cached across code-only rebuilds)
COPY --chown=user requirements.txt requirements.txt
RUN pip install --no-cache-dir --user --upgrade -r requirements.txt

# Download NLTK data at build time so the container starts without network access
RUN python -c "import nltk; nltk.download('punkt', quiet=True)"

# Copy application files
COPY --chown=user app/ /app/app/
COPY --chown=user training/ /app/training/
COPY --chown=user scripts/ /app/scripts/
COPY --chown=user templates/ /app/templates/
COPY --chown=user static/ /app/static/
COPY --chown=user app.py /app/

# Create necessary directories (model cache + upload scratch space)
RUN mkdir -p /app/models/optimized_models /app/uploads

# HF Spaces uses port 7860
EXPOSE 7860

# Set environment variables
ENV FLASK_ENV=production
ENV PORT=7860

# Run the application on port 7860 (HF Spaces requirement)
# Use app.py as entry point (HF Spaces looks for app.py); "app:app" resolves
# to the module-level Flask instance created in app/__init__.py
CMD ["gunicorn", "app:app", "--bind", "0.0.0.0:7860", "--workers", "1", "--timeout", "120", "--threads", "2"]
hf_space_app.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Hugging Face Spaces - Flask Application Entry Point
HF Spaces expects app.py with an 'app' variable
"""

import os
import sys
from pathlib import Path

# Add project root to path so `app`, `training` and `scripts` packages
# resolve regardless of the working directory Gunicorn starts in.
BASE_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(BASE_DIR))

# Import Flask app from app package
# This import has side effects: it triggers model download/loading at startup
# (see app/__init__.py, which runs create_app() at module level).
from app import app

# HF Spaces requires 'app' variable to be available
# The app is already created in app/__init__.py
# No need to run it here - Gunicorn will handle it
hf_space_requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ torchvision>=0.15.0
3
+ transformers>=4.30.0
4
+ Pillow>=10.0.0
5
+ timm>=0.9.0
6
+ numpy>=1.24.0
7
+ flask>=2.3.0
8
+ gunicorn>=21.2.0
9
+ werkzeug>=2.3.0
10
+ nltk>=3.8.1
11
+ requests>=2.31.0
12
+ huggingface_hub>=0.20.0
13
+
scripts/download_model.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Download EfficientNet model from cloud storage if not present.
This script runs at application startup to download the model if needed.
"""

import os
import sys
import logging
from pathlib import Path
import requests

logger = logging.getLogger(__name__)


def download_efficientnet_model():
    """
    Download EfficientNet optimized model if it doesn't exist.
    Supports two methods:
    1. Hugging Face Hub (set HF_MODEL_REPO environment variable)
    2. Direct URL download (set EFFICIENTNET_MODEL_URL environment variable)

    Returns:
        True when the model is present (already existed or downloaded),
        False when no source is configured or the download failed.
    """
    # Get base directory (project root) and ensure the target folder exists.
    base_dir = Path(__file__).resolve().parent.parent
    models_dir = base_dir / "models" / "optimized_models"
    models_dir.mkdir(parents=True, exist_ok=True)

    model_path = models_dir / "efficientnet_efficient_best_model_quantized.pth"

    # Check if model already exists - nothing to do then.
    if model_path.exists():
        size_mb = model_path.stat().st_size / (1024 * 1024)
        logger.info(f"EfficientNet model already exists ({size_mb:.1f}MB)")
        return True

    # Try Hugging Face Hub first
    hf_repo = os.environ.get("HF_MODEL_REPO")
    if hf_repo:
        try:
            from huggingface_hub import hf_hub_download
            logger.info(f"Downloading model from Hugging Face Hub: {hf_repo}")
            downloaded_path = hf_hub_download(
                repo_id=hf_repo,
                filename="efficientnet_efficient_best_model_quantized.pth",
                cache_dir=str(models_dir),
                local_dir=str(models_dir),
                local_dir_use_symlinks=False
            )
            # Move to expected location if needed
            if downloaded_path != str(model_path):
                import shutil
                shutil.move(downloaded_path, model_path)
            size_mb = model_path.stat().st_size / (1024 * 1024)
            logger.info(f"Model downloaded from HF Hub successfully ({size_mb:.1f}MB)")
            return True
        except ImportError:
            logger.warning("huggingface_hub not installed. Install with: pip install huggingface_hub")
        except Exception as e:
            logger.warning(f"Failed to download from HF Hub: {e}. Trying direct URL...")

    # Fallback to direct URL download
    model_url = os.environ.get("EFFICIENTNET_MODEL_URL")

    if not model_url:
        logger.warning("Neither HF_MODEL_REPO nor EFFICIENTNET_MODEL_URL is set.")
        logger.warning("Model will not be downloaded. Set one of these environment variables.")
        return False

    try:
        logger.info(f"Downloading EfficientNet model from {model_url}...")
        logger.info("This may take a few minutes (model is ~245MB)...")

        # Stream the download so the whole file never sits in memory.
        response = requests.get(model_url, stream=True, timeout=300)
        response.raise_for_status()

        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0
        log_step = 10 * 1024 * 1024  # log roughly every 10MB
        # BUG FIX: the previous `downloaded % (10MB) == 0` check only fired
        # when the running byte count landed exactly on a 10MB boundary,
        # which almost never happens with variable-size chunks. Track a
        # crossing threshold instead.
        next_log = log_step

        with open(model_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    if downloaded >= next_log:
                        next_log += log_step
                        if total_size > 0:
                            percent = (downloaded / total_size) * 100
                            logger.info(f"Downloaded {downloaded / (1024 * 1024):.1f}MB / {total_size / (1024 * 1024):.1f}MB ({percent:.1f}%)")
                        else:
                            logger.info(f"Downloaded {downloaded / (1024 * 1024):.1f}MB")

        size_mb = model_path.stat().st_size / (1024 * 1024)
        logger.info(f"EfficientNet model downloaded successfully ({size_mb:.1f}MB)")
        return True

    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to download model: {e}")
        # Clean up partial download so the next attempt starts fresh.
        if model_path.exists():
            model_path.unlink()
        return False
    except Exception as e:
        logger.error(f"Error downloading model: {e}", exc_info=True)
        # Clean up partial download
        if model_path.exists():
            model_path.unlink()
        return False


if __name__ == "__main__":
    # Configure logging when run as a standalone script.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    download_efficientnet_model()
scripts/efficient_caption.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Standalone CLI for captioning one image with the EfficientNet-B3 model.

NOTE: the checkpoint is loaded at import time — importing this module
requires `efficient_best_model.pth` to exist and be readable.
"""
import argparse
import logging
import torch
from PIL import Image
from torchvision import transforms
from transformers import AutoTokenizer
from efficient_train import Encoder, Decoder, ImageCaptioningModel, generate_caption
import os

# Configuration
MODEL_PATH = 'efficient_best_model.pth'  # Path to your saved model
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_SEQ_LENGTH = 64  # Ensure this matches the value used during training

# Image transformation (ensure it matches the preprocessing used during training)
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Load the tokenizer; GPT-2 has no pad token, so reuse EOS, and add the
# <start>/<end> markers the training pipeline wraps captions with.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
special_tokens = {'additional_special_tokens': ['<start>', '<end>']}
tokenizer.add_special_tokens(special_tokens)

# Initialize the model components — hyperparameters must mirror training
# (embed_dim=512, 8 layers, 8 heads) or load_state_dict below will fail.
encoder = Encoder(model_name='efficientnet_b3', embed_dim=512)
decoder = Decoder(
    vocab_size=len(tokenizer),
    embed_dim=512,
    num_layers=8,
    num_heads=8,
    max_seq_length=MAX_SEQ_LENGTH
)
model = ImageCaptioningModel(encoder, decoder).to(DEVICE)

# Load the trained model weights — validate existence and non-emptiness
# first so errors are actionable instead of opaque torch failures.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model file not found at: {MODEL_PATH}. Please ensure you have a trained model checkpoint at this location.")

# Add a check for the size of the file
if os.path.getsize(MODEL_PATH) == 0:
    raise ValueError(f"Model file at {MODEL_PATH} is empty. Please check the saved model.")

# weights_only=False: the checkpoint is a pickled dict, not a bare state dict.
checkpoint = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=False)

# Check if the checkpoint has the model_state key
if 'model_state' not in checkpoint:
    raise KeyError("The checkpoint file does not contain the key 'model_state'. Please ensure the model was saved correctly using 'torch.save(model.state_dict(), path)'.")

model.load_state_dict(checkpoint['model_state'])
model.eval()
59
+
60
+
61
+
62
def caption(image_path):
    """Generate a caption for the image stored at *image_path*.

    Uses the module-level model, tokenizer, and preprocessing transform.
    """
    # Force RGB (handles grayscale/RGBA inputs), preprocess, move to DEVICE.
    img_tensor = transform(Image.open(image_path).convert('RGB')).to(DEVICE)
    # Decode and return the caption string.
    return generate_caption(model, img_tensor, tokenizer, DEVICE, max_length=MAX_SEQ_LENGTH)
71
+
72
if __name__ == '__main__':
    # CLI entry point: caption a single image and print the result.
    parser = argparse.ArgumentParser(description="Generate a caption for the provided image.")
    parser.add_argument('--image_dir', type=str, required=True, help="Path to the input image file")
    args = parser.parse_args()

    try:
        result = caption(args.image_dir)
        print(result)
    except Exception as e:
        logging.error(f"Error generating caption: {str(e)}")
        # FIX: use `raise SystemExit(1)` instead of the built-in `exit()`.
        # `exit` is injected by the `site` module and is not guaranteed to
        # exist (e.g. under `python -S` or frozen interpreters).
        raise SystemExit(1)
scripts/optimize_models.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model Optimization Script for Production Deployment
3
+ Reduces model size and improves inference speed through:
4
+ 1. Quantization (INT8)
5
+ 2. TorchScript compilation
6
+ 3. Model pruning (optional)
7
+ 4. State dict optimization
8
+ """
9
+
10
+ import torch
11
+ import os
12
+ import argparse
13
+ from pathlib import Path
14
+
15
+ # Import model classes BEFORE loading checkpoints (needed for unpickling)
16
+ # This ensures PyTorch can find the class definitions when loading saved objects
17
+ # Note: resnet_train.py has module-level code that loads COCO data, which may fail
18
+ # if training files aren't present. We'll handle this in the functions.
19
+
20
def quantize_model(checkpoint_path, output_path, model_type='resnet'):
    """
    Quantize a model to INT8 for ~4x size reduction and faster CPU inference.

    Args:
        checkpoint_path: Path to the original (float) checkpoint file.
        output_path: Where the quantized checkpoint is written.
        model_type: 'resnet' or 'efficientnet'.

    Returns:
        output_path, for chaining.

    Raises:
        ValueError: For an unknown model_type, or a ResNet checkpoint that
            lacks the 'vocab' entry.

    Note: Slight accuracy loss (usually <1%).
    """
    print(f"Quantizing {model_type} model...")

    device = torch.device('cpu')  # Quantization typically done on CPU

    # Import classes before loading the checkpoint (required for unpickling).
    # resnet_train.py handles missing training data gracefully at import.
    if model_type == 'resnet':
        # Import the module itself so we can update its global vocab later.
        import resnet_train
        from resnet_train import EncoderCNN, DecoderRNN, Vocabulary

        # Make Vocabulary available in __main__ for unpickling — handles
        # checkpoints saved when Vocabulary lived in the __main__ module.
        import __main__
        if not hasattr(__main__, 'Vocabulary'):
            __main__.Vocabulary = Vocabulary
    elif model_type == 'efficientnet':
        from efficient_train import Encoder, Decoder, ImageCaptioningModel
        from transformers import AutoTokenizer
    else:
        # FIX: fail fast with a clear message instead of hitting an
        # UnboundLocalError on `quantized_checkpoint` further down.
        raise ValueError(f"Unknown model_type: {model_type!r} (expected 'resnet' or 'efficientnet')")

    # Load checkpoint (all classes are now importable for unpickling).
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    if model_type == 'resnet':
        # For ResNet, quantize encoder and decoder separately.

        # IMPORTANT: Update vocab from checkpoint BEFORE creating DecoderRNN —
        # DecoderRNN.__init__ sizes layers from len(resnet_train.vocab.word2idx).
        if 'vocab' in checkpoint and checkpoint['vocab'] is not None:
            resnet_train.vocab = checkpoint['vocab']
            print(f"  Updated vocab size: {len(checkpoint['vocab'].word2idx)}")
        else:
            raise ValueError("Checkpoint does not contain 'vocab' key. Cannot proceed.")

        encoder = EncoderCNN()
        decoder = DecoderRNN()  # Now uses the correct vocab size from checkpoint

        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])

        # Set to eval mode before quantization.
        encoder.eval()
        decoder.eval()

        # Quantize encoder (only Linear and Conv2d layers).
        encoder_quantized = torch.quantization.quantize_dynamic(
            encoder, {torch.nn.Linear, torch.nn.Conv2d}, dtype=torch.qint8
        )

        # Quantize decoder (only Linear layers — Embedding requires special
        # config and is typically too small to benefit).
        decoder_quantized = torch.quantization.quantize_dynamic(
            decoder, {torch.nn.Linear}, dtype=torch.qint8
        )

        quantized_checkpoint = {
            'encoder': encoder_quantized.state_dict(),
            'decoder': decoder_quantized.state_dict(),
            'vocab': checkpoint.get('vocab'),
            'quantized': True
        }

    else:  # efficientnet — classes already imported above
        tokenizer = AutoTokenizer.from_pretrained('gpt2')
        tokenizer.pad_token = tokenizer.eos_token
        special_tokens = {'additional_special_tokens': ['<start>', '<end>']}
        tokenizer.add_special_tokens(special_tokens)

        # Architecture hyperparameters must match training.
        encoder = Encoder(model_name='efficientnet_b3', embed_dim=512)
        decoder = Decoder(
            vocab_size=len(tokenizer),
            embed_dim=512,
            num_layers=8,
            num_heads=8,
            max_seq_length=64
        )
        model = ImageCaptioningModel(encoder, decoder)

        # Handle both {'model_state': ...} checkpoints and raw state dicts.
        if 'model_state' in checkpoint:
            model.load_state_dict(checkpoint['model_state'])
        else:
            model.load_state_dict(checkpoint)

        model.eval()

        # Quantize the full model in one pass.
        model_quantized = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear, torch.nn.Conv2d}, dtype=torch.qint8
        )

        quantized_checkpoint = {
            'model_state': model_quantized.state_dict(),
            'quantized': True
        }

    torch.save(quantized_checkpoint, output_path)

    # Report the size reduction.
    original_size = os.path.getsize(checkpoint_path) / (1024 * 1024)  # MB
    quantized_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
    reduction = (1 - quantized_size / original_size) * 100

    print(f"✓ Quantization complete!")
    print(f"  Original size: {original_size:.2f} MB")
    print(f"  Quantized size: {quantized_size:.2f} MB")
    print(f"  Size reduction: {reduction:.1f}%")

    return output_path
140
+
141
+
142
def optimize_state_dict(checkpoint_path, output_path):
    """
    Strip training-only entries from a checkpoint to shrink it.

    Everything except optimizer/scheduler/epoch/loss/metrics is kept and
    re-saved with the zipfile serializer.
    """
    print(f"Optimizing state dict...")

    # Import classes before loading (required for unpickling); skip silently
    # when the training module is unavailable in this environment.
    try:
        from resnet_train import Vocabulary
        import __main__
        if not hasattr(__main__, 'Vocabulary'):
            __main__.Vocabulary = Vocabulary
    except ImportError:
        pass

    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)

    # Keep only inference-relevant entries.
    training_only = {'optimizer', 'scheduler', 'epoch', 'loss', 'metrics'}
    optimized = {key: value for key, value in checkpoint.items() if key not in training_only}

    # Save with highest compression (zipfile serializer).
    torch.save(optimized, output_path, _use_new_zipfile_serialization=True)

    mb = 1024 * 1024
    original_size = os.path.getsize(checkpoint_path) / mb
    optimized_size = os.path.getsize(output_path) / mb
    reduction = (1 - optimized_size / original_size) * 100

    print(f"✓ State dict optimized!")
    print(f"  Original: {original_size:.2f} MB")
    print(f"  Optimized: {optimized_size:.2f} MB")
    print(f"  Reduction: {reduction:.1f}%")

    return output_path
179
+
180
+
181
def create_torchscript(checkpoint_path, output_path, model_type='resnet'):
    """
    Convert model to TorchScript for faster loading and inference.

    Only the ENCODER is traced for both model types; the decoder's
    dynamic/recurrent inputs are not traceable here. The traced module is
    written to output_path with '.pth' replaced by '_encoder.pt'.

    Args:
        checkpoint_path: Path to the trained checkpoint.
        output_path: Base output path (suffix rewritten as noted above).
        model_type: 'resnet' or 'efficientnet'.

    Note: Requires example input for tracing.
    """
    print(f"Creating TorchScript model...")

    device = torch.device('cpu')

    # Import classes before loading (required for unpickling)
    if model_type == 'resnet':
        import resnet_train
        from resnet_train import EncoderCNN, DecoderRNN, Vocabulary

        # Make Vocabulary available in __main__ for unpickling — handles
        # checkpoints saved when Vocabulary lived in the __main__ module.
        import __main__
        if not hasattr(__main__, 'Vocabulary'):
            __main__.Vocabulary = Vocabulary
    elif model_type == 'efficientnet':
        from efficient_train import Encoder, Decoder, ImageCaptioningModel
        from transformers import AutoTokenizer

    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    if model_type == 'resnet':
        # Update vocab from checkpoint BEFORE creating DecoderRNN:
        # DecoderRNN sizes its layers from the module-level vocab.
        if 'vocab' in checkpoint and checkpoint['vocab'] is not None:
            resnet_train.vocab = checkpoint['vocab']
            print(f"  Updated vocab size: {len(checkpoint['vocab'].word2idx)}")
        else:
            raise ValueError("Checkpoint does not contain 'vocab' key. Cannot proceed.")

        encoder = EncoderCNN().eval()
        decoder = DecoderRNN().eval()  # Now uses the correct vocab size

        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])

        # Trace encoder with a dummy 224x224 RGB batch.
        dummy_image = torch.randn(1, 3, 224, 224)
        encoder_traced = torch.jit.trace(encoder, dummy_image)

        # For decoder, we need to trace with proper inputs
        # This is more complex due to RNN structure
        print("  ⚠ TorchScript for RNN decoder may require manual scripting")
        print("  ✓ Encoder traced successfully")

        torch.jit.save(encoder_traced, output_path.replace('.pth', '_encoder.pt'))

    elif model_type == 'efficientnet':
        # Classes already imported above; rebuild tokenizer to recover
        # the training-time vocab size for the Decoder constructor.
        tokenizer = AutoTokenizer.from_pretrained('gpt2')
        tokenizer.pad_token = tokenizer.eos_token
        special_tokens = {'additional_special_tokens': ['<start>', '<end>']}
        tokenizer.add_special_tokens(special_tokens)

        encoder = Encoder(model_name='efficientnet_b3', embed_dim=512)
        decoder = Decoder(
            vocab_size=len(tokenizer),
            embed_dim=512,
            num_layers=8,
            num_heads=8,
            max_seq_length=64
        )
        model = ImageCaptioningModel(encoder, decoder).eval()

        model.load_state_dict(checkpoint['model_state'])

        # Trace encoder only (decoder has dynamic inputs)
        dummy_image = torch.randn(1, 3, 224, 224)
        encoder_traced = torch.jit.trace(model.encoder, dummy_image)

        torch.jit.save(encoder_traced, output_path.replace('.pth', '_encoder.pt'))
        print("  ✓ Encoder traced successfully")

    print(f"✓ TorchScript saved to {output_path}")
    return output_path
258
+
259
+
260
def main():
    """CLI entry point: parse arguments and run the selected optimizations."""
    parser = argparse.ArgumentParser(description='Optimize models for production deployment')
    parser.add_argument('--model', type=str, choices=['resnet', 'efficientnet', 'both'],
                        default='both', help='Model to optimize')
    parser.add_argument('--method', type=str, choices=['quantize', 'optimize', 'torchscript', 'all'],
                        default='all', help='Optimization method')
    parser.add_argument('--resnet-path', type=str, default='resnet_best_model.pth',
                        help='Path to ResNet checkpoint')
    parser.add_argument('--efficientnet-path', type=str, default='efficient_best_model.pth',
                        help='Path to EfficientNet checkpoint')
    parser.add_argument('--output-dir', type=str, default='optimized_models',
                        help='Output directory for optimized models')
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # Collect (type, path) pairs for every requested model that exists on disk.
    labels = {'resnet': 'ResNet', 'efficientnet': 'EfficientNet'}
    candidates = [('resnet', args.resnet_path), ('efficientnet', args.efficientnet_path)]
    models_to_process = []
    for kind, path in candidates:
        if args.model not in (kind, 'both'):
            continue
        if os.path.exists(path):
            models_to_process.append((kind, path))
        else:
            print(f"⚠ Warning: {path} not found, skipping {labels[kind]}")

    if not models_to_process:
        print("❌ No models found to optimize!")
        return

    banner = '=' * 60
    for model_type, model_path in models_to_process:
        print(f"\n{banner}")
        print(f"Processing {model_type.upper()} model")
        print(f"{banner}")

        # Output files are named <type>_<original stem>_<method suffix>.
        output_base = os.path.join(args.output_dir, f"{model_type}_{Path(model_path).stem}")

        if args.method in ('quantize', 'all'):
            quantize_model(model_path, f"{output_base}_quantized.pth", model_type)

        if args.method in ('optimize', 'all'):
            optimize_state_dict(model_path, f"{output_base}_optimized.pth")

        if args.method in ('torchscript', 'all'):
            create_torchscript(model_path, f"{output_base}_torchscript.pt", model_type)

    print(f"\n{banner}")
    print("✓ Optimization complete!")
    print(f"Optimized models saved to: {args.output_dir}")
    print(f"{banner}")
319
+
320
+
321
if __name__ == '__main__':
    # CLI entry point.
    main()
323
+
scripts/resnet_caption.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
"""CLI for captioning one image with the ResNet encoder/decoder checkpoint."""
import argparse
import torch
from PIL import Image  # NOTE(review): unused in this file's visible code — presumably needed by resnet_train imports; confirm before removing
import nltk
# Ensure the NLTK 'punkt' tokenizer data is present (downloaded quietly if missing).
nltk.download('punkt', quiet=True)

# Import the necessary components from resnet_train.py
from resnet_train import EncoderCNN, DecoderRNN, visualize_attention, CONFIG, Vocabulary
import resnet_train  # To update its global vocab variable
11
+
12
def main():
    """Load a ResNet captioning checkpoint and print a caption for one image."""
    parser = argparse.ArgumentParser(description="Generate image caption from a trained model.")
    parser.add_argument("--image", type=str, required=True, help="Path to the input image")
    parser.add_argument("--checkpoint", type=str, required=True, help="Path to the trained model checkpoint")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load checkpoint (weights_only=False: it contains a pickled Vocabulary object).
    checkpoint = torch.load(args.checkpoint, map_location=device, weights_only=False)

    # BUGFIX: update the global vocabulary BEFORE constructing DecoderRNN.
    # DecoderRNN.__init__ sizes its layers from resnet_train.vocab (see the
    # matching note in scripts/optimize_models.py), so building the decoder
    # first caused a state-dict size mismatch whenever the checkpoint's
    # vocab differed from the module-level default.
    resnet_train.vocab = checkpoint['vocab']

    # Initialize models (now sized against the checkpoint vocabulary).
    encoder = EncoderCNN().to(device)
    decoder = DecoderRNN().to(device)

    # Load state dictionaries
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])

    # Generate caption using the provided image path
    caption = visualize_attention(args.image, encoder, decoder, device)
    print(caption)
37
+
38
if __name__ == "__main__":
    # CLI entry point.
    main()
static/css/custom.css ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Card container: rounded corners with a drop shadow. */
.card {
    border-radius: 1rem;
    box-shadow: 0 0.5rem 1rem rgba(0, 0, 0, 0.15);
}

/* Header matches the card radius so corners stay rounded. */
.card-header {
    border-top-left-radius: 1rem !important;
    border-top-right-radius: 1rem !important;
    background-color: var(--bs-dark);
}

/* Keep the uploaded-image preview bounded and undistorted. */
#previewImage {
    max-height: 400px;
    width: auto;
    object-fit: contain;
}

.form-check {
    margin-bottom: 0.5rem;
}

.alert {
    margin-bottom: 0;
}

.btn-primary {
    padding: 0.5rem 1.5rem;
}

/* Custom upload button styling:
   the real <input type="file"> is hidden; .upload-app is a styled label
   that forwards clicks to it, and the .file-selected class (added by
   main.js on file choice) drives the checkmark animation below. */
.upload-container {
    position: relative;
    width: 120px;
    height: 42px;
    margin: 0 auto;
}

.upload-container input[type="file"] {
    display: none;
}

.upload-app {
    display: block;
    position: relative;
    width: 120px;
    height: 42px;
    transition: 0.3s ease width;
    cursor: pointer;
}

.upload-btn {
    position: absolute;
    top: 0;
    right: 0;
    bottom: 0;
    left: 0;
    background-color: var(--bs-dark);
    border: 2px solid var(--bs-border-color);
    border-radius: 0.375rem;
    overflow: hidden;
}

/* "Upload" label rendered via a pseudo-element. */
.upload-btn:before {
    content: "Upload";
    position: absolute;
    top: 50%;
    left: 45%;
    transform: translate(-50%, -50%);
    color: var(--bs-body-color);
    font-size: 14px;
    font-weight: bold;
    transition: opacity 0.3s ease;
}

.file-selected .upload-btn:before {
    opacity: 0;
}

.upload-arrow {
    position: absolute;
    top: 0;
    right: 0;
    width: 38px;
    height: 38px;
    background-color: var(--bs-dark);
    transition: opacity 0.3s ease;
}

.file-selected .upload-arrow {
    opacity: 0;
}

/* Arrowhead drawn from two rotated bars. */
.upload-arrow:before,
.upload-arrow:after {
    content: "";
    position: absolute;
    top: 18px;
    width: 10px;
    height: 2px;
    background-color: var(--bs-body-color);
}

.upload-arrow:before {
    right: 17px;
    transform: rotateZ(-45deg);
}

.upload-arrow:after {
    right: 11px;
    transform: rotateZ(45deg);
}

/* Green success badge: hidden (scale 0) until .file-selected is applied. */
.upload-success {
    position: absolute;
    top: 50%;
    left: 50%;
    width: 24px;
    height: 24px;
    margin: 0;
    background-color: var(--bs-success);
    transform: translate(-50%, -50%) scale(0);
    border-radius: 50%;
    opacity: 0;
    transition: transform 0.3s ease, opacity 0.3s ease;
}

.upload-success i {
    font-size: 16px;
    color: #fff;
    position: absolute;
    top: 50%;
    left: 50%;
    transform: translate(-50%, -50%) scale(0);
    transition: transform 0.3s ease 0.1s;
}

.file-selected .upload-success {
    transform: translate(-50%, -50%) scale(1);
    opacity: 1;
}

.file-selected .upload-success i {
    transform: translate(-50%, -50%) scale(1);
}
static/js/main.js ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Client-side logic for the caption page: previews the chosen image,
// POSTs it (plus the selected model) to /upload, and renders the caption
// or an error message.
document.addEventListener('DOMContentLoaded', function() {
    const form = document.getElementById('uploadForm');
    const imageInput = document.getElementById('imageInput');
    const submitBtn = document.getElementById('submitBtn');
    const spinner = submitBtn.querySelector('.spinner-border');
    const resultSection = document.getElementById('resultSection');
    const previewImage = document.getElementById('previewImage');
    const captionText = document.getElementById('captionText');
    const errorAlert = document.getElementById('errorAlert');
    const uploadApp = document.querySelector('.upload-app');

    // Preview image when selected (local data URL — no server round-trip).
    imageInput.addEventListener('change', function(e) {
        const file = e.target.files[0];
        if (file) {
            const reader = new FileReader();
            reader.onload = function(e) {
                previewImage.src = e.target.result;
                resultSection.classList.remove('d-none');
                captionText.textContent = '';
                errorAlert.classList.add('d-none');

                // Add success animation class (drives the CSS checkmark).
                uploadApp.classList.add('file-selected');
            };
            reader.readAsDataURL(file);
        }
    });

    form.addEventListener('submit', async function(e) {
        e.preventDefault();

        const formData = new FormData();
        const file = imageInput.files[0];

        if (!file) {
            showError('Please select an image first.');
            return;
        }

        // Add file and selected model to form data
        formData.append('image', file);
        formData.append('model', document.querySelector('input[name="model"]:checked').value);

        // Show loading state
        setLoading(true);

        try {
            const response = await fetch('/upload', {
                method: 'POST',
                body: formData
            });

            const data = await response.json();

            if (!response.ok) {
                // Prefer the server-provided error message when present.
                throw new Error(data.error || 'Failed to generate caption');
            }

            // Display the caption
            captionText.textContent = data.caption;
            resultSection.classList.remove('d-none');
            errorAlert.classList.add('d-none');

        } catch (error) {
            showError(error.message || 'An error occurred while generating the caption');
        } finally {
            setLoading(false);
        }
    });

    // Toggle the busy state of the submit button.
    // NOTE(review): assigning textContent detaches the spinner element; the
    // retained `spinner` reference is re-attached via prepend() on the next
    // loading cycle, so the widget still works across toggles.
    function setLoading(isLoading) {
        submitBtn.disabled = isLoading;
        spinner.classList.toggle('d-none', !isLoading);
        submitBtn.textContent = isLoading ? ' Processing...' : 'Generate Caption';
        if (isLoading) {
            submitBtn.prepend(spinner);
        }
    }

    // Surface an error message in the alert box.
    function showError(message) {
        errorAlert.textContent = message;
        errorAlert.classList.remove('d-none');
    }
});
+ });
templates/index.html ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en" data-bs-theme="dark">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Image Caption Generator</title>
    <link href="https://cdn.replit.com/agent/bootstrap-agent-dark-theme.min.css" rel="stylesheet">
    <link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
    <link href="{{ url_for('static', filename='css/custom.css') }}" rel="stylesheet">
</head>
<body>
    <div class="container py-5">
        <div class="row justify-content-center">
            <div class="col-md-8">
                <div class="card">
                    <div class="card-header">
                        <h2 class="text-center mb-0">Image Caption Generator</h2>
                    </div>
                    <div class="card-body">
                        <!-- Upload form: model choice + hidden file input; submitted via fetch() in static/js/main.js -->
                        <form id="uploadForm">
                            <div class="mb-4">
                                <label class="form-label">Select Model:</label>
                                <div class="form-check">
                                    <input class="form-check-input" type="radio" name="model" id="efficientnet" value="efficientnet" checked>
                                    <label class="form-check-label" for="efficientnet">
                                        EfficientNet-B3
                                    </label>
                                </div>
                            </div>

                            <div class="mb-4">
                                <label class="form-label d-block text-center">Upload Image:</label>
                                <!-- Styled upload widget; the real input is hidden by custom.css -->
                                <div class="upload-container">
                                    <label class="upload-app">
                                        <input type="file" id="imageInput" accept="image/png,image/jpeg,image/jpg" required>
                                        <div class="upload-btn">
                                            <div class="upload-arrow"></div>
                                            <div class="upload-success">
                                                <i class="bi bi-check"></i>
                                            </div>
                                        </div>
                                    </label>
                                </div>
                            </div>

                            <div class="text-center">
                                <button type="submit" class="btn btn-primary" id="submitBtn">
                                    <span class="spinner-border spinner-border-sm d-none" role="status" aria-hidden="true"></span>
                                    Generate Caption
                                </button>
                            </div>
                        </form>

                        <!-- Populated by main.js after a successful /upload response -->
                        <div id="resultSection" class="mt-4 d-none">
                            <div class="text-center">
                                <img id="previewImage" class="img-fluid mb-3 rounded" alt="Uploaded image">
                                <div id="captionText" class="alert alert-info"></div>
                            </div>
                        </div>

                        <div id="errorAlert" class="alert alert-danger mt-3 d-none"></div>
                    </div>
                </div>
            </div>
        </div>
    </div>

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
    <script src="{{ url_for('static', filename='js/main.js') }}"></script>
</body>
</html>
training/__pycache__/efficient_train.cpython-314.pyc ADDED
Binary file (25.9 kB). View file
 
training/__pycache__/resnet_train.cpython-314.pyc ADDED
Binary file (33.4 kB). View file
 
training/efficient_train.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import random
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.optim as optim
8
+ from torch.utils.data import Dataset, DataLoader, random_split
9
+ from torchvision import transforms
10
+ from timm import create_model
11
+ from transformers import AutoTokenizer
12
+ from pycocotools.coco import COCO
13
+ from datetime import datetime
14
+ from PIL import Image
15
+
16
+ # Distributed training imports
17
+ import torch.distributed as dist
18
+ from torch.nn.parallel import DistributedDataParallel as DDP
19
+
20
+ # ------------------- DDP Setup Functions ------------------- #
21
def setup_distributed():
    """Join the default process group for DDP training (NCCL backend, GPU-only)."""
    dist.init_process_group(backend='nccl')
23
+
24
def cleanup_distributed():
    """Tear down the default process group after distributed training."""
    dist.destroy_process_group()
26
+
27
# ------------------- Configuration and Constants ------------------- #
# Shared defaults for dataset tokenization and model construction.
DEFAULT_MAX_SEQ_LENGTH = 64  # caption token budget, incl. <start>/<end>
DEFAULT_EMBED_DIM = 512      # encoder projection / decoder embedding width
DEFAULT_NUM_LAYERS = 8       # transformer decoder depth
DEFAULT_NUM_HEADS = 8        # attention heads per decoder layer
32
+
33
+ # ------------------- Data Preparation ------------------- #
34
class CocoCaptionDataset(Dataset):
    """Custom COCO dataset that returns image-caption pairs with processing.

    Each item is (image_tensor, token_ids) where token_ids is a fixed-length
    (max_seq_length) GPT-2 encoding of "<start> {caption} <end>", padded with
    the EOS token. A random caption is drawn per access, so repeated epochs
    see different captions for the same image.
    """
    def __init__(self, root, ann_file, transform=None, max_seq_length=DEFAULT_MAX_SEQ_LENGTH):
        self.coco = COCO(ann_file)
        self.root = root
        self.transform = transform
        self.max_seq_length = max_seq_length
        self.ids = list(self.coco.imgs.keys())

        # Initialize tokenizer with special tokens; GPT-2 has no pad token,
        # so EOS doubles as padding.
        self.tokenizer = AutoTokenizer.from_pretrained('gpt2')
        self.tokenizer.pad_token = self.tokenizer.eos_token
        special_tokens = {'additional_special_tokens': ['<start>', '<end>']}
        self.tokenizer.add_special_tokens(special_tokens)
        self.vocab_size = len(self.tokenizer)

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.root, img_info['file_name'])
        img = Image.open(img_path).convert('RGB')

        # Get random caption from available annotations
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)
        caption = random.choice(anns)['caption']

        # Apply transforms
        if self.transform:
            img = self.transform(img)

        # Tokenize caption with special tokens, padded/truncated to a fixed length.
        caption = f"<start> {caption} <end>"
        inputs = self.tokenizer(
            caption,
            padding='max_length',
            max_length=self.max_seq_length,
            truncation=True,
            return_tensors='pt',
        )
        return img, inputs.input_ids.squeeze(0)
78
+
79
class CocoTestDataset(Dataset):
    """COCO test-split dataset: images only, no caption annotations.

    Yields (image, filename) pairs so downstream code can match predictions
    back to their source files.
    """
    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        # Sorted listing gives a stable ordering; every entry in `root`
        # is assumed to be an image file.
        self.img_files = sorted(os.listdir(root))

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        filename = self.img_files[idx]
        image = Image.open(os.path.join(self.root, filename)).convert('RGB')
        if self.transform:
            image = self.transform(image)
        # The filename travels with the tensor for later reference.
        return image, filename
97
+
98
+ # ------------------- Model Architecture ------------------- #
99
class Encoder(nn.Module):
    """CNN encoder using timm models.

    forward() returns a sequence of projected spatial features with shape
    (batch, H*W, embed_dim), usable as cross-attention memory for the
    transformer Decoder.
    """
    def __init__(self, model_name='efficientnet_b3', embed_dim=DEFAULT_EMBED_DIM):
        super().__init__()
        # num_classes=0 + global_pool='' keeps the raw spatial feature map
        # instead of a pooled classification vector.
        self.backbone = create_model(
            model_name,
            pretrained=True,
            num_classes=0,
            global_pool='',
            features_only=False
        )

        # Get output channels from backbone via a throwaway forward pass
        # (avoids hard-coding per-architecture channel counts).
        with torch.no_grad():
            dummy = torch.randn(1, 3, 224, 224)
            features = self.backbone(dummy)
            in_features = features.shape[1]

        self.projection = nn.Linear(in_features, embed_dim)

    def forward(self, x):
        features = self.backbone(x)  # (batch, channels, height, width)
        batch_size, channels, height, width = features.shape
        # Flatten the spatial grid into a token sequence: (batch, H*W, channels).
        features = features.permute(0, 2, 3, 1).reshape(batch_size, -1, channels)
        return self.projection(features)
124
+
125
class Decoder(nn.Module):
    """Transformer decoder with learned positional embeddings and causal masking.

    Args:
        vocab_size: output vocabulary size (logit dimension).
        embed_dim: token/positional embedding width (model dimension).
        num_layers: stacked TransformerDecoder layers.
        num_heads: attention heads (embed_dim must be divisible by this).
        max_seq_length: longest decodable sequence; sizes the positional
            table and the pre-built causal mask.
        dropout: dropout on embeddings and inside the decoder layers.

    Bug fix: ``forward`` previously accepted ``tgt_mask`` but silently
    ignored it, always using the internal causal mask. An explicitly
    passed mask is now honored; the causal mask remains the default.
    """

    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, max_seq_length, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Embedding(max_seq_length, embed_dim)
        self.dropout = nn.Dropout(dropout)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dropout=dropout,
            batch_first=False  # we permute to (seq, batch, dim) in forward
        )
        self.layers = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)
        self.max_seq_length = max_seq_length

        # Upper-triangular -inf mask; a buffer so it follows .to(device).
        self.register_buffer(
            "causal_mask",
            torch.triu(torch.full((max_seq_length, max_seq_length), float('-inf')), diagonal=1)
        )

    def forward(self, x, memory, tgt_mask=None):
        """Decode token ids `x` (batch, seq) against encoder `memory`.

        Returns logits of shape (batch, seq, vocab_size). When `tgt_mask`
        is None the pre-computed causal mask is used.
        """
        seq_length = x.size(1)
        positions = torch.arange(0, seq_length, device=x.device).unsqueeze(0)
        x_emb = self.embedding(x) + self.positional_encoding(positions)
        x_emb = self.dropout(x_emb)

        # Reshape to the (seq, batch, features) transformer convention.
        x_emb = x_emb.permute(1, 0, 2)
        memory = memory.permute(1, 0, 2)

        # Honor an explicit mask; fall back to causal masking.
        mask = tgt_mask if tgt_mask is not None else self.causal_mask[:seq_length, :seq_length]
        output = self.layers(
            x_emb,
            memory,
            tgt_mask=mask
        )
        return self.fc(output.permute(1, 0, 2))
167
+
168
class ImageCaptioningModel(nn.Module):
    """Complete image captioning model: CNN encoder + transformer decoder."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, images, captions, tgt_mask=None):
        """Encode `images` into memory, then decode `captions` against it.

        Bug fix: `tgt_mask` used to be accepted but dropped; it is now
        forwarded to the decoder (which defaults to causal masking when
        it is None).
        """
        memory = self.encoder(images)
        return self.decoder(captions, memory, tgt_mask=tgt_mask)
178
+
179
+ # ------------------- Inference Utility ------------------- #
180
def generate_caption(model, image, tokenizer, device, max_length=DEFAULT_MAX_SEQ_LENGTH):
    """
    Generate a caption for a single image using greedy decoding.
    Assumes the tokenizer has '<start>' and '<end>' as special tokens.
    """
    model.eval()
    # Unwrap DDP once instead of branching at every call site.
    core = model.module if isinstance(model, DDP) else model
    with torch.no_grad():
        batched = image.unsqueeze(0)  # shape: (1, 3, H, W)
        memory = core.encoder(batched)

        start_id = tokenizer.convert_tokens_to_ids("<start>")
        end_id = tokenizer.convert_tokens_to_ids("<end>")
        generated = [start_id]
        # Greedily append tokens until <end> or the length budget is hit.
        while len(generated) < max_length:
            decoder_input = torch.tensor(generated, device=device).unsqueeze(0)
            logits = core.decoder(decoder_input, memory)
            best = logits[0, -1, :].argmax().item()
            generated.append(best)
            if best == end_id:
                break
    return tokenizer.decode(generated, skip_special_tokens=True)
208
+
209
+ # ------------------- Training Utilities ------------------- #
210
def create_dataloaders(args):
    """Build train/val/test dataloaders with appropriate transforms.

    Returns (train_loader, val_loader, test_loader, tokenizer, train_set);
    the training set is returned so distributed callers can reach its
    sampler.
    """
    normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

    # Augmented pipeline for training, deterministic pipeline for eval.
    train_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    eval_transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])

    # Datasets: captions for train/val, images only for test.
    train_set = CocoCaptionDataset(
        root=args.train_image_dir,
        ann_file=args.train_ann_file,
        transform=train_transform,
    )
    val_set = CocoCaptionDataset(
        root=args.val_image_dir,
        ann_file=args.val_ann_file,
        transform=eval_transform,
    )
    test_set = CocoTestDataset(root=args.test_image_dir, transform=eval_transform)

    # Distributed runs shard the training set via a sampler.
    train_sampler = (
        torch.utils.data.distributed.DistributedSampler(train_set)
        if args.distributed
        else None
    )

    # GPU runs get pinned memory, more workers, and persistent workers.
    on_gpu = torch.cuda.is_available()
    num_workers = 8 if on_gpu else 4

    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        shuffle=train_sampler is None,
        sampler=train_sampler,
        num_workers=num_workers,
        pin_memory=on_gpu,
        persistent_workers=on_gpu,
        prefetch_factor=2 if num_workers > 0 else None,  # Prefetch batches
    )
    val_loader = DataLoader(
        val_set,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=on_gpu,
        persistent_workers=on_gpu,
    )
    # Inference processes one image at a time.
    test_loader = DataLoader(
        test_set,
        batch_size=1,
        shuffle=False,
        num_workers=num_workers,
    )

    return train_loader, val_loader, test_loader, train_set.tokenizer, train_set
282
+
283
def train_epoch(model, loader, optimizer, criterion, scaler, scheduler, device, args):
    """Run one training epoch and return the mean (unscaled) batch loss.

    Fixes over the previous version:
      * gradients are no longer zeroed on every batch, so gradient
        accumulation (``args.grad_accum`` > 1) actually accumulates —
        previously the per-batch ``optimizer.zero_grad()`` discarded all
        gradients from non-stepping batches;
      * the loss used for backward is divided by ``args.grad_accum`` so
        the accumulated gradient matches the large-batch gradient;
      * a trailing partial accumulation window is flushed at epoch end
        instead of being dropped.
    The reported loss remains the unscaled per-batch value.
    """
    model.train()
    total_loss = 0.0
    if args.distributed:
        loader.sampler.set_epoch(args.epoch)

    optimizer.zero_grad()
    pending = False  # True while gradients are accumulated but not yet applied

    for batch_idx, (images, captions) in enumerate(loader):
        images = images.to(device)
        captions = captions.to(device)

        # Teacher forcing: use shifted captions as decoder input
        decoder_input = captions[:, :-1]
        targets = captions[:, 1:].contiguous()

        # Use new API for PyTorch 2.6+
        if hasattr(torch.amp, 'autocast'):
            autocast_context = torch.amp.autocast('cuda', enabled=args.use_amp)
        else:
            autocast_context = torch.cuda.amp.autocast(enabled=args.use_amp)

        with autocast_context:
            logits = model(images, decoder_input)
            loss = criterion(
                logits.view(-1, logits.size(-1)),
                targets.view(-1)
            )

        # Scale down so accumulated gradients average over the window.
        scaler.scale(loss / args.grad_accum).backward()
        pending = True

        if (batch_idx + 1) % args.grad_accum == 0:
            scaler.step(optimizer)
            scaler.update()
            # Only step scheduler if it's provided and supports per-step updates
            if scheduler is not None:
                scheduler.step()  # Update learning rate
            optimizer.zero_grad()
            pending = False

        total_loss += loss.item()

    # Flush a leftover partial window so its gradients are not dropped.
    if pending:
        scaler.step(optimizer)
        scaler.update()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()

    return total_loss / len(loader)
323
+
324
def validate(model, loader, criterion, device):
    """Evaluate `model` on `loader`; return the mean batch loss."""
    model.eval()
    running = 0.0
    with torch.no_grad():
        for images, captions in loader:
            images, captions = images.to(device), captions.to(device)

            # Same teacher-forcing shift as in training.
            decoder_input, targets = captions[:, :-1], captions[:, 1:].contiguous()

            logits = model(images, decoder_input)
            batch_loss = criterion(
                logits.view(-1, logits.size(-1)),
                targets.view(-1)
            )
            running += batch_loss.item()

    return running / len(loader)
342
+
343
def main(args):
    """End-to-end training entry point: data, model, optimisation, test demo.

    Honors `args.distributed` (DDP over NCCL), mixed precision
    (`args.use_amp`), checkpoint resume (`args.resume_checkpoint`) and
    early stopping on validation loss.
    """
    if args.distributed:
        setup_distributed()

    # One CUDA device per local rank under DDP; otherwise best available.
    device = torch.device("cuda", args.local_rank) if args.distributed else torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Seed every RNG the pipeline touches for reproducibility.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # Create dataloaders and obtain tokenizer and training dataset (for sampler)
    train_loader, val_loader, test_loader, tokenizer, train_set = create_dataloaders(args)

    # Initialize model
    encoder = Encoder(args.model_name, args.embed_dim)
    decoder = Decoder(
        vocab_size=tokenizer.vocab_size + 2,  # +2 for the added <start>/<end> specials
        embed_dim=args.embed_dim,
        num_layers=args.num_layers,
        num_heads=args.num_heads,
        max_seq_length=DEFAULT_MAX_SEQ_LENGTH,
        dropout=0.1
    )
    model = ImageCaptioningModel(encoder, decoder).to(device)

    if args.distributed:
        model = DDP(model, device_ids=[args.local_rank])

    # Set up training components
    optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    # Use new API for PyTorch 2.6+
    if hasattr(torch.amp, 'GradScaler'):
        scaler = torch.amp.GradScaler('cuda', enabled=args.use_amp)
    else:
        scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
    # Per-step cosine decay spanning the whole run.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=args.epochs * len(train_loader),
        eta_min=1e-6
    )
    best_val_loss = float('inf')
    patience_counter = 0

    # Support resume training
    start_epoch = 0
    if args.resume_checkpoint:
        # Handle PyTorch 2.6+ security: allow tokenizer classes
        try:
            from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
            torch.serialization.add_safe_globals([GPT2TokenizerFast])
        except ImportError:
            pass

        # Load checkpoint (weights_only=False for backward compatibility with tokenizer)
        checkpoint = torch.load(args.resume_checkpoint, map_location=device, weights_only=False)
        if args.distributed:
            model.module.load_state_dict(checkpoint['model_state'])
        else:
            model.load_state_dict(checkpoint['model_state'])
        optimizer.load_state_dict(checkpoint['optimizer_state'])
        # NOTE(review): 'scheduler_state' is saved below but never restored
        # here, so a resumed run restarts the cosine schedule — confirm intended.
        start_epoch = checkpoint['epoch'] + 1
        best_val_loss = checkpoint.get('val_loss', best_val_loss)
        print(f"Resumed training from epoch {start_epoch}")

    # Training loop
    for epoch in range(start_epoch, args.epochs):
        args.epoch = epoch  # Useful for the sampler in distributed training
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)
        if args.local_rank == 0 or not args.distributed:
            print(f"Epoch {epoch+1}/{args.epochs}")
        train_loss = train_epoch(
            model, train_loader, optimizer, criterion, scaler, scheduler, device, args
        )
        val_loss = validate(model, val_loader, criterion, device)
        if args.local_rank == 0 or not args.distributed:
            print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        # Checkpointing: keep only the best-validation model.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save({
                'epoch': epoch,
                'model_state': model.module.state_dict() if args.distributed else model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
                'scheduler_state': scheduler.state_dict(),
                'val_loss': val_loss,
                'tokenizer': tokenizer,
            }, os.path.join(args.checkpoint_dir, 'best_model.pth'))
        else:
            patience_counter += 1

        if patience_counter >= args.early_stopping_patience:
            print("Early stopping triggered")
            break

    # Inference demo: caption the first few test images (rank 0 only).
    if args.local_rank == 0 or not args.distributed:
        print("\nGenerating captions on test set images:")
        model.eval()
        for idx, (image, filename) in enumerate(test_loader):
            image = image.to(device).squeeze(0)
            caption = generate_caption(model, image, tokenizer, device)
            # NOTE(review): `filename` is unpacked but not printed — the
            # literal "(unknown)" below looks like a lost f-string field.
            print(f"(unknown): {caption}")
            if idx >= 4:
                break

    if args.distributed:
        cleanup_distributed()
454
+
455
+
456
if __name__ == "__main__":
    # Command-line interface; all paths are required, everything else has
    # sensible defaults matching the constants above.
    parser = argparse.ArgumentParser()
    # Data arguments
    parser.add_argument('--train_image_dir', type=str, required=True)
    parser.add_argument('--train_ann_file', type=str, required=True)
    parser.add_argument('--val_image_dir', type=str, required=True)
    parser.add_argument('--val_ann_file', type=str, required=True)
    parser.add_argument('--test_image_dir', type=str, required=True)  # Test set images only

    # Model arguments
    parser.add_argument('--model_name', type=str, default='efficientnet_b3')
    parser.add_argument('--embed_dim', type=int, default=DEFAULT_EMBED_DIM)
    parser.add_argument('--num_layers', type=int, default=DEFAULT_NUM_LAYERS)
    parser.add_argument('--num_heads', type=int, default=DEFAULT_NUM_HEADS)

    # Training arguments
    parser.add_argument('--batch_size', type=int, default=96)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--use_amp', action='store_true')
    parser.add_argument('--grad_accum', type=int, default=1)
    parser.add_argument('--checkpoint_dir', type=str, default='/workspace')
    parser.add_argument('--early_stopping_patience', type=int, default=3)

    # Distributed training arguments
    # Accept both --local_rank and --local-rank (torchrun passes the dashed form)
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0,
                        help="Local rank. Necessary for using distributed training.")
    parser.add_argument('--distributed', action='store_true', help="Use distributed training")

    # Resume training argument
    parser.add_argument('--resume_checkpoint', type=str, default=None, help="Path to checkpoint to resume training from.")

    args = parser.parse_args()

    # Override local_rank from environment variable if set (torchrun convention)
    if "LOCAL_RANK" in os.environ:
        args.local_rank = int(os.environ["LOCAL_RANK"])

    # Create checkpoint directory
    os.makedirs(args.checkpoint_dir, exist_ok=True)

    main(args)
training/hyperparameter_tuning.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hyperparameter Optimization using Optuna
3
+ Run this to find the best hyperparameters for your model
4
+ """
5
+
6
+ import optuna
7
+ import torch
8
+ import argparse
9
+ import os
10
+ import sys
11
+ from efficient_train import create_dataloaders, Encoder, Decoder, ImageCaptioningModel
12
+ from efficient_train import train_epoch, validate, generate_caption
13
+ import torch.nn as nn
14
+ import torch.optim as optim
15
+ from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
16
+
17
def train_with_config(trial, args):
    """Train a model with Optuna-suggested hyperparameters; return best val loss.

    Fixes over the previous version:
      * deprecated ``suggest_loguniform`` / ``suggest_uniform`` replaced by
        ``suggest_float(..., log=True)`` / ``suggest_float(...)``;
      * (embed_dim, num_heads) combinations where embed_dim is not divisible
        by num_heads would crash ``nn.TransformerDecoderLayer`` mid-trial —
        they are now pruned up-front;
      * ``ReduceLROnPlateau`` was passed into ``train_epoch``, which calls
        ``scheduler.step()`` without a metric (a TypeError); the plateau
        scheduler is now stepped only here, with the validation loss.
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Suggest hyperparameters (modern Optuna API)
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 96, 128])
    embed_dim = trial.suggest_categorical('embed_dim', [256, 512, 768])
    num_layers = trial.suggest_int('num_layers', 4, 12)
    num_heads = trial.suggest_categorical('num_heads', [4, 8, 12, 16])
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True)
    # NOTE(review): warmup_epochs is suggested (kept for search-space
    # compatibility) but not used by the training loop below.
    warmup_epochs = trial.suggest_int('warmup_epochs', 0, 3)

    # d_model must be divisible by nhead for nn.TransformerDecoderLayer.
    if embed_dim % num_heads != 0:
        raise optuna.exceptions.TrialPruned(
            f"embed_dim={embed_dim} not divisible by num_heads={num_heads}"
        )

    # Update args with suggested values
    args.lr = lr
    args.batch_size = batch_size
    args.embed_dim = embed_dim
    args.num_layers = num_layers
    args.num_heads = num_heads
    args.epochs = 5  # Fewer epochs for hyperparameter search

    # Create dataloaders
    train_loader, val_loader, test_loader, tokenizer, train_set = create_dataloaders(args)

    # Initialize model
    encoder = Encoder(args.model_name, embed_dim)
    decoder = Decoder(
        vocab_size=tokenizer.vocab_size + 2,
        embed_dim=embed_dim,
        num_layers=num_layers,
        num_heads=num_heads,
        max_seq_length=64,
        dropout=dropout
    )
    model = ImageCaptioningModel(encoder, decoder).to(device)

    # Optimizer / scheduler / loss / AMP scaler
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)

    # Training loop (fewer epochs for hyperparameter search)
    best_val_loss = float('inf')

    for epoch in range(args.epochs):
        # scheduler=None: the plateau scheduler must only be stepped with a
        # metric, which happens after validation below.
        train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler,
                                 None, device, args)

        # Validate
        val_loss = validate(model, val_loader, criterion, device)

        # Update scheduler with the metric it watches
        scheduler.step(val_loss)

        # Report to Optuna
        trial.report(val_loss, epoch)

        # Prune trial if not promising
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        if val_loss < best_val_loss:
            best_val_loss = val_loss

    return best_val_loss
92
+
93
+
94
def objective(trial):
    """Optuna objective: build a fixed args namespace, run one trial.

    Bug fix: ``optuna.exceptions.TrialPruned`` inherits from ``Exception``
    and was being swallowed by the broad handler below, so pruning never
    reached Optuna. It is now re-raised; only genuine failures return inf.
    """

    # Create minimal args object
    args = argparse.Namespace(
        train_image_dir='Data/train2017/train2017',
        train_ann_file='Data/annotations_trainval2017/annotations/captions_train2017.json',
        val_image_dir='Data/val2017',
        val_ann_file='Data/annotations_trainval2017/annotations/captions_val2017.json',
        test_image_dir='Data/test2017/test2017',
        model_name='efficientnet_b3',
        embed_dim=512,      # Will be overridden
        num_layers=8,       # Will be overridden
        num_heads=8,        # Will be overridden
        batch_size=96,      # Will be overridden
        lr=3e-4,            # Will be overridden
        epochs=5,
        seed=42,
        use_amp=True,
        grad_accum=1,
        checkpoint_dir='checkpoints',
        early_stopping_patience=3,
        distributed=False,
        local_rank=0,
        resume_checkpoint=None
    )

    try:
        return train_with_config(trial, args)
    except optuna.exceptions.TrialPruned:
        raise  # let Optuna record the trial as pruned
    except Exception as e:
        print(f"Trial failed: {e}")
        return float('inf')
127
+
128
+
129
def main():
    """CLI driver: create/load the Optuna study, optimise, report and plot."""
    parser = argparse.ArgumentParser(description='Hyperparameter optimization with Optuna')
    parser.add_argument('--n_trials', type=int, default=50, help='Number of trials')
    parser.add_argument('--timeout', type=int, default=3600*24, help='Timeout in seconds')
    parser.add_argument('--study_name', type=str, default='efficientnet_captioning',
                        help='Study name')
    parser.add_argument('--storage', type=str, default='sqlite:///optuna_study.db',
                        help='Storage URL for study')

    args = parser.parse_args()

    # Create or load study (SQLite storage makes runs resumable)
    study = optuna.create_study(
        direction='minimize',
        study_name=args.study_name,
        storage=args.storage,
        load_if_exists=True,
        pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=3)
    )

    print(f"Starting optimization with {args.n_trials} trials...")
    print(f"Study: {args.study_name}")

    # Optimize
    study.optimize(objective, n_trials=args.n_trials, timeout=args.timeout)

    # Print results
    print("\n" + "="*60)
    print("Optimization Complete!")
    print("="*60)
    print(f"Best trial: {study.best_trial.number}")
    print(f"Best validation loss: {study.best_value:.4f}")
    print("\nBest parameters:")
    for key, value in study.best_params.items():
        print(f"  {key}: {value}")

    # Save results
    import json
    with open('best_hyperparameters.json', 'w') as f:
        json.dump(study.best_params, f, indent=2)

    print("\nBest hyperparameters saved to best_hyperparameters.json")

    # Visualize (optional, requires plotly)
    try:
        import optuna.visualization as vis

        # Optimization history
        fig = vis.plot_optimization_history(study)
        fig.write_image("optimization_history.png")
        print("Saved optimization_history.png")

        # Parameter importances
        fig = vis.plot_param_importances(study)
        fig.write_image("param_importances.png")
        print("Saved param_importances.png")

        # Parallel coordinate plot
        fig = vis.plot_parallel_coordinate(study)
        fig.write_image("parallel_coordinate.png")
        print("Saved parallel_coordinate.png")

    except ImportError:
        print("Install plotly to generate visualizations: pip install plotly")


if __name__ == '__main__':
    main()
197
+
training/resnet_train.py ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import json
4
+ import torch
5
+ import nltk
6
+ import numpy as np
7
+ from PIL import Image
8
+ from pycocotools.coco import COCO
9
+ from torch.utils.data import Dataset, DataLoader
10
+ from torchvision import transforms
11
+ import torch.nn as nn
12
+ import torch.optim as optim
13
+ from collections import Counter
14
+ import matplotlib.pyplot as plt
15
+ from torchvision import models
16
+ from tqdm import tqdm
17
+ import torch.distributed as dist
18
+ import argparse
19
+
20
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
21
+ from nltk.translate.meteor_score import meteor_score
22
+
23
+ # Additional imports for extended metrics
24
+ from rouge import Rouge
25
+ from pycocoevalcap.cider.cider import Cider
26
+
27
+ nltk.download('punkt', quiet=True)
28
+ nltk.download('punkt_tab', quiet=True)
29
+ nltk.download('wordnet', quiet=True)
30
+
31
+ # ===========================
32
+ # CONFIGURATION
33
+ # ===========================
34
# Central configuration dictionary shared by the whole training script.
# NOTE(review): annotation paths are absolute Windows paths while image
# directories are relative — confirm they match the deployment layout.
CONFIG = {
    # Paths
    "train_ann": r"B:/!S3/Computer Vision/Project/annotations/captions_train2017.json",
    "val_ann": r"B:/!S3/Computer Vision/Project/annotations/captions_val2017.json",
    "train_img_dir": "images/train2017",
    "val_img_dir": "images/val2017",

    # Model
    "img_size": 224,            # input resolution fed to the CNN encoder
    "embed_size": 256,          # word-embedding width
    "hidden_size": 512,         # LSTM hidden-state width
    "attention_dim": 512,       # additive-attention projection width
    "feature_map_size": 14,  # From ResNet feature maps
    "dropout": 0.5,  # Dropout probability added

    # Training
    "batch_size": 176,
    "num_epochs": 30,
    "lr": 0.005,
    "fine_tune_encoder": True,  # False freezes the ResNet trunk
    "grad_clip": 5.0,

    # Vocabulary
    "vocab_threshold": 5,       # min word frequency to enter the vocabulary
    "max_len": 20,              # captions padded/truncated to this length

    # Beam search
    "beam_size": 3
}
+ }
63
+
64
+ # ===========================
65
+ # Vocabulary Builder
66
+ # ===========================
67
class Vocabulary:
    """Bidirectional word <-> index mapping built from COCO captions."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def build(self, coco, threshold):
        """Count caption tokens; register the special tokens plus every
        word that occurs at least `threshold` times."""
        counter = Counter()
        for ann_id in tqdm(list(coco.anns.keys())):
            tokens = nltk.word_tokenize(coco.anns[ann_id]['caption'].lower())
            counter.update(tokens)
        # Special tokens first so they get stable, low indices.
        for special in ('<pad>', '<start>', '<end>', '<unk>'):
            self.add_word(special)
        # Then everything frequent enough.
        for word, cnt in counter.items():
            if cnt >= threshold:
                self.add_word(word)

    def add_word(self, word):
        """Register `word` if unseen; existing words keep their index."""
        if word in self.word2idx:
            return
        self.word2idx[word] = self.idx
        self.idx2word[self.idx] = word
        self.idx += 1
95
+
96
# Initialize vocab with full training data (only if training data exists).
# This allows the module to be imported for inference without training data.
vocab = Vocabulary()
# Always add special tokens (DecoderRNN sizes its layers from len(vocab)
# at construction time, so the vocab must never be empty).
vocab.add_word('<pad>')
vocab.add_word('<start>')
vocab.add_word('<end>')
vocab.add_word('<unk>')

if os.path.exists(CONFIG['train_ann']):
    try:
        # Building the vocabulary tokenises every training caption — slow
        # but done once at import.
        coco_train = COCO(CONFIG['train_ann'])
        vocab.build(coco_train, CONFIG['vocab_threshold'])
        print(f"Vocabulary size: {len(vocab.word2idx)}")
    except (FileNotFoundError, OSError) as e:
        # Training data not available - vocab will be loaded from checkpoint
        # Keep minimal vocab with special tokens for class definition
        print(f"Warning: Could not load training data. Vocabulary will be loaded from checkpoint.")
else:
    # Training data path doesn't exist - keep minimal vocab for inference
    print(f"Warning: Training data not found at {CONFIG['train_ann']}. Vocabulary will be loaded from checkpoint.")
117
+
118
+
119
+ # ===========================
120
+ # Attention-based Model
121
+ # ===========================
122
class EncoderCNN(nn.Module):
    """ResNet-50 trunk producing a flattened spatial feature grid.

    Output shape: (batch, feature_map_size**2, 2048).
    """

    def __init__(self):
        super().__init__()
        # torchvision's `weights` API replaces the deprecated 'pretrained' flag.
        from torchvision.models import resnet50, ResNet50_Weights
        backbone = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        # Drop avgpool + fc: keep only the convolutional trunk.
        self.cnn = nn.Sequential(*list(backbone.children())[:-2])
        side = CONFIG['feature_map_size']
        self.adaptive_pool = nn.AdaptiveAvgPool2d((side, side))
        if not CONFIG['fine_tune_encoder']:
            # Freeze the trunk when fine-tuning is disabled.
            for param in self.cnn.parameters():
                param.requires_grad = False

    def forward(self, x):
        grid = self.adaptive_pool(self.cnn(x))   # (batch, 2048, 14, 14)
        grid = grid.permute(0, 2, 3, 1)          # (batch, 14, 14, 2048)
        # Merge the two spatial dims: (batch, 196, 2048)
        return grid.view(grid.size(0), -1, grid.size(-1))
142
+
143
class Attention(nn.Module):
    """Additive (Bahdanau-style) attention over the encoder feature grid."""

    def __init__(self):
        super().__init__()
        self.U = nn.Linear(CONFIG['hidden_size'], CONFIG['attention_dim'])
        self.W = nn.Linear(2048, CONFIG['attention_dim'])
        self.v = nn.Linear(CONFIG['attention_dim'], 1)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, features, hidden):
        """Return (context, alpha): the attention-weighted feature sum and
        the attention weights themselves."""
        # score_j = v . tanh(W s_j + U h)   for each spatial position j
        scores = self.v(self.tanh(self.W(features) + self.U(hidden).unsqueeze(1))).squeeze(2)
        alpha = self.softmax(scores)                          # (batch, positions)
        context = (features * alpha.unsqueeze(2)).sum(dim=1)  # (batch, 2048)
        return context, alpha
160
+
161
class DecoderRNN(nn.Module):
    """LSTM decoder with additive attention over encoder features.

    NOTE(review): the embedding and output layers are sized from the
    module-level `vocab` at construction time — the vocabulary must be
    fully built (or loaded) before instantiating this class.
    """
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(len(vocab.word2idx), CONFIG['embed_size'])
        # LSTM input = word embedding concatenated with the 2048-dim
        # attention context vector.
        self.lstm = nn.LSTM(CONFIG['embed_size'] + 2048,
                            CONFIG['hidden_size'], batch_first=True)
        self.attention = Attention()
        self.fc = nn.Linear(CONFIG['hidden_size'], len(vocab.word2idx))
        self.dropout = nn.Dropout(p=CONFIG['dropout'])

    def forward(self, features, captions, teacher_forcing_ratio=0.5):
        """Decode step-by-step; returns logits (batch, seq_len-1, vocab).

        With probability `teacher_forcing_ratio` the ground-truth next
        token is fed at each step; otherwise the model's own argmax
        prediction. NOTE(review): the coin flip uses np.random at every
        step, so results are only reproducible with a seeded NumPy RNG;
        a ratio of 0 (used in evaluation) is fully deterministic.
        """
        batch_size = features.size(0)
        h, c = self.init_hidden(features)
        seq_length = captions.size(1) - 1  # predict tokens 1..T from 0..T-1
        outputs = torch.zeros(batch_size, seq_length, len(vocab.word2idx)).to(features.device)
        embeddings = self.dropout(self.embed(captions[:, 0]))
        for t in range(seq_length):
            # Attend over spatial features conditioned on the LSTM state.
            context, alpha = self.attention(features, h.squeeze(0))
            lstm_input = torch.cat([embeddings, context], dim=1).unsqueeze(1)
            out, (h, c) = self.lstm(lstm_input, (h, c))
            out = self.dropout(out)
            output = self.fc(out.squeeze(1))
            outputs[:, t] = output
            use_teacher_forcing = np.random.random() < teacher_forcing_ratio
            if use_teacher_forcing and t < seq_length - 1:
                embeddings = self.dropout(self.embed(captions[:, t+1]))
            else:
                embeddings = self.dropout(self.embed(output.argmax(dim=-1)))
        return outputs

    def init_hidden(self, features):
        """Zero-initialised (h, c) for the single-layer LSTM."""
        h = torch.zeros(1, features.size(0), CONFIG['hidden_size']).to(features.device)
        c = torch.zeros(1, features.size(0), CONFIG['hidden_size']).to(features.device)
        return h, c
195
+
196
+ # ===========================
197
+ # Enhanced Dataset Class
198
+ # ===========================
199
class CocoDataset(Dataset):
    """COCO caption dataset: one sample per annotation (image, caption ids).

    On construction every annotation is checked against the image
    directory and annotations whose image file is missing are dropped.
    NOTE(review): this check calls loadImgs per annotation and prints per
    missing file — O(#annotations) and noisy for partial downloads.
    """
    def __init__(self, ann_file, img_dir, vocab, transform=None):
        self.coco = COCO(ann_file)
        self.img_dir = img_dir
        self.vocab = vocab
        self.transform = transform or self.default_transform()
        all_ids = list(self.coco.anns.keys())
        valid_ids = []
        for ann_id in all_ids:
            ann = self.coco.anns[ann_id]
            img_id = ann['image_id']
            file_name = self.coco.loadImgs(img_id)[0]['file_name']
            img_path = os.path.join(self.img_dir, file_name)
            if os.path.exists(img_path):
                valid_ids.append(ann_id)
            else:
                print(f"Warning: File {img_path} not found. Skipping annotation id {ann_id}.")
        self.ids = valid_ids

    def default_transform(self):
        # Standard ImageNet preprocessing at CONFIG['img_size'].
        return transforms.Compose([
            transforms.Resize((CONFIG['img_size'], CONFIG['img_size'])),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        """Return (image_tensor, caption_ids tensor of length CONFIG['max_len'])."""
        ann_id = self.ids[idx]
        ann = self.coco.anns[ann_id]
        img_id = ann['image_id']
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.img_dir, img_info['file_name'])
        img = Image.open(img_path).convert('RGB')
        img = self.transform(img)
        caption = ann['caption']
        tokens = ['<start>'] + nltk.word_tokenize(caption.lower()) + ['<end>']
        caption_ids = [self.vocab.word2idx.get(token, self.vocab.word2idx['<unk>']) for token in tokens]
        # Pad to max_len, then truncate.
        # NOTE(review): captions longer than max_len lose their '<end>'
        # token after truncation — confirm downstream eval tolerates this.
        caption_ids += [self.vocab.word2idx['<pad>']] * (CONFIG['max_len'] - len(caption_ids))
        caption_ids = caption_ids[:CONFIG['max_len']]
        return img, torch.tensor(caption_ids)
243
+
244
+ # ===========================
245
+ # Distributed Setup Functions
246
+ # ===========================
247
def setup_distributed():
    """Join the default process group (NCCL backend, GPU-only).

    Expects the launcher (torchrun) to have set MASTER_ADDR/MASTER_PORT,
    RANK and WORLD_SIZE in the environment.
    """
    dist.init_process_group(backend='nccl')
249
+
250
def cleanup_distributed():
    """Tear down the default process group at the end of training."""
    dist.destroy_process_group()
252
+
253
+ # ===========================
254
+ # Training & Evaluation
255
+ # ===========================
256
def evaluate(encoder, decoder, loader, device, criterion, compute_extended=False):
    """Compute mean teacher-forcing-free validation loss over `loader`.

    With compute_extended=True, additionally beam-search a caption per sample and
    score BLEU / METEOR / ROUGE-1 / ROUGE-L / CIDEr against the single reference.

    NOTE(review): the return shape is inconsistent -- (loss, metrics) when
    compute_extended=True, a bare loss otherwise; callers must branch on it.
    Relies on module-level `vocab` and on beam_search (CONFIG-driven).
    """
    encoder.eval()
    decoder.eval()
    total_loss = 0
    # Instantiate smoothing function for BLEU score.
    smoothing_fn = SmoothingFunction().method1
    if compute_extended:
        bleu_scores = []
        meteor_scores = []
        rouge = Rouge()
        rouge1_scores = []
        rougeL_scores = []
        cider_scorer = Cider()
        # CIDEr wants {sample_id: [caption string]} maps for refs and hypotheses.
        ref_dict = {}
        hyp_dict = {}
        sample_id = 0
        with torch.no_grad():
            for imgs, caps in loader:
                imgs = imgs.to(device)
                caps = caps.to(device)
                features = encoder(imgs)
                # teacher_forcing_ratio=0: decode from the model's own predictions.
                outputs = decoder(features, caps, teacher_forcing_ratio=0)
                loss = criterion(outputs.view(-1, len(vocab.word2idx)), caps[:, 1:].reshape(-1))
                total_loss += loss.item()
                # Per-sample caption metrics (beam search is per-image, hence the loop).
                for i in range(imgs.size(0)):
                    predicted_ids = beam_search(features[i].unsqueeze(0), decoder, device)
                    # Strip special tokens before scoring.
                    predicted_caption = [vocab.idx2word[idx] for idx in predicted_ids
                                         if idx not in [vocab.word2idx['<start>'], vocab.word2idx['<end>'], vocab.word2idx['<pad>']]]
                    reference_ids = caps[i].tolist()
                    reference_caption = [vocab.idx2word[idx] for idx in reference_ids
                                         if idx not in [vocab.word2idx['<start>'], vocab.word2idx['<end>'], vocab.word2idx['<pad>']]]
                    bleu = sentence_bleu([reference_caption], predicted_caption, smoothing_function=smoothing_fn)
                    bleu_scores.append(bleu)
                    meteor = meteor_score([reference_caption], predicted_caption)
                    meteor_scores.append(meteor)
                    pred_str = " ".join(predicted_caption)
                    ref_str = " ".join(reference_caption)
                    rouge_scores = rouge.get_scores(pred_str, ref_str)
                    rouge1_scores.append(rouge_scores[0]['rouge-1']['f'])
                    rougeL_scores.append(rouge_scores[0]['rouge-l']['f'])
                    ref_dict[sample_id] = [ref_str]
                    hyp_dict[sample_id] = [pred_str]
                    sample_id += 1
        # Guard against an empty loader when averaging the per-sample scores.
        avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
        avg_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0
        avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores) if rouge1_scores else 0
        avg_rougeL = sum(rougeL_scores) / len(rougeL_scores) if rougeL_scores else 0
        cider_score, _ = cider_scorer.compute_score(ref_dict, hyp_dict)
        metrics = {'BLEU': avg_bleu, 'METEOR': avg_meteor,
                   'ROUGE-1': avg_rouge1, 'ROUGE-L': avg_rougeL, 'CIDEr': cider_score}
        # Only rank 0 prints in the distributed case.
        if dist.is_initialized() and dist.get_rank() == 0:
            print(f"Extended Metrics: {metrics}")
        return total_loss / len(loader), metrics
    else:
        # Fast path: loss only, no per-sample decoding or metrics.
        with torch.no_grad():
            for imgs, caps in loader:
                imgs = imgs.to(device)
                caps = caps.to(device)
                features = encoder(imgs)
                outputs = decoder(features, caps, teacher_forcing_ratio=0)
                loss = criterion(outputs.view(-1, len(vocab.word2idx)), caps[:, 1:].reshape(-1))
                total_loss += loss.item()
        return total_loss / len(loader)
319
+
320
def beam_search(features, decoder, device):
    """Beam-search decode one image's encoder features into a token-id sequence.

    Uses CONFIG['beam_size'] beams and ranks candidates by length-normalised
    cumulative log-probability. `features` is expected to carry a batch
    dimension of 1 (callers pass features[i].unsqueeze(0)).
    Relies on module-level `vocab` and CONFIG.
    """
    k = CONFIG['beam_size']
    start_token = vocab.word2idx['<start>']
    # Unwrap DDP (decoder.module) to reach the raw decoder's attributes.
    h, c = (decoder.module.init_hidden(features) if isinstance(decoder, torch.nn.parallel.DistributedDataParallel)
            else decoder.init_hidden(features))
    # Each beam entry: [token list, cumulative log-prob, hidden state, cell state].
    sequences = [[[start_token], 0.0, h, c]]
    for _ in range(CONFIG['max_len'] - 1):
        all_candidates = []
        for seq in sequences:
            tokens, score, h, c = seq
            # Finished beams are carried over unchanged.
            if tokens[-1] == vocab.word2idx['<end>']:
                all_candidates.append(seq)
                continue
            input_tensor = torch.LongTensor([tokens[-1]]).to(device)
            # One attention + LSTM + projection step; duplicated for the DDP wrapper.
            if isinstance(decoder, torch.nn.parallel.DistributedDataParallel):
                context, _ = decoder.module.attention(features, h.squeeze(0))
                emb = decoder.module.embed(input_tensor)
                lstm_input = torch.cat([emb, context], dim=1).unsqueeze(1)
                out, (h, c) = decoder.module.lstm(lstm_input, (h, c))
                output = decoder.module.fc(out.squeeze(1))
            else:
                context, _ = decoder.attention(features, h.squeeze(0))
                emb = decoder.embed(input_tensor)
                lstm_input = torch.cat([emb, context], dim=1).unsqueeze(1)
                out, (h, c) = decoder.lstm(lstm_input, (h, c))
                output = decoder.fc(out.squeeze(1))
            log_probs = torch.log_softmax(output, dim=1)
            top_probs, top_indices = log_probs.topk(k)
            # Expand this beam into its k best continuations; all k children
            # share the (h, c) produced by this parent's LSTM step.
            for i in range(k):
                token = top_indices[0][i].item()
                new_score = score + top_probs[0][i].item()
                new_seq = tokens + [token]
                all_candidates.append([new_seq, new_score, h, c])
        # Length-normalised ranking avoids a bias toward short captions.
        ordered = sorted(all_candidates, key=lambda x: x[1] / len(x[0]), reverse=True)
        sequences = ordered[:k]
    return sequences[0][0]
356
+
357
def visualize_attention(image_path, encoder, decoder, device):
    """Generate a caption string for a single image file.

    NOTE(review): despite the name, no attention map is computed or rendered
    here -- the function only returns the beam-searched caption text.
    """
    img = Image.open(image_path).convert('RGB')
    # Same ImageNet preprocessing as CocoDataset.default_transform.
    transform = transforms.Compose([
        transforms.Resize((CONFIG['img_size'], CONFIG['img_size'])),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    img_tensor = transform(img).unsqueeze(0).to(device)
    encoder.eval()
    decoder.eval()
    with torch.no_grad():
        features = encoder(img_tensor)
        caption_ids = beam_search(features, decoder, device)
    # Strip special tokens before joining into the final caption.
    caption = [vocab.idx2word[idx] for idx in caption_ids
               if idx not in [vocab.word2idx['<start>'], vocab.word2idx['<end>'], vocab.word2idx['<pad>']]]
    return ' '.join(caption)
373
+
374
def train(distributed=False, local_rank=0, device=torch.device('cpu'), resume_checkpoint=None):
    """Train the encoder/decoder captioning model, optionally under DDP.

    Saves a checkpoint whenever validation loss improves, logs extended metrics
    every 5th epoch, and early-stops after 3 epochs without improvement.

    NOTE(review): in the distributed case only rank 0 updates the early-stop
    counter and executes `break`; other ranks keep looping, which can deadlock
    collective ops. Confirm intended behaviour before multi-node use.
    """
    train_set = CocoDataset(CONFIG['train_ann'], CONFIG['train_img_dir'], vocab)
    val_set = CocoDataset(CONFIG['val_ann'], CONFIG['val_img_dir'], vocab)
    # Distributed samplers shard the data across ranks; None falls back to shuffle.
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_set) if distributed else None
    val_sampler = torch.utils.data.distributed.DistributedSampler(val_set, shuffle=False) if distributed else None
    train_loader = DataLoader(train_set,
                              batch_size=CONFIG['batch_size'],
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              num_workers=8)
    val_loader = DataLoader(val_set,
                            batch_size=CONFIG['batch_size'],
                            sampler=val_sampler,
                            num_workers=8)
    encoder = EncoderCNN().to(device)
    decoder = DecoderRNN().to(device)
    if distributed:
        encoder = torch.nn.parallel.DistributedDataParallel(encoder, device_ids=[local_rank], output_device=local_rank)
        decoder = torch.nn.parallel.DistributedDataParallel(decoder, device_ids=[local_rank], output_device=local_rank)
    # Padding positions are excluded from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx['<pad>'])
    # Optionally fine-tune the encoder alongside the decoder.
    if CONFIG['fine_tune_encoder']:
        params = list(decoder.parameters()) + list(encoder.parameters())
    else:
        params = list(decoder.parameters())
    optimizer = optim.Adam(params, lr=CONFIG['lr'])
    # Initialize training state variables
    start_epoch = 0
    best_val_loss = float('inf')
    epochs_without_improvement = 0
    # Resume from checkpoint if provided
    if resume_checkpoint is not None:
        print(f"Loading checkpoint from {resume_checkpoint}")
        # Allow Vocabulary as a safe global so it can be unpickled
        torch.serialization.add_safe_globals([Vocabulary])
        # weights_only=False: checkpoint contains pickled objects (vocab, config).
        checkpoint = torch.load(resume_checkpoint, map_location=device, weights_only=False)
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])
        if 'optimizer' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print("Warning: 'optimizer' state not found in checkpoint. Starting with fresh optimizer state.")
        start_epoch = checkpoint['epoch'] + 1
        best_val_loss = checkpoint.get('best_val_loss', float('inf'))
        epochs_without_improvement = checkpoint.get('epochs_without_improvement', 0)
        print(f"Resumed training from epoch {start_epoch}")
    for epoch in range(start_epoch, CONFIG['num_epochs']):
        if distributed:
            # Reshuffle shards each epoch so every rank sees a fresh ordering.
            train_sampler.set_epoch(epoch)
        encoder.train()
        decoder.train()
        total_loss = 0
        for imgs, caps in tqdm(train_loader):
            imgs = imgs.to(device)
            caps = caps.to(device)
            optimizer.zero_grad()
            features = encoder(imgs)
            outputs = decoder(features, caps)
            # Targets are the caption shifted left by one (drop '<start>').
            loss = criterion(outputs.view(-1, len(vocab.word2idx)),
                             caps[:, 1:].reshape(-1))
            loss.backward()
            if CONFIG['grad_clip'] is not None:
                nn.utils.clip_grad_norm_(decoder.parameters(), CONFIG['grad_clip'])
            optimizer.step()
            total_loss += loss.item()
        # Every 5th epoch: full metric suite; otherwise loss-only validation.
        if epoch % 5 == 0:
            val_loss, metrics = evaluate(encoder, decoder, val_loader, device, criterion, compute_extended=True)
            if local_rank == 0:
                print(f"Epoch {epoch+1}/{CONFIG['num_epochs']} | Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss:.4f}")
                with open("metrics_log_Resnet.txt", "a") as f:
                    f.write(f"Epoch {epoch+1}: {metrics}\n")
        else:
            val_loss = evaluate(encoder, decoder, val_loader, device, criterion, compute_extended=False)
            if local_rank == 0:
                print(f"Epoch {epoch+1}/{CONFIG['num_epochs']} | Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss:.4f}")
        # Checkpointing and early stopping are rank-0-only (see NOTE above).
        if local_rank == 0:
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_without_improvement = 0
                checkpoint_path = f'caption_model_best_epoch{epoch}.pth'
                torch.save({
                    'epoch': epoch,
                    'encoder': encoder.state_dict(),
                    'decoder': decoder.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_val_loss': best_val_loss,
                    'epochs_without_improvement': epochs_without_improvement,
                    'vocab': vocab,
                    'config': CONFIG
                }, checkpoint_path)
                #upload_files(epoch)
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= 3:
                    print("Early stopping triggered.")
                    break
469
+
470
def upload_files(i):
    """Copy the epoch-`i` best checkpoint and the metrics log to OneDrive via rclone."""
    artifacts = [f"caption_model_best_epoch{i}.pth", "metrics_log_Resnet.txt"]
    for file in artifacts:
        # shell=False list form: filenames are passed as literal arguments.
        outcome = subprocess.run(
            ["rclone", "copy", file, "onedrive:/Computer_Viz/"],
            capture_output=True, text=True
        )
        if outcome.returncode != 0:
            print(f"Error during upload of {file}:", outcome.stderr)
        else:
            print(f"{file} uploaded successfully.")
481
+
482
if __name__ == '__main__':
    # CLI entry point: optional distributed (torchrun) training with resume support.
    parser = argparse.ArgumentParser()
    parser.add_argument("--distributed", action="store_true", help="Enable distributed training")
    parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint to resume training")
    args = parser.parse_args()
    if args.distributed:
        setup_distributed()
        # LOCAL_RANK is injected into the environment by torchrun.
        local_rank = int(os.environ['LOCAL_RANK'])
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        local_rank = 0
    train(distributed=args.distributed, local_rank=local_rank, device=device, resume_checkpoint=args.resume)
    if args.distributed:
        cleanup_distributed()
training/train_advanced.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Advanced Training Script with Best Practices
3
+ - Learning rate scheduling
4
+ - Mixed precision training
5
+ - Experiment tracking (W&B optional)
6
+ - Comprehensive evaluation
7
+ - Model checkpointing
8
+ """
9
+
10
+ import argparse
11
+ import os
12
+ import random
13
+ import numpy as np
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.optim as optim
17
+ from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau, LambdaLR
18
+ import math
19
+ from efficient_train import (
20
+ create_dataloaders, Encoder, Decoder, ImageCaptioningModel,
21
+ train_epoch, validate, generate_caption
22
+ )
23
+ from datetime import datetime
24
+
25
+ # Optional: Weights & Biases
26
+ try:
27
+ import wandb
28
+ WANDB_AVAILABLE = True
29
+ except ImportError:
30
+ WANDB_AVAILABLE = False
31
+ print("W&B not available. Install with: pip install wandb")
32
+
33
+
34
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """Build a LambdaLR: linear warmup from 0 to the base LR, then cosine decay to 0.

    The returned scheduler multiplies each param group's base LR by a factor in
    [0, 1] that ramps linearly over `num_warmup_steps` and then follows half a
    cosine over the remaining `num_training_steps - num_warmup_steps` steps.
    """
    def scale(step):
        # Linear ramp: step / warmup (guarded against warmup == 0).
        if step < num_warmup_steps:
            return float(step) / float(max(1, num_warmup_steps))
        # Cosine half-wave from 1 down to 0, clamped at 0 past the horizon.
        span = float(max(1, num_training_steps - num_warmup_steps))
        frac = float(step - num_warmup_steps) / span
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * frac)))

    return LambdaLR(optimizer, scale)
42
+
43
+
44
def train_advanced(args):
    """Advanced training with all best practices.

    Runs the full train/validate loop with seeding, optional W&B logging,
    per-component learning rates, a choice of LR schedulers, AMP, best-model
    and periodic checkpointing, and early stopping.

    NOTE(review): checkpoints save 'optimizer_state' and 'scheduler_state',
    but the resume path below restores only 'model_state' -- optimizer and
    scheduler restart fresh after a resume. Confirm whether that is intended.
    """

    # Setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # GPU optimizations
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True  # Optimize for consistent input sizes
        torch.backends.cudnn.deterministic = False  # Faster, but non-deterministic
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

    # Initialize W&B
    if args.use_wandb and WANDB_AVAILABLE:
        wandb.init(
            project=args.wandb_project,
            name=f"{args.model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            config=vars(args)
        )

    # Create dataloaders
    train_loader, val_loader, test_loader, tokenizer, train_set = create_dataloaders(args)

    # Initialize model
    encoder = Encoder(args.model_name, args.embed_dim)
    decoder = Decoder(
        # +2: room for the extra special tokens added on top of the GPT-2 vocab.
        vocab_size=tokenizer.vocab_size + 2,
        embed_dim=args.embed_dim,
        num_layers=args.num_layers,
        num_heads=args.num_heads,
        max_seq_length=64,
        dropout=args.dropout
    )
    model = ImageCaptioningModel(encoder, decoder).to(device)

    # Resume from checkpoint if provided
    start_epoch = 0
    best_val_loss = float('inf')
    best_metrics = {}  # NOTE(review): never written or read below.

    if args.resume_checkpoint:
        print(f"Loading checkpoint from {args.resume_checkpoint}")
        # Handle PyTorch 2.6+ security: allow tokenizer classes
        try:
            from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
            torch.serialization.add_safe_globals([GPT2TokenizerFast])
        except ImportError:
            pass

        # weights_only=False: checkpoint embeds the tokenizer and config objects.
        checkpoint = torch.load(args.resume_checkpoint, map_location=device, weights_only=False)
        model.load_state_dict(checkpoint['model_state'])
        start_epoch = checkpoint.get('epoch', 0) + 1
        best_val_loss = checkpoint.get('val_loss', float('inf'))
        print(f"Resumed from epoch {start_epoch}, best val loss: {best_val_loss:.4f}")

    # Optimizer with different learning rates for encoder/decoder
    encoder_params = [p for n, p in model.named_parameters() if 'encoder' in n]
    decoder_params = [p for n, p in model.named_parameters() if 'decoder' in n]

    if args.different_lr:
        # Lower learning rate for encoder (fine-tuning)
        optimizer = optim.AdamW([
            {'params': encoder_params, 'lr': args.lr * 0.1},
            {'params': decoder_params, 'lr': args.lr}
        ], weight_decay=args.weight_decay)
    else:
        optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    # Learning rate scheduler
    if args.scheduler == 'cosine':
        # Per-step annealing over the whole run.
        scheduler = CosineAnnealingLR(
            optimizer,
            T_max=args.epochs * len(train_loader),
            eta_min=args.min_lr
        )
    elif args.scheduler == 'plateau':
        # Per-epoch, driven by validation loss (stepped later with val_loss).
        scheduler = ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=args.patience
        )
    elif args.scheduler == 'warmup_cosine':
        num_training_steps = args.epochs * len(train_loader)
        num_warmup_steps = args.warmup_epochs * len(train_loader)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps, num_training_steps
        )
    else:
        scheduler = None

    # Loss function
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    # Mixed precision training - Use new API for PyTorch 2.6+
    if hasattr(torch.amp, 'GradScaler'):
        scaler = torch.amp.GradScaler('cuda', enabled=args.use_amp)
    else:
        scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)

    # Create checkpoint directory
    os.makedirs(args.checkpoint_dir, exist_ok=True)

    # Training loop
    patience_counter = 0

    for epoch in range(start_epoch, args.epochs):
        args.epoch = epoch  # Set epoch for train_epoch function
        print(f"\nEpoch {epoch+1}/{args.epochs}")
        print("-" * 60)

        # Train
        # Step-wise schedulers are advanced inside train_epoch; plateau is not.
        train_loss = train_epoch(
            model, train_loader, optimizer, criterion, scaler,
            scheduler if args.scheduler == 'cosine' or args.scheduler == 'warmup_cosine' else None,
            device, args
        )

        # Validate
        val_loss = validate(model, val_loader, criterion, device)

        # Update scheduler
        if args.scheduler == 'plateau':
            scheduler.step(val_loss)
        elif args.scheduler in ['cosine', 'warmup_cosine']:
            # Already updated in train_epoch
            pass

        current_lr = optimizer.param_groups[0]['lr']

        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | LR: {current_lr:.6f}")

        # Log to W&B
        log_dict = {
            'epoch': epoch,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'learning_rate': current_lr
        }

        if args.use_wandb and WANDB_AVAILABLE:
            wandb.log(log_dict)

        # Checkpointing
        is_best = val_loss < best_val_loss

        if is_best:
            best_val_loss = val_loss
            patience_counter = 0

            # Save best model
            checkpoint = {
                'epoch': epoch,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
                'scheduler_state': scheduler.state_dict() if scheduler else None,
                'val_loss': val_loss,
                'train_loss': train_loss,
                'tokenizer': tokenizer,
                'config': vars(args)
            }

            best_path = os.path.join(args.checkpoint_dir, 'best_model.pth')
            torch.save(checkpoint, best_path)
            print(f"✓ Saved best model (val_loss: {val_loss:.4f})")

        else:
            patience_counter += 1

        # Save periodic checkpoints
        if (epoch + 1) % args.save_every == 0:
            checkpoint = {
                'epoch': epoch,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
                'scheduler_state': scheduler.state_dict() if scheduler else None,
                'val_loss': val_loss,
                'train_loss': train_loss,
                'tokenizer': tokenizer,
                'config': vars(args)
            }
            checkpoint_path = os.path.join(
                args.checkpoint_dir, f'checkpoint_epoch_{epoch+1}.pth'
            )
            torch.save(checkpoint, checkpoint_path)
            print(f"✓ Saved periodic checkpoint (epoch {epoch+1})")

        # Early stopping
        if patience_counter >= args.early_stopping_patience:
            print(f"\nEarly stopping triggered after {args.early_stopping_patience} epochs without improvement")
            break

    print("\n" + "="*60)
    print("Training Complete!")
    print(f"Best validation loss: {best_val_loss:.4f}")
    print(f"Best model saved to: {os.path.join(args.checkpoint_dir, 'best_model.pth')}")
    print("="*60)

    if args.use_wandb and WANDB_AVAILABLE:
        wandb.finish()
246
+
247
def main():
    """Parse the CLI hyper-parameters and launch the advanced training loop."""
    cli = argparse.ArgumentParser(description='Advanced training with best practices')

    # Data arguments (all required COCO-style paths)
    for flag in ('--train_image_dir', '--train_ann_file', '--val_image_dir',
                 '--val_ann_file', '--test_image_dir'):
        cli.add_argument(flag, type=str, required=True)

    # Model arguments
    cli.add_argument('--model_name', type=str, default='efficientnet_b3')
    cli.add_argument('--embed_dim', type=int, default=512)
    cli.add_argument('--num_layers', type=int, default=8)
    cli.add_argument('--num_heads', type=int, default=8)
    cli.add_argument('--dropout', type=float, default=0.1)

    # Training arguments
    cli.add_argument('--batch_size', type=int, default=96)
    cli.add_argument('--lr', type=float, default=3e-4)
    cli.add_argument('--epochs', type=int, default=20)
    cli.add_argument('--seed', type=int, default=42)
    cli.add_argument('--use_amp', action='store_true', help='Use mixed precision')
    cli.add_argument('--grad_accum', type=int, default=1)
    cli.add_argument('--weight_decay', type=float, default=1e-4)
    cli.add_argument('--different_lr', action='store_true',
                     help='Use different LR for encoder/decoder')

    # Scheduler arguments
    cli.add_argument('--scheduler', type=str, default='plateau',
                     choices=['cosine', 'plateau', 'warmup_cosine', 'none'])
    cli.add_argument('--patience', type=int, default=3)
    cli.add_argument('--min_lr', type=float, default=1e-6)
    cli.add_argument('--warmup_epochs', type=int, default=2)

    # Checkpointing
    cli.add_argument('--checkpoint_dir', type=str, default='checkpoints')
    cli.add_argument('--resume_checkpoint', type=str, default=None)
    cli.add_argument('--save_every', type=int, default=5)
    cli.add_argument('--early_stopping_patience', type=int, default=5)

    # Experiment tracking
    cli.add_argument('--use_wandb', action='store_true', help='Use Weights & Biases')
    cli.add_argument('--wandb_project', type=str, default='image-captioning')

    # Additional args needed by create_dataloaders and train_epoch
    cli.add_argument('--distributed', action='store_true', help='Use distributed training')
    cli.add_argument('--local_rank', type=int, default=0, help='Local rank for distributed training')

    args = cli.parse_args()

    # Set epoch attribute (will be updated during training)
    args.epoch = 0

    train_advanced(args)
302
+
303
+
304
if __name__ == '__main__':
    # Script entry point: everything happens inside main().
    main()
306
+