dixisouls committed
Commit a0c5c81 · 1 Parent(s): f2ddb15

Initial Commit
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ FROM python:3.9-slim
+
+ WORKDIR /code
+
+ # Install system dependencies
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first to leverage the Docker cache
+ COPY requirements.txt .
+
+ # Install dependencies
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Create necessary directories with write permissions
+ RUN mkdir -p /tmp/uploads && chmod 777 /tmp/uploads
+ RUN mkdir -p app/models && chmod 777 app/models
+
+ # Copy application code
+ COPY app ./app
+ COPY app.py .
+
+ # Download NLTK data
+ RUN python -c "import nltk; nltk.download('punkt')"
+
+ # Download model files during build
+ RUN python -m app.download_model
+
+ # Expose port
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,80 @@
- ---
- title: Image Captioning Api
- emoji: 😻
- colorFrom: indigo
- colorTo: blue
- sdk: docker
- pinned: false
- license: mit
- short_description: API Endpoint for Image Captioning
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Image Captioning API
+
+ A RESTful API for generating captions from images using a Transformer-based
+ model. This service is designed to be deployed on Hugging Face Spaces.
+
+ ## Features
+
+ - Upload any image file (jpg, png, etc.)
+ - Get AI-generated captions based on image content
+ - FastAPI-based REST API with interactive documentation
+
+ ## API Endpoints
+
+ - `GET /` - API information and usage
+ - `POST /generate` - Upload an image and get a caption
+ - `GET /health` - Health check endpoint
+ - `GET /docs` - Swagger UI documentation
+
+ ## How to Use
+
+ ### API Request Example
+
+ ```bash
+ curl -X POST "https://your-space-name.hf.space/generate" \
+   -H "accept: application/json" \
+   -H "Content-Type: multipart/form-data" \
+   -F "image=@your_image.jpg" \
+   -F "max_length=20"
+ ```
+
+ ### API Response Example
+
+ ```json
+ {
+   "caption": "a person riding a snowboard down a snow covered slope",
+   "image": "base64_encoded_image_data..."
+ }
+ ```
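+
+ ### Python Client Example
+
+ For programmatic access, a minimal client could look like the sketch below. It assumes the `requests` package is installed (it is not part of `requirements.txt`) and that the placeholder URL is replaced with your deployed Space URL:
+
+ ```python
+ import requests
+
+ # Placeholder; replace with your deployed Space URL
+ API_URL = "https://your-space-name.hf.space"
+
+ # Send the image as multipart/form-data, mirroring the curl example above
+ with open("your_image.jpg", "rb") as f:
+     response = requests.post(
+         f"{API_URL}/generate",
+         files={"image": ("your_image.jpg", f, "image/jpeg")},
+         data={"max_length": 20},
+     )
+
+ response.raise_for_status()
+ print(response.json()["caption"])
+ ```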
+
+ ## Local Development
+
+ ### Prerequisites
+
+ - Python 3.9+
+ - pip
+
+ ### Setup
+
+ 1. Clone the repository
+ 2. Install dependencies:
+    ```
+    pip install -r requirements.txt
+    ```
+ 3. Run the application:
+    ```
+    python app.py
+    ```
+ 4. Visit http://localhost:7860/docs to access the API documentation
+
+ ## Deployment on Hugging Face Spaces
+
+ This application is designed to be deployed on
+ [Hugging Face Spaces](https://huggingface.co/spaces) using Docker.
+
+ 1. Create a new Space on Hugging Face
+ 2. Select Docker as the SDK
+ 3. Upload all files to the repository
+ 4. Hugging Face will automatically build and deploy the application
+
+ ## Technical Details
+
+ - **Model**: ResNet50 encoder with Transformer decoder
+ - **Framework**: PyTorch
+ - **API**: FastAPI
+ - **Image Processing**: torchvision and PIL
+ - **Model Hosting**: Hugging Face Hub
+
+ ## License
+
+ MIT
app.py ADDED
@@ -0,0 +1,35 @@
+ """
+ Main application entry point for the Image Captioning API
+ """
+ import os
+ import logging
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ # Check if model files exist and download them if needed
+ def ensure_models_exist():
+     model_path = "app/models/image_captioning_model.pth"
+     vocab_path = "app/models/vocab.pkl"
+
+     if not os.path.exists(model_path) or not os.path.exists(vocab_path):
+         logger.info("Model files not found. Downloading...")
+         from app.download_model import download_models
+         download_models()
+     else:
+         logger.info("Model files found.")
+
+
+ if __name__ == "__main__":
+     # Ensure model files exist
+     ensure_models_exist()
+
+     # Run the FastAPI application
+     import uvicorn
+     from app.api import app
+
+     logger.info("Starting Image Captioning API server...")
+     uvicorn.run(app, host="0.0.0.0", port=7860)
app/__init__.py ADDED
File without changes
app/api.py ADDED
@@ -0,0 +1,177 @@
+ import os
+ import base64
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ import shutil
+ import uuid
+ import logging
+ from typing import Dict, Any
+ import torch
+
+ # Import image captioning service
+ from app.image_captioning_service import generate_caption
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Use the /tmp directory, which should be writable
+ UPLOAD_DIR = "/tmp/uploads"
+
+ # Create necessary directories
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
+ os.makedirs("app/models", exist_ok=True)
+
+ # Initialize FastAPI app
+ app = FastAPI(title="Image Captioning API")
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Get device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ logger.info(f"Using device: {device}")
+
+
+ @app.get("/")
+ def read_root():
+     return {
+         "message": "Image Captioning API is running",
+         "usage": "POST /generate with an image file to generate a caption",
+         "docs": "Visit /docs for API documentation"
+     }
+
+
+ @app.post("/generate")
+ async def generate_image_caption(
+     image: UploadFile = File(...),
+     max_length: int = Form(20),
+ ) -> Dict[str, Any]:
+     try:
+         # Debug information
+         logger.info(f"Received file: {image.filename}, content_type: {image.content_type}")
+
+         # Input validation with improved error handling
+         if image is None:
+             raise HTTPException(status_code=400, detail="No image file provided")
+
+         if not image.content_type:
+             # Set a default content type if none was provided
+             logger.warning("No content type provided, assuming image/jpeg")
+             image.content_type = "image/jpeg"
+
+         if not image.content_type.startswith("image/"):
+             raise HTTPException(
+                 status_code=400, detail=f"Uploaded file must be an image, got {image.content_type}"
+             )
+
+         if not (0 < max_length <= 100):
+             raise HTTPException(
+                 status_code=400, detail="Maximum caption length must be between 1 and 100"
+             )
+
+         # Generate a unique ID for this job
+         job_id = str(uuid.uuid4())
+         short_id = job_id.split("-")[0]
+
+         # Create a directory for this job in /tmp, which should be writable
+         upload_job_dir = os.path.join(UPLOAD_DIR, job_id)
+
+         # Create the directory with explicit permission setting
+         os.makedirs(upload_job_dir, exist_ok=True, mode=0o777)
+         logger.info(f"Created upload directory: {upload_job_dir}")
+
+         # Determine the file extension
+         file_ext = os.path.splitext(image.filename)[1] if image.filename else ".jpg"
+         if not file_ext:
+             file_ext = ".jpg"
+
+         # Save the uploaded image to /tmp
+         image_filename = f"{short_id}{file_ext}"
+         image_path = os.path.join(upload_job_dir, image_filename)
+
+         # Save the file with error handling
+         try:
+             with open(image_path, "wb") as buffer:
+                 contents = await image.read()
+                 buffer.write(contents)
+
+             # Check that the file was created and is non-empty
+             if not os.path.exists(image_path):
+                 raise HTTPException(status_code=400, detail=f"Failed to save uploaded file to {image_path}")
+
+             if os.path.getsize(image_path) == 0:
+                 raise HTTPException(status_code=400, detail="Uploaded file is empty")
+
+             logger.info(f"Image saved to {image_path} ({os.path.getsize(image_path)} bytes)")
+         except HTTPException:
+             # Let validation errors keep their status codes
+             raise
+         except Exception as e:
+             logger.error(f"Error saving file: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error saving uploaded file: {str(e)}")
+
+         # Define model paths
+         model_path = "app/models/image_captioning_model.pth"
+         vocabulary_path = "app/models/vocab.pkl"
+
+         # Check that the model files exist
+         if not os.path.exists(model_path):
+             logger.error(f"Model file not found: {model_path}")
+             raise HTTPException(status_code=500, detail=f"Model file not found: {model_path}")
+
+         if not os.path.exists(vocabulary_path):
+             logger.error(f"Vocabulary file not found: {vocabulary_path}")
+             raise HTTPException(status_code=500, detail=f"Vocabulary file not found: {vocabulary_path}")
+
+         # Generate caption
+         try:
+             caption = generate_caption(
+                 image_path=image_path,
+                 model_path=model_path,
+                 vocab_path=vocabulary_path,
+                 max_length=max_length,
+                 device=device
+             )
+             logger.info(f"Generated caption: {caption}")
+         except Exception as e:
+             logger.error(f"Error generating caption: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error generating caption: {str(e)}")
+
+         # Read the original image back as base64
+         try:
+             with open(image_path, "rb") as img_file:
+                 image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+             logger.info("Successfully encoded image as base64")
+         except Exception as e:
+             logger.error(f"Error reading image: {str(e)}")
+             raise HTTPException(status_code=500, detail=f"Error reading image: {str(e)}")
+
+         # Prepare the response with the base64-encoded image
+         response = {
+             "caption": caption,
+             "image": image_base64
+         }
+
+         # Clean up
+         try:
+             shutil.rmtree(upload_job_dir)
+             logger.info("Cleaned up temporary directories")
+         except Exception as e:
+             logger.warning(f"Error cleaning up temporary files: {str(e)}")
+
+         return response
+
+     except HTTPException:
+         # Re-raise HTTPExceptions unchanged so 400-level validation errors
+         # are not rewrapped as 500s by the handler below
+         raise
+     except Exception as e:
+         logger.error(f"Error processing image: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
+
+
+ @app.get("/health")
+ def health_check():
+     return {"status": "healthy"}
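A quick way to exercise these endpoints locally is a small smoke test. The sketch below assumes the model files are already present under `app/models` and that `httpx` is installed (Starlette's `TestClient` depends on it; neither is in `requirements.txt`); `test.jpg` is a placeholder local image:

```python
# Hypothetical smoke test for app/api.py
from fastapi.testclient import TestClient

from app.api import app

client = TestClient(app)

# The /health endpoint should always respond
response = client.get("/health")
assert response.status_code == 200
assert response.json() == {"status": "healthy"}

# /generate expects multipart form data, mirroring the README's curl example
with open("test.jpg", "rb") as f:  # test.jpg is a placeholder image
    response = client.post(
        "/generate",
        files={"image": ("test.jpg", f, "image/jpeg")},
        data={"max_length": 20},
    )
print(response.status_code, response.json().get("caption"))
```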
app/download_model.py ADDED
@@ -0,0 +1,55 @@
+ import os
+ import sys
+ from huggingface_hub import hf_hub_download
+ import shutil
+ import logging
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def download_models():
+     """Download model files from the Hugging Face Hub"""
+     logger.info("Downloading model files...")
+
+     # Create the target directory if it doesn't exist
+     os.makedirs("app/models", exist_ok=True)
+
+     try:
+         # Download the model and vocabulary from Hugging Face
+         logger.info("Downloading model from dixisouls/image-captioning-model...")
+         model_path = hf_hub_download(
+             repo_id="dixisouls/image-captioning-model",
+             filename="image_captioning_model.pth",
+             repo_type="model"
+         )
+
+         logger.info("Downloading vocabulary from dixisouls/image-captioning-model...")
+         vocab_path = hf_hub_download(
+             repo_id="dixisouls/image-captioning-model",
+             filename="vocab.pkl",
+             repo_type="model"
+         )
+
+         # Copy the downloaded files to the app/models directory
+         shutil.copy(model_path, "app/models/image_captioning_model.pth")
+         shutil.copy(vocab_path, "app/models/vocab.pkl")
+
+         logger.info("Model downloaded successfully to app/models/image_captioning_model.pth")
+         logger.info("Vocabulary downloaded successfully to app/models/vocab.pkl")
+
+         # Create a fixed vocabulary file if needed
+         try:
+             from app.fix_vocab_pickle import fix_vocab_pickle
+             fixed_vocab = fix_vocab_pickle("app/models/vocab.pkl", "app/models/vocab_fixed.pkl")
+             if fixed_vocab:
+                 logger.info("Created fixed vocabulary file at app/models/vocab_fixed.pkl")
+         except Exception as e:
+             logger.warning(f"Could not create fixed vocabulary file: {str(e)}")
+
+     except Exception as e:
+         logger.error(f"Error downloading model files: {e}")
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     download_models()
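After the build-time download (`python -m app.download_model`), a quick sanity check can confirm that both files landed where the service expects them. A minimal sketch, using the same paths as `app/api.py`:

```python
# Verify the files the service expects are present and non-trivial in size
import os

for path in ("app/models/image_captioning_model.pth", "app/models/vocab.pkl"):
    if os.path.exists(path):
        print(f"{path}: {os.path.getsize(path) / 1e6:.1f} MB")
    else:
        print(f"{path}: MISSING")
```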
app/fix_vocab_pickle.py ADDED
@@ -0,0 +1,127 @@
+ """
+ Script to fix the vocabulary pickle file by recreating it with correct module information.
+ Run this script if you are still experiencing Vocabulary loading issues.
+ """
+
+ import pickle
+ import os
+ import sys
+ import nltk
+ import logging
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Make sure the NLTK tokenizer is available
+ try:
+     nltk.data.find('tokenizers/punkt')
+ except LookupError:
+     nltk.download('punkt')
+
+
+ # Vocabulary class for loading the vocabulary
+ class Vocabulary:
+     def __init__(self):
+         self.word2idx = {}
+         self.idx2word = {}
+         self.idx = 0
+
+     def add_word(self, word):
+         if word not in self.word2idx:
+             self.word2idx[word] = self.idx
+             self.idx2word[self.idx] = word
+             self.idx += 1
+
+     def __len__(self):
+         return len(self.word2idx)
+
+     def tokenize(self, text):
+         """Tokenize text into a list of tokens"""
+         return nltk.tokenize.word_tokenize(str(text).lower())
+
+
+ def fix_vocab_pickle(input_path, output_path):
+     """
+     Load the vocabulary pickle file and create a new one with updated module information.
+     """
+     try:
+         logger.info(f"Attempting to load vocabulary from {input_path}...")
+
+         # Try first with a very permissive custom unpickler
+         class FixerUnpickler(pickle.Unpickler):
+             def find_class(self, module, name):
+                 # For any class named Vocabulary, regardless of its original
+                 # module, substitute our local Vocabulary class
+                 if name == 'Vocabulary':
+                     return Vocabulary
+                 # Fall back to the default behavior for everything else
+                 return super().find_class(module, name)
+
+         # Try to load with our custom unpickler
+         with open(input_path, 'rb') as f:
+             try:
+                 vocab = FixerUnpickler(f).load()
+                 logger.info("Successfully loaded vocabulary!")
+             except Exception as e:
+                 logger.warning(f"Custom unpickler failed: {str(e)}")
+
+                 # If that fails, try a raw load and extract the data
+                 f.seek(0)  # Reset file pointer
+                 try:
+                     raw_data = pickle.load(f)
+                     logger.info("Loaded raw data, attempting to extract vocabulary...")
+
+                     # Create a new vocabulary
+                     vocab = Vocabulary()
+
+                     # Try to extract the necessary data
+                     if hasattr(raw_data, 'word2idx') and hasattr(raw_data, 'idx2word'):
+                         vocab.word2idx = raw_data.word2idx
+                         vocab.idx2word = raw_data.idx2word
+                         vocab.idx = raw_data.idx if hasattr(raw_data, 'idx') else len(vocab.word2idx)
+                     elif isinstance(raw_data, dict) and 'word2idx' in raw_data and 'idx2word' in raw_data:
+                         vocab.word2idx = raw_data['word2idx']
+                         vocab.idx2word = raw_data['idx2word']
+                         vocab.idx = raw_data.get('idx', len(vocab.word2idx))
+                     else:
+                         logger.error("Could not extract vocabulary data from the pickle file.")
+                         logger.error(f"Raw data type: {type(raw_data)}")
+                         return None
+                 except Exception as e:
+                     logger.error(f"Raw data extraction failed: {str(e)}")
+                     return None
+
+         # Save the vocabulary with the correct module information
+         logger.info(f"Saving fixed vocabulary to {output_path}...")
+         with open(output_path, 'wb') as f:
+             pickle.dump(vocab, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+         logger.info(f"Vocabulary successfully fixed and saved to {output_path}")
+         logger.info(f"Vocabulary size: {len(vocab)} words")
+         logger.info(f"Sample words: {list(vocab.word2idx.keys())[:5]}")
+
+         return vocab
+
+     except Exception as e:
+         logger.error(f"An error occurred: {str(e)}")
+         return None
+
+
+ if __name__ == "__main__":
+     # Parse command line arguments
+     import argparse
+     parser = argparse.ArgumentParser(description='Fix vocabulary pickle file')
+     parser.add_argument('--input', type=str, default='app/models/vocab.pkl',
+                         help='Path to the input vocabulary pickle file')
+     parser.add_argument('--output', type=str, default='app/models/vocab_fixed.pkl',
+                         help='Path to save the fixed vocabulary pickle file')
+     args = parser.parse_args()
+
+     # Run the fix function
+     vocab = fix_vocab_pickle(args.input, args.output)
+
+     if vocab is not None:
+         logger.info("To use the fixed vocabulary, update your paths to use the new file.")
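The same repair can be run programmatically rather than via the CLI, as `app/download_model.py` already does after each download. A short sketch using the script's default paths:

```python
from app.fix_vocab_pickle import fix_vocab_pickle

# Same default paths as the CLI entry point
vocab = fix_vocab_pickle("app/models/vocab.pkl", "app/models/vocab_fixed.pkl")
if vocab is not None:
    print(f"Fixed vocabulary with {len(vocab)} words")
```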
app/image_captioning_service.py ADDED
@@ -0,0 +1,352 @@
+ import os
+ import math
+ import torch
+ from PIL import Image
+ import torchvision.transforms as transforms
+ import nltk
+ import pickle
+ import warnings
+ import logging
+ warnings.filterwarnings("ignore")
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Make sure the NLTK tokenizer is available
+ try:
+     nltk.data.find('tokenizers/punkt')
+ except LookupError:
+     nltk.download('punkt')
+
+
+ # Vocabulary class for loading the vocabulary
+ class Vocabulary:
+     def __init__(self):
+         self.word2idx = {}
+         self.idx2word = {}
+         self.idx = 0
+
+     def add_word(self, word):
+         if word not in self.word2idx:
+             self.word2idx[word] = self.idx
+             self.idx2word[self.idx] = word
+             self.idx += 1
+
+     def __len__(self):
+         return len(self.word2idx)
+
+     def tokenize(self, text):
+         """Tokenize text into a list of tokens"""
+         return nltk.tokenize.word_tokenize(str(text).lower())
+
+     @classmethod
+     def load(cls, path):
+         """Load vocabulary from a pickle file, trying multiple strategies"""
+         try:
+             # Strategy 1: a custom unpickler that maps any class named
+             # Vocabulary, regardless of its original module, to this class
+             class CustomUnpickler(pickle.Unpickler):
+                 def find_class(self, module, name):
+                     if name == 'Vocabulary':
+                         return Vocabulary
+                     # Default behavior for everything else
+                     return super().find_class(module, name)
+
+             with open(path, 'rb') as f:
+                 return CustomUnpickler(f).load()
+         except Exception as first_error:
+             logger.error(f"First loading method failed: {str(first_error)}")
+             try:
+                 # Strategy 2: raw load, then manually rebuild the vocabulary
+                 with open(path, 'rb') as f:
+                     raw_data = pickle.load(f)
+                     # Object exposing the expected attributes
+                     if hasattr(raw_data, 'word2idx') and hasattr(raw_data, 'idx2word'):
+                         vocab = Vocabulary()
+                         vocab.word2idx = raw_data.word2idx
+                         vocab.idx2word = raw_data.idx2word
+                         vocab.idx = raw_data.idx
+                         return vocab
+                     # Plain dict carrying the word mappings
+                     if isinstance(raw_data, dict):
+                         if 'word2idx' in raw_data and 'idx2word' in raw_data:
+                             vocab = Vocabulary()
+                             vocab.word2idx = raw_data['word2idx']
+                             vocab.idx2word = raw_data['idx2word']
+                             vocab.idx = len(vocab.word2idx)
+                             return vocab
+
+                     raise ValueError("Could not extract vocabulary data from pickle file")
+             except Exception as second_error:
+                 logger.error(f"Second loading method failed: {str(second_error)}")
+
+                 # Strategy 3: rewrite the pickle with fix_vocab_pickle as a last resort
+                 try:
+                     from app.fix_vocab_pickle import fix_vocab_pickle
+                     fixed_path = path + "_fixed.pkl"
+                     vocab = fix_vocab_pickle(path, fixed_path)
+                     if vocab:
+                         logger.info(f"Vocabulary fixed and saved to {fixed_path}")
+                         return vocab
+                 except Exception as fix_error:
+                     logger.error(f"Vocabulary fixing failed: {str(fix_error)}")
+
+                 raise RuntimeError(f"All vocabulary loading methods failed. Original error: {str(first_error)}")
+
+
+ # Encoder: pretrained ResNet
+ class EncoderCNN(torch.nn.Module):
+     def __init__(self, embed_dim):
+         super(EncoderCNN, self).__init__()
+         # Load a pretrained ResNet
+         import torchvision.models as models
+         resnet = models.resnet50(pretrained=True)
+         # Remove the final FC layer
+         modules = list(resnet.children())[:-1]
+         self.resnet = torch.nn.Sequential(*modules)
+         # Project to the embedding dimension
+         self.fc = torch.nn.Linear(resnet.fc.in_features, embed_dim)
+         self.bn = torch.nn.BatchNorm1d(embed_dim)
+         self.dropout = torch.nn.Dropout(0.5)
+
+     def forward(self, images):
+         with torch.no_grad():  # No gradients for the pretrained backbone
+             features = self.resnet(images)
+         features = features.reshape(features.size(0), -1)
+         features = self.fc(features)
+         features = self.bn(features)
+         features = self.dropout(features)
+         return features
+
+
+ # Positional encoding for the Transformer
+ class PositionalEncoding(torch.nn.Module):
+     def __init__(self, d_model, max_len=5000):
+         super(PositionalEncoding, self).__init__()
+
+         # Create the positional encoding table
+         pe = torch.zeros(max_len, d_model)
+         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         pe = pe.unsqueeze(0)
+
+         # Register as a buffer (not a model parameter)
+         self.register_buffer('pe', pe)
+
+     def forward(self, x):
+         x = x + self.pe[:, :x.size(1), :].to(x.device)
+         return x
+
+
+ # Custom Transformer decoder
+ class TransformerDecoder(torch.nn.Module):
+     def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers, dropout=0.1):
+         super(TransformerDecoder, self).__init__()
+
+         # Embedding layer
+         self.embedding = torch.nn.Embedding(vocab_size, embed_dim)
+         self.positional_encoding = PositionalEncoding(embed_dim)
+
+         # Transformer decoder layers
+         decoder_layer = torch.nn.TransformerDecoderLayer(
+             d_model=embed_dim,
+             nhead=num_heads,
+             dim_feedforward=ff_dim,
+             dropout=dropout,
+             batch_first=True
+         )
+         self.transformer_decoder = torch.nn.TransformerDecoder(
+             decoder_layer,
+             num_layers=num_layers
+         )
+
+         # Output layer
+         self.fc = torch.nn.Linear(embed_dim, vocab_size)
+         self.dropout = torch.nn.Dropout(dropout)
+
+     def generate_square_subsequent_mask(self, sz):
+         # Create a mask to prevent attention to future tokens
+         mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
+         mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+         return mask
+
+     def forward(self, tgt, memory):
+         # Create the causal mask for the decoder
+         tgt_mask = self.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
+
+         # Embed tokens and add positional encoding
+         tgt = self.embedding(tgt) * math.sqrt(self.embedding.embedding_dim)
+         tgt = self.positional_encoding(tgt)
+         tgt = self.dropout(tgt)
+
+         # Pass through the transformer decoder
+         output = self.transformer_decoder(
+             tgt,
+             memory,
+             tgt_mask=tgt_mask
+         )
+
+         # Project to the vocabulary
+         output = self.fc(output)
+         return output
+
+
+ # Complete image captioning model
+ class ImageCaptioningModel(torch.nn.Module):
+     def __init__(self, vocab_size, embed_dim, hidden_dim, num_heads, num_layers):
+         super(ImageCaptioningModel, self).__init__()
+
+         # Image encoder
+         self.encoder = EncoderCNN(embed_dim)
+
+         # Caption decoder
+         self.decoder = TransformerDecoder(
+             vocab_size=vocab_size,
+             embed_dim=embed_dim,
+             num_heads=num_heads,
+             ff_dim=hidden_dim,
+             num_layers=num_layers
+         )
+
+     def forward(self, images, captions):
+         # Encode images
+         img_features = self.encoder(images)
+
+         # Reshape for the transformer: (batch_size, seq_len, embed_dim).
+         # Here seq_len=1, since a single "token" represents the whole image.
+         img_features = img_features.unsqueeze(1)
+
+         # Decode captions (excluding the last token, typically <EOS>)
+         outputs = self.decoder(captions[:, :-1], img_features)
+         return outputs
+
+     def generate_caption(self, image, vocab, max_length=20):
+         """Generate a caption for the given image"""
+         with torch.no_grad():
+             # Encode the image
+             img_features = self.encoder(image.unsqueeze(0))
+             img_features = img_features.unsqueeze(1)
+
+             # Start with the < SOS > token
+             current_ids = torch.tensor([[vocab.word2idx['< SOS >']]], dtype=torch.long).to(image.device)
+
+             # Generate words one by one (greedy decoding)
+             result_caption = []
+             for i in range(max_length):
+                 # Predict the next word
+                 outputs = self.decoder(current_ids, img_features)
+                 # Take the most likely next word
+                 _, predicted = outputs[:, -1, :].max(1)
+
+                 # Add the predicted word to the sequence
+                 result_caption.append(predicted.item())
+
+                 # Stop at <EOS>
+                 if predicted.item() == vocab.word2idx['<EOS>']:
+                     break
+
+                 # Extend the current sequence for the next iteration
+                 current_ids = torch.cat([current_ids, predicted.unsqueeze(0)], dim=1)
+
+             # Convert word indices to words
+             words = [vocab.idx2word[idx] for idx in result_caption]
+
+             # Drop the trailing <EOS> token if present
+             if words and words[-1] == '<EOS>':
+                 words = words[:-1]
+
+             return ' '.join(words)
+
+
+ def load_image(image_path, transform=None):
+     """Load and preprocess an image"""
+     image = Image.open(image_path).convert('RGB')
+
+     if transform is None:
+         transform = transforms.Compose([
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+         ])
+
+     return transform(image)
+
+
+ def generate_caption(
+     image_path,
+     model_path,
+     vocab_path,
+     max_length=20,
+     device=None
+ ):
+     """Generate a caption for an image"""
+     # Set the device
+     if device is None:
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     logger.info(f"Using device: {device}")
+
+     # Check that all input files exist
+     if not os.path.exists(image_path):
+         raise FileNotFoundError(f"Image not found at {image_path}")
+     if not os.path.exists(model_path):
+         raise FileNotFoundError(f"Model not found at {model_path}")
+     if not os.path.exists(vocab_path):
+         raise FileNotFoundError(f"Vocabulary not found at {vocab_path}")
+
+     # Load the vocabulary
+     logger.info(f"Loading vocabulary from {vocab_path}")
+     vocab = Vocabulary.load(vocab_path)
+     logger.info(f"Loaded vocabulary with {len(vocab)} words")
+
+     # Hyperparameters - these must match the values used during training
+     embed_dim = 512
+     hidden_dim = 2048
+     num_layers = 6
+     num_heads = 8
+
+     # Initialize the model
+     logger.info("Initializing model")
+     model = ImageCaptioningModel(
+         vocab_size=len(vocab),
+         embed_dim=embed_dim,
+         hidden_dim=hidden_dim,
+         num_heads=num_heads,
+         num_layers=num_layers
+     ).to(device)
+
+     # Load the model weights
+     logger.info(f"Loading model weights from {model_path}")
+     checkpoint = torch.load(model_path, map_location=device)
+     model.load_state_dict(checkpoint['model_state_dict'])
+     model.eval()
+
+     # Load and preprocess the image
+     logger.info(f"Loading and processing image from {image_path}")
+     image = load_image(image_path).to(device)
+
+     # Generate the caption
+     logger.info("Generating caption")
+     caption = model.generate_caption(image, vocab, max_length=max_length)
+     logger.info(f"Generated caption: {caption}")
+
+     return caption
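For reference, this service can also be invoked directly, without going through the API layer. A minimal sketch, assuming the model files have been downloaded to `app/models` (the paths mirror those used by `app/api.py`) and `test.jpg` is a placeholder local image:

```python
from app.image_captioning_service import generate_caption

# test.jpg is a placeholder; any local image file works
caption = generate_caption(
    image_path="test.jpg",
    model_path="app/models/image_captioning_model.pth",
    vocab_path="app/models/vocab.pkl",
    max_length=20,
)
print(caption)
```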
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ fastapi==0.103.1
+ uvicorn==0.23.2
+ python-multipart==0.0.6
+ pillow==10.0.0
+ torch==2.0.1
+ torchvision==0.15.2
+ nltk==3.8.1
+ huggingface-hub==0.16.4
+ numpy==1.24.3
+ aiofiles==23.1.0
+ python-dotenv==1.0.0