Upload 3 files
- app.py +206 -0
- inference.py +121 -0
- model_loader.py +138 -0

app.py
ADDED
@@ -0,0 +1,206 @@
"""
Flask REST API for Image Captioning and Action Recognition
"""

from flask import Flask, request, jsonify
from flask_cors import CORS
import torch
from PIL import Image
import io
import logging
from model_loader import load_caption_model, load_action_model
from inference import generate_caption, predict_action

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Flask app
app = Flask(__name__)
CORS(app)  # Enable CORS for frontend communication

# Global variables for models
caption_model = None
action_model = None
vocab = None
device = None

@app.route('/')
def home():
    """Home endpoint"""
    return jsonify({
        'message': 'Image Captioning & Action Recognition API',
        'status': 'running',
        'endpoints': {
            'health': '/health',
            'caption': '/api/caption',
            'action': '/api/action',
            'combined': '/api/combined'
        }
    })

@app.route('/health')
def health():
    """Health check endpoint"""
    return jsonify({
        'status': 'healthy',
        'models_loaded': {
            'caption_model': caption_model is not None,
            'action_model': action_model is not None,
            'vocab': vocab is not None
        },
        'device': str(device)
    })

@app.route('/api/caption', methods=['POST'])
def caption_image():
    """
    Generate a caption for an uploaded image.

    Expected: multipart/form-data with 'image' file
    Returns: JSON with the generated caption
    """
    try:
        # Check if an image is in the request
        if 'image' not in request.files:
            return jsonify({'error': 'No image provided'}), 400

        file = request.files['image']

        # Read image
        image_bytes = file.read()
        image = Image.open(io.BytesIO(image_bytes)).convert('RGB')

        # Generate caption
        caption = generate_caption(caption_model, image, vocab, device)

        logger.info(f"Caption generated: {caption}")

        return jsonify({
            'success': True,
            'caption': caption
        })

    except Exception as e:
        logger.error(f"Error in caption generation: {str(e)}")
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/action', methods=['POST'])
def recognize_action():
    """
    Recognize the action in an uploaded image.

    Expected: multipart/form-data with 'image' file
    Returns: JSON with the predicted action and confidence
    """
    try:
        # Check if an image is in the request
        if 'image' not in request.files:
            return jsonify({'error': 'No image provided'}), 400

        file = request.files['image']

        # Read image
        image_bytes = file.read()
        image = Image.open(io.BytesIO(image_bytes)).convert('RGB')

        # Predict action
        result = predict_action(action_model, image, device)

        logger.info(f"Action predicted: {result['predicted_class']} ({result['confidence']:.2f}%)")

        return jsonify({
            'success': True,
            'predicted_action': result['predicted_class'],
            'confidence': result['confidence'],
            'all_predictions': result['all_predictions']
        })

    except Exception as e:
        logger.error(f"Error in action recognition: {str(e)}")
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/combined', methods=['POST'])
def combined_inference():
    """
    Perform both captioning and action recognition.

    Expected: multipart/form-data with 'image' file
    Returns: JSON with both the caption and the action prediction
    """
    try:
        # Check if an image is in the request
        if 'image' not in request.files:
            return jsonify({'error': 'No image provided'}), 400

        file = request.files['image']

        # Read image
        image_bytes = file.read()
        image = Image.open(io.BytesIO(image_bytes)).convert('RGB')

        # Generate caption
        caption = generate_caption(caption_model, image, vocab, device)

        # Predict action
        action_result = predict_action(action_model, image, device)

        logger.info(f"Combined - Caption: {caption}, Action: {action_result['predicted_class']}")

        return jsonify({
            'success': True,
            'caption': caption,
            'action': {
                'predicted_action': action_result['predicted_class'],
                'confidence': action_result['confidence'],
                'all_predictions': action_result['all_predictions']
            }
        })

    except Exception as e:
        logger.error(f"Error in combined inference: {str(e)}")
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

def initialize_models():
    """Load all models once at startup."""
    global caption_model, action_model, vocab, device

    logger.info("Initializing models...")

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")

    # Load models
    try:
        caption_model, vocab = load_caption_model(device)
        logger.info("Caption model loaded")

        action_model = load_action_model(device)
        logger.info("Action model loaded")

        logger.info("All models initialized successfully!")

    except Exception as e:
        logger.error(f"Error loading models: {str(e)}")
        raise

if __name__ == '__main__':
    # Initialize models
    initialize_models()

    # Run Flask app
    app.run(
        host='0.0.0.0',
        port=5000,
        debug=False  # Keep debug off in production
    )
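For quick manual testing against a running server, a request can be sent with Python's requests library. This is a minimal sketch, assuming the host/port match the app.run() call above; the file name test.jpg is a placeholder, not part of the upload.

import requests

# Placeholder image path; any local JPEG/PNG will do
with open('test.jpg', 'rb') as f:
    resp = requests.post(
        'http://localhost:5000/api/combined',
        files={'image': f}  # the field name must be 'image', per the endpoints above
    )

print(resp.json())  # e.g. {'success': True, 'caption': ..., 'action': {...}}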
inference.py
ADDED
@@ -0,0 +1,121 @@
"""
Inference utilities for image captioning and action recognition.
"""

import torch
from torchvision import transforms
import pickle
from pathlib import Path

# Image transformations (ImageNet normalization)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Action class names are loaded once and cached
_action_class_names = None

def get_action_class_names():
    """Load and cache the action class names."""
    global _action_class_names
    if _action_class_names is None:
        model_dir = Path(__file__).parent.parent / 'models'
        with open(model_dir / 'action_model_config.pkl', 'rb') as f:
            config = pickle.load(f)
        _action_class_names = config['class_names']
    return _action_class_names

def generate_caption(model, image, vocab, device, max_length=30):
    """
    Generate a caption for an image.

    Args:
        model: Trained caption model
        image: PIL Image
        vocab: Vocabulary object
        device: torch device
        max_length: Maximum caption length

    Returns:
        caption: Generated caption string
    """
    model.eval()

    # Transform image
    image_tensor = transform(image).unsqueeze(0).to(device)

    # Generate caption
    with torch.no_grad():
        caption_indices = model.generate_caption(image_tensor, max_length)

    # Decode caption
    caption_indices = caption_indices[0].cpu().numpy()
    caption_words = vocab.decode(caption_indices)

    # Remove special tokens and build the caption
    caption = []
    for word in caption_words:
        if word == vocab.start_token:
            continue
        if word == vocab.end_token:
            break
        if word == vocab.pad_token:
            break
        caption.append(word)

    caption_text = ' '.join(caption)

    # Capitalize first letter
    if caption_text:
        caption_text = caption_text[0].upper() + caption_text[1:]

    return caption_text

def predict_action(model, image, device):
    """
    Predict the action in an image.

    Args:
        model: Trained action model
        image: PIL Image
        device: torch device

    Returns:
        dict: Prediction results with class, confidence, and all predictions
    """
    model.eval()

    # Get class names
    class_names = get_action_class_names()

    # Transform image
    image_tensor = transform(image).unsqueeze(0).to(device)

    # Predict
    with torch.no_grad():
        outputs = model(image_tensor)
        probabilities = torch.softmax(outputs, dim=1)
        confidence, predicted_idx = probabilities.max(dim=1)

    predicted_class = class_names[predicted_idx.item()]
    confidence_percent = confidence.item() * 100

    # Convert all class probabilities to percentages
    all_probs = probabilities[0].cpu().numpy() * 100

    # Build a list of per-class predictions
    all_predictions = []
    for idx, prob in enumerate(all_probs):
        all_predictions.append({
            'class': class_names[idx],
            'probability': float(prob)
        })

    # Sort by probability, highest first
    all_predictions = sorted(all_predictions, key=lambda x: x['probability'], reverse=True)

    return {
        'predicted_class': predicted_class,
        'confidence': float(confidence_percent),
        'all_predictions': all_predictions[:5]  # Return top 5
    }
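generate_caption relies on the unpickled vocab object exposing decode() plus start/end/pad token attributes. The Vocabulary class itself is not part of this upload; a minimal sketch of the interface it would need (the class name, constructor, and token strings here are assumptions, not the trained vocabulary) looks like:

class Vocabulary:
    """Illustrative sketch of the interface generate_caption assumes."""
    def __init__(self, idx2word, start_token='<start>', end_token='<end>', pad_token='<pad>'):
        self.idx2word = idx2word        # e.g. {0: '<pad>', 1: '<start>', ...}
        self.start_token = start_token  # assumed token strings
        self.end_token = end_token
        self.pad_token = pad_token

    def decode(self, indices):
        """Map a sequence of integer indices back to word strings."""
        return [self.idx2word[int(i)] for i in indices]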
model_loader.py
ADDED
@@ -0,0 +1,138 @@
"""
Model definitions and loading utilities for the API.
"""

import torch
import torch.nn as nn
from torchvision import models
import pickle
from pathlib import Path

# Model architecture classes (same as in training)
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet50(pretrained=False)  # weights come from the checkpoint
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.fc = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.fc(features)
        features = self.bn(features)
        return features

class DecoderLSTM(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(DecoderLSTM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.fc(hiddens)
        return outputs

    def sample(self, features, max_length=50):
        """Greedy decoding: feed each prediction back in as the next input."""
        captions = []
        states = None
        inputs = features.unsqueeze(1)

        for _ in range(max_length):
            hiddens, states = self.lstm(inputs, states)
            outputs = self.fc(hiddens.squeeze(1))
            predicted = outputs.argmax(dim=1)
            captions.append(predicted)
            inputs = self.embed(predicted).unsqueeze(1)

        captions = torch.stack(captions, dim=1)
        return captions

class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(ImageCaptioningModel, self).__init__()
        self.encoder = EncoderCNN(embed_size)
        self.decoder = DecoderLSTM(embed_size, hidden_size, vocab_size, num_layers, dropout)

    def forward(self, images, captions):
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs

    def generate_caption(self, images, max_length=50):
        features = self.encoder(images)
        captions = self.decoder.sample(features, max_length)
        return captions

class ActionRecognitionModel(nn.Module):
    def __init__(self, num_classes, dropout=0.5):
        super(ActionRecognitionModel, self).__init__()
        self.backbone = models.resnet50(pretrained=False)  # weights come from the checkpoint
        num_features = self.backbone.fc.in_features

        self.backbone.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(num_features, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(dropout),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        return self.backbone(x)

def load_caption_model(device, model_dir='../models'):
    """Load the captioning model and its vocabulary."""
    model_dir = Path(model_dir)

    # Load configuration
    with open(model_dir / 'caption_model_config.pkl', 'rb') as f:
        config = pickle.load(f)

    # Load vocabulary
    with open(model_dir / 'vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Create model
    model = ImageCaptioningModel(
        embed_size=config['embed_size'],
        hidden_size=config['hidden_size'],
        vocab_size=config['vocab_size'],
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )

    # Load weights
    model.load_state_dict(torch.load(model_dir / 'caption_model_final.pth',
                                     map_location=device))
    model = model.to(device)
    model.eval()

    return model, vocab

def load_action_model(device, model_dir='../models'):
    """Load the action recognition model."""
    model_dir = Path(model_dir)

    # Load configuration
    with open(model_dir / 'action_model_config.pkl', 'rb') as f:
        config = pickle.load(f)

    # Create model
    model = ActionRecognitionModel(
        num_classes=config['num_classes'],
        dropout=config['dropout']
    )

    # Load weights
    model.load_state_dict(torch.load(model_dir / 'action_model_final.pth',
                                     map_location=device))
    model = model.to(device)
    model.eval()

    return model
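Both loaders assume the *_config.pkl files were written at training time with exactly the keys read above. A sketch of how producing them might look (the concrete values below are placeholders, not the trained configuration):

import pickle

# Placeholder values; the real ones come from the training run
caption_config = {
    'embed_size': 256,
    'hidden_size': 512,
    'vocab_size': 5000,
    'num_layers': 1,
    'dropout': 0.5,
}
with open('../models/caption_model_config.pkl', 'wb') as f:
    pickle.dump(caption_config, f)

action_config = {
    'num_classes': 2,
    'dropout': 0.5,
    'class_names': ['running', 'walking'],  # placeholder classes; length must equal num_classes
}
with open('../models/action_model_config.pkl', 'wb') as f:
    pickle.dump(action_config, f)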