Spaces:

CXM06
/

ai-image-caption-generation

Sleeping

App Files Files Community

CXM06 committed on Nov 11, 2025

Commit

d0c8d86

1 Parent(s): b293ac0

First commit - Individual test scripts present

Browse files

Files changed (17) hide show

.dockerignore +0 -0
.gitattributes +45 -0
.gitignore +129 -0
Dockerfile +75 -0
README.md +226 -2
app.py +396 -0
config.py +327 -0
docker-compose.yml +0 -0
requirements.txt +24 -0
src/__init__.py +10 -0
src/models/__init__.py +35 -0
src/models/caption_model.py +490 -0
src/models/style_model.py +361 -0
src/utils/__init__.py +54 -0
src/utils/analytics.py +373 -0
src/utils/cache_manager.py +403 -0
src/utils/image_processor.py +373 -0

.dockerignore ADDED Viewed

File without changes

.gitattributes ADDED Viewed

	@@ -0,0 +1,45 @@

+# ============================================================================
+# GIT LFS CONFIGURATION FOR HUGGINGFACE SPACES
+# ============================================================================
+# Track large model files with Git LFS
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+# Track large media files
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.avi filter=lfs diff=lfs merge=lfs -text
+*.mov filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+# Track large data files
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+# Python files as text
+*.py text eol=lf
+*.md text eol=lf
+*.txt text eol=lf
+*.json text eol=lf
+*.yaml text eol=lf
+*.yml text eol=lf
+# Configuration files
+*.toml text eol=lf
+*.ini text eol=lf
+*.cfg text eol=lf
+# Docker files
+Dockerfile text eol=lf
+*.dockerfile text eol=lf
+docker-compose.yml text eol=lf
+# Shell scripts
+*.sh text eol=lf
+*.bash text eol=lf

.gitignore ADDED Viewed

	@@ -0,0 +1,129 @@

+# ============================================================================
+# AI IMAGE CAPTION GENERATOR - GIT IGNORE
+# ============================================================================
+# Environment Variables
+.env
+.env.local
+.env.*.local
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# Virtual Environment
+venv/
+env/
+ENV/
+env.bak/
+venv.bak/
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+# Jupyter Notebooks
+.ipynb_checkpoints/
+*.ipynb
+# Model Cache & Downloads
+cache/
+*.pth
+*.bin
+*.safetensors
+*.onnx
+downloads/
+# Analytics & Logs
+*.log
+logs/
+analytics.json
+cache/analytics.json
+test_analytics.json
+# Test Coverage
+.coverage
+.pytest_cache/
+htmlcov/
+.tox/
+.nox/
+coverage.xml
+*.cover
+.hypothesis/
+# Gradio
+gradio_cached_examples/
+flagged/
+# Docker
+*.pid
+*.seed
+*.pid.lock
+# Temporary Files
+*.tmp
+*.temp
+.tmp/
+temp/
+# OS Files
+Thumbs.db
+.DS_Store
+desktop.ini
+# Backup Files
+*.bak
+*.backup
+*~
+# Large Files (use Git LFS instead)
+*.mp4
+*.avi
+*.mov
+*.zip
+*.tar.gz
+# HuggingFace Spaces specific
+spaces/
+# PyTorch
+*.pt
+lightning_logs/
+# Profiling
+*.prof
+# Documentation builds
+docs/_build/
+docs/.doctrees/
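+# ============================================================================
+# KEEP THESE DIRECTORIES (create .gitkeep files)
+# NOTE: Git cannot re-include a file whose parent directory is excluded, so
+# `!cache/.gitkeep` only takes effect if the `cache/` rule above is written
+# as `cache/*` (ignore the contents, not the directory itself).
+# ============================================================================
+!cache/.gitkeep
+!static/images/examples/.gitkeep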

Dockerfile ADDED Viewed

	@@ -0,0 +1,75 @@

+# ============================================================================
+# AI IMAGE CAPTION GENERATOR - DOCKERFILE
+# ============================================================================
+# Multi-stage build for optimized production image
+# Compatible with HuggingFace Spaces and local deployment
+# ============================================================================
+FROM python:3.10-slim AS base
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    DEBIAN_FRONTEND=noninteractive
+# Set working directory
+WORKDIR /app
+# ============================================================================
+# DEPENDENCIES STAGE
+# ============================================================================
+FROM base AS dependencies
+# Install system dependencies needed to build/install the Python packages.
+# libgl1 is used instead of the transitional libgl1-mesa-glx package, which
+# newer Debian releases (and therefore newer python:3.10-slim images) no longer ship.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    git \
+    curl \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --upgrade pip && \
+    pip install -r requirements.txt
+# ============================================================================
+# RUNTIME STAGE
+# ============================================================================
+FROM base AS runtime
+# Install runtime dependencies only.
+# curl must be installed in this stage because the HEALTHCHECK below runs
+# inside the runtime image; the slim base image does not include it.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Copy Python dependencies from builder
+COPY --from=dependencies /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
+COPY --from=dependencies /usr/local/bin /usr/local/bin
+# Copy application code
+COPY . .
+# Create necessary directories
+RUN mkdir -p cache/models cache/analytics static/images/examples
+# Set permissions
+RUN chmod -R 755 /app
+# Expose Gradio default port
+EXPOSE 7860
+# Health check (relies on curl installed in the runtime stage above)
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:7860/ || exit 1
+# Run the application
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,2 +1,226 @@
-# ai-image-caption-generator
-🤖 Multi-model AI image captioning with customizable styles. Powered by BLIP, GIT, and Groq API. Built with Gradio for easy deployment.

+# 🖼️ AI Image Caption Generator
+[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+[![PyTorch](https://img.shields.io/badge/PyTorch-2.1.0-EE4C2C.svg)](https://pytorch.org/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/ChinmayM06/ai-image-caption-generator)
+> Generate AI-powered image captions with multiple style options—completely free, no API costs.
+A lightweight, GPU-accelerated image captioning tool using state-of-the-art vision-language models (BLIP & GIT) with style customization powered by Groq's free LLM API.
+---
+## ✨ Features
+- 🎯 **Dual Model Support**: Both BLIP-base (fast) and GIT-large (high quality) run simultaneously
+- 🎨 **5 Caption Styles**: None, Creative, Social Media, Professional, Technical
+- ⚡ **GPU Accelerated**: Optimized for NVIDIA GPUs (works on CPU too)
+- 💾 **Smart Caching**: LRU cache with configurable TTL for faster repeated requests
+- 📊 **Analytics Tracking**: Built-in usage statistics and performance metrics
+- 🖼️ **Image Processing**: Automatic validation, resizing, and format conversion
+- 🔄 **Fallback Mechanisms**: Graceful degradation when API is unavailable
+- 💰 **100% Free**: No OpenAI credits, no hidden costs
+- 🔒 **Privacy First**: Local inference option available
+---
+## 🚀 Live Demo
+Try it out without any installation:
+**[🎮 Launch Live Demo →](https://huggingface.co/spaces/ChinmayM06/ai-image-caption-generator)**
+*Note: update the link above if the Space is redeployed under a different URL.*
+---
+## 🛠️ Tech Stack
+| Component | Technology |
+|-----------|-----------|
+| **Vision Models** | BLIP-base, GIT-large (Hugging Face) |
+| **Style LLM** | Groq API (free tier) |
+| **Framework** | PyTorch 2.1.0 + CUDA 11.8 |
+| **Interface** | Gradio 4.8.0 |
+| **Deployment** | Hugging Face Spaces (T4 GPU) |
+---
+## 📦 Quick Start
+### Prerequisites
+- Python 3.10+
+- NVIDIA GPU with 4GB+ VRAM (recommended) or CPU
+- CUDA 11.8 (for GPU acceleration)
+### Installation
+```bash
+# Clone repository
+git clone https://github.com/ChinmayM06/ai-image-caption-generator.git
+cd ai-image-caption-generator
+# Create virtual environment
+python -m venv venv
+source venv/bin/activate  # Windows: venv\Scripts\activate
+# Install PyTorch with CUDA support
+pip install torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
+# Install dependencies
+pip install -r requirements.txt
+# Set up environment variables (optional)
+# Create a .env file in the project root with:
+# GROQ_API_KEY=your_groq_api_key_here
+# Get your free API key at https://console.groq.com
+# Note: The app works without API key but styling features will use fallback templates
+# Run the application
+python app.py
+```
+Access at `http://localhost:7860`
+---
+## 🎯 Usage
+### Basic Usage
+```python
+from src.models import get_model_manager, get_style_model
+from src.utils import get_image_processor
+from PIL import Image
+# Initialize components (singleton pattern)
+model_manager = get_model_manager()
+style_model = get_style_model()
+image_processor = get_image_processor()
+# Load models (BLIP and GIT)
+blip_success, git_success = model_manager.load_all_models()
+# Load and preprocess image
+image = Image.open("your_image.jpg")
+processed_img, metadata = image_processor.preprocess_image(image)
+# Generate captions from both models
+captions = model_manager.generate_captions(processed_img)
+blip_caption = captions["blip"]
+git_caption = captions["git"]
+# Apply style (optional)
+styled_blip = style_model.style_caption(blip_caption, style="Professional")
+styled_git = style_model.style_caption(git_caption, style="Creative")
+```
+### Available Models
+Both models run simultaneously to provide comparison:
+- **BLIP-base**: Fast inference (~1-2s), good quality, efficient
+- **GIT-large**: Slower (~3-4s), superior caption quality, more detailed
+### Caption Styles
+| Style | Use Case | Example |
+|-------|----------|---------|
+| **None** | Raw model output | "A dog sitting on grass" |
+| **Creative** | Artistic, imaginative | "A joyful golden retriever basking in nature's embrace" |
+| **Social Media** | Engaging, hashtag-ready | "Meet this good boy enjoying sunny vibes! 🐕☀️ #DogLife" |
+| **Professional** | Business, formal | "Canine subject positioned in outdoor environment" |
+| **Technical** | Detailed, analytical | "Golden retriever breed, seated posture, natural lighting, outdoor setting" |
+---
+## 🐳 Docker Deployment
+```bash
+# Build image
+docker build -t caption-generator .
+# Run container (with GPU)
+docker run --gpus all -p 7860:7860 caption-generator
+# Run container (CPU only)
+docker run -p 7860:7860 -e DEVICE=cpu caption-generator
+```
+---
+## ⚙️ Configuration
+### Environment Variables
+Create a `.env` file in the project root (optional):
+```bash
+# Groq API Key (optional: enables LLM-based styling; without it, fallback templates are used)
+GROQ_API_KEY=your_groq_api_key_here
+# Hardware Configuration (optional, defaults to 'cuda' if available)
+DEVICE=cuda  # or 'cpu'
+# Logging Level (optional)
+LOG_LEVEL=INFO  # DEBUG, INFO, WARNING, ERROR
+```
+---
+## 🎓 Why This Project?
+Built as a learning project to explore:
+- **GenAI Fundamentals**: Vision-language models, prompt engineering
+- **Practical ML Skills**: GPU optimization, model deployment, API integration
+- **Cost Optimization**: Demonstrating production-quality AI without expensive APIs
+- **Software Architecture**: Caching, analytics, error handling, thread safety
+Perfect for understanding how modern image captioning works under the hood while keeping infrastructure costs at zero.
+---
+## 🤝 Contributing
+Contributions welcome! Feel free to:
+- Report bugs
+- Suggest features
+- Submit pull requests
+- Improve documentation
+- Add new caption styles
+- Optimize performance
+---
+## 📝 License
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+---
+## 🙏 Acknowledgments
+- [Salesforce BLIP](https://github.com/salesforce/BLIP) - Image captioning model
+- [Microsoft GIT](https://github.com/microsoft/GenerativeImage2Text) - High-quality captions
+- [Groq](https://groq.com) - Free LLM inference API
+- [Hugging Face](https://huggingface.co) - Model hosting & deployment
+---
+## 📬 Contact
+**Chinmay M** - [@ChinmayM06](https://github.com/ChinmayM06)
+Project Link: [https://github.com/ChinmayM06/ai-image-caption-generator](https://github.com/ChinmayM06/ai-image-caption-generator)
+---
+<div align="center">
+**[⭐ Star this repo](https://github.com/ChinmayM06/ai-image-caption-generator)** if you find it helpful!
+Made with ❤️ and lots of ☕
+</div>

app.py ADDED Viewed

	@@ -0,0 +1,396 @@

+"""
+AI Image Caption Generator - Main Application
+Gradio-based web interface for generating image captions using BLIP and GIT models
+with customizable styling via Groq API.
+"""
+import gradio as gr
+import time
+import numpy as np
+from pathlib import Path
+from PIL import Image
+from typing import Tuple, Optional
+# Import our modules
+from config import ui_config, performance_config
+from src.utils import (
+    get_image_processor,
+    get_caption_cache,
+    get_analytics_manager,
+    ImageProcessingError
+)
+from src.models import (
+    get_model_manager,
+    get_style_model,
+    CaptionModelError,
+    StyleModelError
+)
+class CaptionGeneratorApp:
+    """
+    Main application class for the caption generator
+    Manages the Gradio interface and coordinates all components
+    """
+    def __init__(self):
+        """Initialize the application"""
+        print("=" * 60)
+        print("🚀 INITIALIZING AI IMAGE CAPTION GENERATOR")
+        print("=" * 60)
+        # Initialize components
+        self.image_processor = get_image_processor()
+        self.model_manager = get_model_manager()
+        self.style_model = get_style_model()
+        self.cache = get_caption_cache()
+        self.analytics = get_analytics_manager()
+        print("\n✓ Components initialized")
+        # Load models
+        print("\n📦 Loading AI models (this may take a few minutes on first run)...")
+        blip_success, git_success = self.model_manager.load_all_models()
+        if not (blip_success and git_success):
+            print("\n⚠️  Warning: Some models failed to load")
+            print(f"   BLIP: {'✓' if blip_success else '✗'}")
+            print(f"   GIT: {'✓' if git_success else '✗'}")
+        else:
+            print("\n✓ All models loaded successfully")
+        # Check style model
+        if self.style_model.is_api_available():
+            print("✓ Groq API connected")
+        else:
+            print("⚠️  Groq API not available - using fallback styling")
+        print("\n" + "=" * 60)
+        print("✅ INITIALIZATION COMPLETE")
+        print("=" * 60 + "\n")
+    def generate_captions(
+        self,
+        image,  # Changed: Can be path or PIL Image
+        style: str,
+        progress=gr.Progress()
+    ) -> Tuple[str, str, str]:
+        """
+        Generate captions for an image
+        Args:
+            image: Image path (str) or PIL Image
+            style: Style to apply
+            progress: Gradio progress tracker
+        Returns:
+            Tuple[str, str, str]: (blip_caption, git_caption, stats_text)
+        """
+        start_time = time.time()
+        try:
+            # Step 1: Validate and preprocess image
+            progress(0.1, desc="Validating image...")
+            if image is None:
+                return (
+                    "❌ Error: No image provided",
+                    "❌ Error: No image provided",
+                    "⚠️ Please upload an image"
+                )
+            # Convert to PIL Image from various formats
+            try:
+                if isinstance(image, str):
+                    # File path
+                    pil_image = Image.open(image)
+                elif isinstance(image, Image.Image):
+                    # Already PIL Image
+                    pil_image = image
+                elif hasattr(image, 'shape'):
+                    # Numpy array
+                    import numpy as np
+                    if isinstance(image, np.ndarray):
+                        pil_image = Image.fromarray(image.astype('uint8'))
+                    else:
+                        raise ValueError("Unsupported array type")
+                else:
+                    return (
+                        f"❌ Error: Unsupported image type: {type(image)}",
+                        f"❌ Error: Unsupported image type: {type(image)}",
+                        "⚠️ Image format not supported"
+                    )
+            except Exception as e:
+                return (
+                    f"❌ Error: Cannot load image - {str(e)}",
+                    f"❌ Error: Cannot load image - {str(e)}",
+                    "⚠️ Image loading failed"
+                )
+            # Validate image
+            is_valid, error_msg = self.image_processor.validate_image(pil_image)
+            if not is_valid:
+                return (
+                    f"❌ Error: {error_msg}",
+                    f"❌ Error: {error_msg}",
+                    "⚠️ Image validation failed"
+                )
+            # Preprocess image
+            progress(0.2, desc="Processing image...")
+            processed_img, metadata = self.image_processor.preprocess_image(pil_image)
+            # Generate image hash for caching
+            image_hash = self.image_processor.generate_image_hash(processed_img)
+            # Step 2: Check cache
+            progress(0.3, desc="Checking cache...")
+            blip_cached = self.cache.get_caption(image_hash, "blip", style)
+            git_cached = self.cache.get_caption(image_hash, "git", style)
+            # Step 3: Generate captions if not cached
+            raw_captions = {}
+            if blip_cached is None or git_cached is None:
+                progress(0.4, desc="Generating captions...")
+                raw_captions = self.model_manager.generate_captions(processed_img)
+            # Step 4: Apply styling
+            progress(0.6, desc=f"Applying {style} style...")
+            styled_captions = {}
+            # BLIP caption
+            if blip_cached:
+                styled_captions["blip"] = blip_cached
+            else:
+                blip_raw = raw_captions.get("blip", "Error generating caption")
+                styled_captions["blip"] = self.style_model.style_caption(blip_raw, style)
+                self.cache.set_caption(image_hash, "blip", style, styled_captions["blip"])
+            # GIT caption
+            if git_cached:
+                styled_captions["git"] = git_cached
+            else:
+                git_raw = raw_captions.get("git", "Error generating caption")
+                styled_captions["git"] = self.style_model.style_caption(git_raw, style)
+                self.cache.set_caption(image_hash, "git", style, styled_captions["git"])
+            # Step 5: Record analytics
+            progress(0.9, desc="Finalizing...")
+            processing_time = time.time() - start_time
+            # Record for each model
+            self.analytics.record_caption_generation("blip", style, processing_time / 2, True)
+            self.analytics.record_caption_generation("git", style, processing_time / 2, True)
+            # Get stats
+            stats_text = self.analytics.get_display_stats()
+            stats_text += f" | ⏱️ This generation: {processing_time:.2f}s"
+            progress(1.0, desc="Complete!")
+            return (
+                styled_captions.get("blip", "Error"),
+                styled_captions.get("git", "Error"),
+                stats_text
+            )
+        except ImageProcessingError as e:
+            error_msg = f"❌ Image Error: {str(e)}"
+            return error_msg, error_msg, "⚠️ Image processing failed"
+        except CaptionModelError as e:
+            error_msg = f"❌ Model Error: {str(e)}"
+            return error_msg, error_msg, "⚠️ Caption generation failed"
+        except Exception as e:
+            error_msg = f"❌ Unexpected Error: {str(e)}"
+            print(f"Error in generate_captions: {e}")
+            # Record error
+            self.analytics.record_caption_generation("unknown", style, 0, False)
+            return error_msg, error_msg, "⚠️ An error occurred"
+    def create_interface(self) -> gr.Blocks:
+        """
+        Create Gradio interface
+        Returns:
+            gr.Blocks: Configured Gradio interface
+        """
+        with gr.Blocks(
+            theme=gr.themes.Soft(),
+            title=ui_config.TITLE,
+            css=self._get_custom_css()
+        ) as interface:
+            # Header
+            gr.Markdown(f"# {ui_config.TITLE}")
+            gr.Markdown(ui_config.DESCRIPTION)
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # Input section
+                    gr.Markdown("### 📤 Upload Image")
+                    image_input = gr.Image(
+                        label="Upload your image",
+                        type="pil",
+                        height=ui_config.IMAGE_HEIGHT
+                    )
+                    style_dropdown = gr.Dropdown(
+                        choices=self.style_model.get_available_styles(),
+                        value="Professional",
+                        label="🎨 Choose Caption Style",
+                        info="Select how you want your caption to be styled"
+                    )
+                    generate_btn = gr.Button(
+                        "✨ Generate Captions",
+                        variant="primary",
+                        size="lg"
+                    )
+                with gr.Column(scale=1):
+                    # Output section
+                    gr.Markdown("### 📝 Generated Captions")
+                    with gr.Group():
+                        gr.Markdown("**🤖 BLIP Caption**")
+                        blip_output = gr.Textbox(
+                            label="",
+                            placeholder="BLIP caption will appear here...",
+                            lines=3,
+                            show_copy_button=True
+                        )
+                    with gr.Group():
+                        gr.Markdown("**🤖 GIT Caption**")
+                        git_output = gr.Textbox(
+                            label="",
+                            placeholder="GIT caption will appear here...",
+                            lines=3,
+                            show_copy_button=True
+                        )
+            # Statistics section
+            with gr.Row():
+                stats_display = gr.Markdown(
+                    value=self.analytics.get_display_stats(),
+                    elem_id="stats-display"
+                )
+            # Examples section (if examples exist)
+            examples_dir = ui_config.EXAMPLES_DIR
+            if examples_dir.exists() and list(examples_dir.glob("*.jpg")):
+                gr.Markdown("### 💡 Try These Examples")
+                gr.Examples(
+                    examples=[str(p) for p in examples_dir.glob("*.jpg")[:3]],
+                    inputs=image_input,
+                    label=""
+                )
+            # Footer
+            gr.Markdown(
+                """
+                ---
+                <div style='text-align: center; color: #666; font-size: 0.9em;'>
+                    <p>🚀 Powered by BLIP, GIT, and Groq API | Built with ❤️ using Gradio</p>
+                    <p>⚡ Free and Open Source | 📊 All processing done securely</p>
+                </div>
+                """,
+                elem_id="footer"
+            )
+            # Event handlers
+            generate_btn.click(
+                fn=self.generate_captions,
+                inputs=[image_input, style_dropdown],
+                outputs=[blip_output, git_output, stats_display],
+                api_name="generate"
+            )
+        return interface
+    def _get_custom_css(self) -> str:
+        """
+        Get custom CSS for the interface
+        The rules target the ``#stats-display`` and ``#footer`` element ids and
+        the ``.gr-button-primary`` class (including its hover state).
+        Returns:
+            str: Raw CSS text
+        """
+        return """
+        #stats-display {
+            padding: 15px;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            border-radius: 10px;
+            text-align: center;
+            font-weight: 500;
+            margin: 20px 0;
+        }
+        #footer {
+            margin-top: 30px;
+        }
+        .gr-button-primary {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+            border: none !important;
+            font-weight: 600 !important;
+        }
+        .gr-button-primary:hover {
+            transform: translateY(-2px);
+            box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
+            transition: all 0.3s ease;
+        }
+        """
+    def launch(
+        self,
+        share: bool = False,
+        server_name: str = "0.0.0.0",
+        server_port: int = 7860
+    ):
+        """
+        Launch the Gradio interface
+        Args:
+            share: Create public URL
+            server_name: Server host
+            server_port: Server port
+        """
+        interface = self.create_interface()
+        interface.launch(
+            share=share,
+            server_name=server_name,
+            server_port=server_port,
+            show_api=ui_config.SHOW_API,
+            show_error=ui_config.SHOW_ERROR
+        )
+def main():
+    """Main entry point"""
+    try:
+        app = CaptionGeneratorApp()
+        app.launch(
+            share=False,  # Set to True to create public URL
+            server_name="0.0.0.0",
+            server_port=7860
+        )
+    except KeyboardInterrupt:
+        print("\n\n👋 Shutting down gracefully...")
+    except Exception as e:
+        print(f"\n❌ Fatal error: {e}")
+        raise
+if __name__ == "__main__":
+    main()

config.py ADDED Viewed

	@@ -0,0 +1,327 @@

+"""
+Centralized Configuration Module
+This module contains all configuration settings for the AI Image Caption Generator.
+Follows the single source of truth principle for easy maintenance and deployment.
+"""
+import os
+from pathlib import Path
+from typing import Dict, List, Final
+from dataclasses import dataclass
+from dotenv import load_dotenv
+# Load environment variables
+load_dotenv()
+# ============================================================================
+# PROJECT PATHS
+# ============================================================================
+# All paths below are derived from the directory that contains this file
+PROJECT_ROOT: Final[Path] = Path(__file__).parent
+CACHE_DIR: Final[Path] = PROJECT_ROOT / "cache"
+MODEL_CACHE_DIR: Final[Path] = CACHE_DIR / "models"
+ANALYTICS_FILE: Final[Path] = CACHE_DIR / "analytics.json"
+STATIC_DIR: Final[Path] = PROJECT_ROOT / "static"
+# Create directories if they don't exist
+# NOTE: this loop runs at import time, so importing this module touches the filesystem
+for directory in [CACHE_DIR, MODEL_CACHE_DIR, STATIC_DIR]:
+    directory.mkdir(parents=True, exist_ok=True)
+# ============================================================================
+# MODEL CONFIGURATION
+# ============================================================================
+@dataclass(frozen=True)
+class ModelConfig:
+    """Configuration for caption generation models"""
+    # BLIP Model
+    BLIP_MODEL_NAME: str = "Salesforce/blip-image-captioning-base"
+    BLIP_MAX_LENGTH: int = 50
+    BLIP_NUM_BEAMS: int = 3
+    # GIT Model
+    GIT_MODEL_NAME: str = "microsoft/git-large-coco"
+    GIT_MAX_LENGTH: int = 50
+    GIT_NUM_BEAMS: int = 3
+    # Device Configuration
+    DEVICE: str = "cuda"  # Will auto-fallback to CPU if CUDA unavailable
+    # Memory Management
+    MODEL_CACHE_DIR: Path = MODEL_CACHE_DIR
+    LOW_MEMORY_MODE: bool = False  # Enable for systems with <8GB GPU memory
+# ============================================================================
+# IMAGE PROCESSING CONFIGURATION
+# ============================================================================
+@dataclass(frozen=True)
+class ImageConfig:
+    """
+    Configuration for image validation and preprocessing
+    Immutable (frozen dataclass); every field has a default.
+    """
+    # Size Constraints
+    MAX_FILE_SIZE_MB: int = 5
+    # Computed once, while the class body is evaluated, from the default above;
+    # passing a different MAX_FILE_SIZE_MB to the constructor does not recompute it
+    MAX_FILE_SIZE_BYTES: int = MAX_FILE_SIZE_MB * 1024 * 1024
+    MAX_DIMENSION: int = 512  # Max width/height for model input
+    MIN_DIMENSION: int = 32   # Minimum acceptable dimension
+    # Supported Formats
+    ALLOWED_FORMATS: tuple = ("JPEG", "PNG", "WEBP", "JPG")
+    ALLOWED_EXTENSIONS: tuple = (".jpg", ".jpeg", ".png", ".webp")
+    # Processing
+    RESIZE_QUALITY: int = 95  # JPEG quality after resize
+    MAINTAIN_ASPECT_RATIO: bool = True
+# ============================================================================
+# GROQ API CONFIGURATION
+# ============================================================================
+@dataclass(frozen=True)
+class GroqConfig:
+    """
+    Configuration for Groq API styling
+    Immutable (frozen dataclass); every field has a default.
+    """
+    # API Settings
+    # Read from the GROQ_API_KEY environment variable once, when this module is
+    # imported; an empty string means no key was provided
+    API_KEY: str = os.getenv("GROQ_API_KEY", "")
+    MODEL_NAME: str = "llama-3.1-8b-instant"
+    # Request Parameters
+    MAX_TOKENS: int = 150
+    TEMPERATURE: float = 0.7
+    TOP_P: float = 0.9
+    TIMEOUT_SECONDS: int = 10
+    # Retry Logic
+    MAX_RETRIES: int = 3
+    RETRY_DELAY_SECONDS: float = 1.0
+    # Rate Limiting
+    REQUESTS_PER_MINUTE: int = 30
+# ============================================================================
+# STYLE CONFIGURATION
+# ============================================================================
+class StyleConfig:
+    """
+    Configuration for caption styling options
+    A plain class (not a dataclass) whose values are class-level attributes.
+    """
+    # Maps each style name to the instruction text associated with that style
+    STYLES: Final[Dict[str, str]] = {
+        "None": "Keep the original caption without any modifications.",
+        "Professional": "Rewrite this image caption in a professional, business-appropriate tone. Make it clear, formal, and suitable for corporate presentations or reports.",
+        "Creative": "Transform this caption into a creative, artistic, and imaginative description. Use vivid language and engaging expressions.",
+        "Social Media": "Rewrite this caption for social media platforms. Make it engaging, add relevant emojis, and make it shareable. Keep it under 280 characters.",
+        "Technical": "Rewrite this caption with technical precision and detailed analysis. Focus on specific elements, composition, and visual characteristics."
+    }
+    DEFAULT_STYLE: Final[str] = "Professional"
+    # Fallback templates when API fails
+    # Keys match STYLES; each template contains a {caption} placeholder
+    FALLBACK_TEMPLATES: Final[Dict[str, str]] = {
+        "Professional": "Image Description: {caption}",
+        "Creative": "✨ {caption} ✨",
+        "Social Media": "📸 {caption} #AI #ImageCaption",
+        "Technical": "Visual Analysis: {caption}",
+        "None": "{caption}"
+    }
+# ============================================================================
+# CACHE CONFIGURATION
+# ============================================================================
+@dataclass(frozen=True)
+class CacheConfig:
+    """
+    Configuration for caching system
+    Immutable (frozen dataclass); every field has a default.
+    """
+    # Cache Settings
+    MAX_CACHE_SIZE: int = 100  # Maximum number of cached items
+    CACHE_TTL_SECONDS: int = 3600  # Time to live: 1 hour
+    # Cache Keys
+    ENABLE_CAPTION_CACHE: bool = True
+    CACHE_KEY_ALGO: str = "md5"  # Hashing algorithm for cache keys
+# ============================================================================
+# ANALYTICS CONFIGURATION
+# ============================================================================
+@dataclass(frozen=True)
+class AnalyticsConfig:
+    """
+    Configuration for usage analytics
+    Immutable (frozen dataclass); every field has a default.
+    """
+    # Storage
+    # Defaults to the module-level ANALYTICS_FILE path (cache/analytics.json under the project root)
+    ANALYTICS_FILE: Path = ANALYTICS_FILE
+    SAVE_INTERVAL_SECONDS: int = 30  # Auto-save every 30 seconds
+    # Metrics to Track
+    TRACK_PROCESSING_TIME: bool = True
+    TRACK_STYLE_USAGE: bool = True
+    TRACK_MODEL_USAGE: bool = True
+    TRACK_ERROR_RATE: bool = True
+# ============================================================================
+# GRADIO UI CONFIGURATION
+# ============================================================================
+@dataclass(frozen=True)
+class UIConfig:
+    """Configuration for Gradio interface"""
+    # App Metadata
+    TITLE: str = "🖼️ AI Image Caption Generator"
+    DESCRIPTION: str = """
+    Generate professional image captions using state-of-the-art AI models.
+    Upload an image and choose your preferred style - get instant captions from both BLIP and GIT models.
+    """
+    # UI Settings
+    THEME: str = "soft"  # Gradio theme
+    SHOW_API: bool = False
+    SHOW_ERROR: bool = True
+    # Component Settings
+    IMAGE_HEIGHT: int = 400
+    MAX_QUEUE_SIZE: int = 10
+    # Example Images
+    EXAMPLES_DIR: Path = STATIC_DIR / "images" / "examples"
+# ============================================================================
+# LOGGING CONFIGURATION
+# ============================================================================
+@dataclass(frozen=True)
+class LogConfig:
+    """Configuration for logging"""
+    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
+    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
+# ============================================================================
+# PERFORMANCE CONFIGURATION
+# ============================================================================
+@dataclass(frozen=True)
+class PerformanceConfig:
+    """Configuration for performance optimization"""
+    # Processing Timeouts
+    MAX_PROCESSING_TIME_SECONDS: int = 30
+    # Model Loading
+    LAZY_LOAD_MODELS: bool = False  # Load models on first use vs startup
+    # Batch Processing (future feature)
+    ENABLE_BATCH_PROCESSING: bool = False
+    MAX_BATCH_SIZE: int = 1
+# ============================================================================
+# INSTANTIATE CONFIGURATIONS
+# ============================================================================
+# Create singleton instances
+model_config = ModelConfig()
+image_config = ImageConfig()
+groq_config = GroqConfig()
+style_config = StyleConfig()
+cache_config = CacheConfig()
+analytics_config = AnalyticsConfig()
+ui_config = UIConfig()
+log_config = LogConfig()
+performance_config = PerformanceConfig()
+# ============================================================================
+# VALIDATION
+# ============================================================================
+def validate_config() -> tuple[bool, list[str]]:
+    """
+    Validate all configuration settings
+    Returns:
+        tuple: (is_valid, list_of_errors)
+    """
+    errors = []
+    # Check Groq API Key
+    if not groq_config.API_KEY:
+        errors.append("GROQ_API_KEY not found in environment variables")
+    # Check required directories
+    required_dirs = [CACHE_DIR, MODEL_CACHE_DIR]
+    for directory in required_dirs:
+        if not directory.exists():
+            errors.append(f"Required directory not found: {directory}")
+    # Validate image constraints
+    if image_config.MAX_DIMENSION < image_config.MIN_DIMENSION:
+        errors.append("MAX_DIMENSION must be greater than MIN_DIMENSION")
+    # Validate style options
+    if not style_config.STYLES:
+        errors.append("No style options configured")
+    if style_config.DEFAULT_STYLE not in style_config.STYLES:
+        errors.append(f"Default style '{style_config.DEFAULT_STYLE}' not in available styles")
+    return len(errors) == 0, errors
+# ============================================================================
+# CONFIGURATION SUMMARY
+# ============================================================================
+def print_config_summary() -> None:
+    """Print configuration summary for debugging"""
+    print("=" * 60)
+    print("AI IMAGE CAPTION GENERATOR - CONFIGURATION SUMMARY")
+    print("=" * 60)
+    print(f"Project Root: {PROJECT_ROOT}")
+    print(f"Cache Directory: {CACHE_DIR}")
+    print(f"Model Cache: {MODEL_CACHE_DIR}")
+    print(f"\nModels:")
+    print(f"  - BLIP: {model_config.BLIP_MODEL_NAME}")
+    print(f"  - GIT: {model_config.GIT_MODEL_NAME}")
+    print(f"  - Device: {model_config.DEVICE}")
+    print(f"\nGroq API:")
+    print(f"  - Model: {groq_config.MODEL_NAME}")
+    print(f"  - API Key: {'✓ Configured' if groq_config.API_KEY else '✗ Missing'}")
+    print(f"\nImage Processing:")
+    print(f"  - Max Size: {image_config.MAX_FILE_SIZE_MB}MB")
+    print(f"  - Max Dimension: {image_config.MAX_DIMENSION}px")
+    print(f"  - Formats: {', '.join(image_config.ALLOWED_FORMATS)}")
+    print(f"\nStyle Options: {len(style_config.STYLES)}")
+    for style in style_config.STYLES.keys():
+        print(f"  - {style}")
+    print(f"\nCache: {cache_config.MAX_CACHE_SIZE} items")
+    print(f"Analytics: {analytics_config.ANALYTICS_FILE}")
+    print("=" * 60)
+    # Validate configuration
+    is_valid, errors = validate_config()
+    if not is_valid:
+        print("\n⚠️  CONFIGURATION ERRORS:")
+        for error in errors:
+            print(f"  - {error}")
+        print("=" * 60)
+    else:
+        print("\n✓ Configuration validated successfully")
+        print("=" * 60)
+if __name__ == "__main__":
+    # Executed directly: print the summary, which also runs validate_config()
+    print_config_summary()

docker-compose.yml ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+# Core Framework
+torch==2.1.0
+torchvision==0.16.0
+transformers==4.35.0
+gradio==4.8.0
+accelerate==0.25.0
+# Image Processing
+Pillow==10.0.1
+opencv-python==4.8.1.78
+# API Integration
+groq>=0.33.0
+requests==2.31.0
+# Utilities
+python-dotenv==1.0.0
+numpy==1.24.3
+tqdm==4.66.1
+# Development
+pytest==7.4.3
+black==23.9.1
+flake8==6.1.0

src/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+"""
+Source Package Initialization
+Makes src a proper Python package and defines the package metadata.
+"""
+__version__ = "1.0.0"
+__author__ = "AI Caption Generator Team"
+# Nothing is re-exported from here; this file only marks src as a package

src/models/__init__.py ADDED Viewed

	@@ -0,0 +1,35 @@

+"""
+Models Package
+Provides caption generation and styling models.
+"""
+from .caption_model import (
+    CaptionModel,
+    BLIPModel,
+    GITModel,
+    CaptionModelManager,
+    CaptionModelError,
+    get_model_manager
+)
+from .style_model import (
+    StyleModel,
+    StyleModelError,
+    get_style_model
+)
+# Public names of this package; each entry matches one of the imports above
+__all__ = [
+    # Caption Models
+    "CaptionModel",
+    "BLIPModel",
+    "GITModel",
+    "CaptionModelManager",
+    "CaptionModelError",
+    "get_model_manager",
+    # Style Model
+    "StyleModel",
+    "StyleModelError",
+    "get_style_model",
+]

src/models/caption_model.py ADDED Viewed

	@@ -0,0 +1,490 @@

+"""
+Caption Model Module
+Manages BLIP and GIT models for image caption generation.
+Handles model loading, inference, and memory management.
+"""
+import torch
+from PIL import Image
+from typing import Optional, Dict, Tuple
+from transformers import (
+    BlipProcessor,
+    BlipForConditionalGeneration,
+    AutoProcessor,
+    AutoModelForCausalLM
+)
+import gc
+from config import model_config
+class CaptionModelError(Exception):
+    """Custom exception for caption model errors"""
+    pass
+class CaptionModel:
+    """
+    Base class for caption generation models
+    Provides common interface for BLIP and GIT models
+    """
+    def __init__(self, model_name: str, device: str = "cuda"):
+        """
+        Initialize caption model
+        Args:
+            model_name: HuggingFace model identifier
+            device: Device to load model on (cuda/cpu)
+        """
+        self.model_name = model_name
+        self.device = self._get_device(device)
+        self.processor = None
+        self.model = None
+        self._is_loaded = False
+    def _get_device(self, requested_device: str) -> str:
+        """
+        Determine available device
+        Args:
+            requested_device: Requested device (cuda/cpu)
+        Returns:
+            str: Available device
+        """
+        if requested_device == "cuda" and torch.cuda.is_available():
+            return "cuda"
+        return "cpu"
+    def load(self) -> bool:
+        """
+        Load model into memory
+        Returns:
+            bool: True if successful
+        """
+        raise NotImplementedError("Subclass must implement load()")
+    def generate_caption(
+        self,
+        image: Image.Image,
+        max_length: int = 50,
+        num_beams: int = 3
+    ) -> str:
+        """
+        Generate caption for image
+        Args:
+            image: PIL Image
+            max_length: Maximum caption length
+            num_beams: Number of beams for beam search
+        Returns:
+            str: Generated caption
+        """
+        raise NotImplementedError("Subclass must implement generate_caption()")
+    def unload(self) -> None:
+        """Unload model from memory"""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.processor is not None:
+            del self.processor
+            self.processor = None
+        gc.collect()
+        if self.device == "cuda":
+            torch.cuda.empty_cache()
+        self._is_loaded = False
+    def is_loaded(self) -> bool:
+        """Check if model is loaded"""
+        return self._is_loaded
+    def get_info(self) -> dict:
+        """Get model information"""
+        return {
+            "model_name": self.model_name,
+            "device": self.device,
+            "is_loaded": self._is_loaded
+        }
+class BLIPModel(CaptionModel):
+    """
+    BLIP (Bootstrapping Language-Image Pre-training) model
+    Fast and efficient model for image captioning
+    """
+    def __init__(self, device: str = "cuda"):
+        """Initialize BLIP model"""
+        super().__init__(model_config.BLIP_MODEL_NAME, device)
+        self.max_length = model_config.BLIP_MAX_LENGTH
+        self.num_beams = model_config.BLIP_NUM_BEAMS
+    def load(self) -> bool:
+        """
+        Load BLIP model and processor
+        Returns:
+            bool: True if successful
+        """
+        try:
+            print(f"Loading BLIP model on {self.device}...")
+            # Load processor
+            self.processor = BlipProcessor.from_pretrained(
+                self.model_name,
+                cache_dir=model_config.MODEL_CACHE_DIR
+            )
+            # Load model
+            self.model = BlipForConditionalGeneration.from_pretrained(
+                self.model_name,
+                cache_dir=model_config.MODEL_CACHE_DIR,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+            ).to(self.device)
+            # Set to evaluation mode
+            self.model.eval()
+            self._is_loaded = True
+            print(f"✓ BLIP model loaded successfully on {self.device}")
+            return True
+        except Exception as e:
+            print(f"Error loading BLIP model: {e}")
+            self._is_loaded = False
+            return False
+    def generate_caption(
+        self,
+        image: Image.Image,
+        max_length: Optional[int] = None,
+        num_beams: Optional[int] = None
+    ) -> str:
+        """
+        Generate caption using BLIP
+        Args:
+            image: PIL Image
+            max_length: Maximum caption length
+            num_beams: Number of beams for beam search
+        Returns:
+            str: Generated caption
+        Raises:
+            CaptionModelError: If generation fails
+        """
+        if not self._is_loaded:
+            raise CaptionModelError("BLIP model not loaded")
+        try:
+            # Use default values if not provided
+            max_length = max_length or self.max_length
+            num_beams = num_beams or self.num_beams
+            # Preprocess image
+            inputs = self.processor(
+                images=image,
+                return_tensors="pt"
+            ).to(self.device)
+            # Generate caption
+            with torch.no_grad():
+                output_ids = self.model.generate(
+                    **inputs,
+                    max_length=max_length,
+                    num_beams=num_beams,
+                    early_stopping=True
+                )
+            # Decode caption
+            caption = self.processor.decode(
+                output_ids[0],
+                skip_special_tokens=True
+            )
+            return caption.strip()
+        except Exception as e:
+            raise CaptionModelError(f"BLIP caption generation failed: {e}")
+class GITModel(CaptionModel):
+    """
+    GIT (Generative Image-to-text Transformer) model
+    More detailed and accurate captions compared to BLIP
+    """
+    def __init__(self, device: str = "cuda"):
+        """Initialize GIT model"""
+        super().__init__(model_config.GIT_MODEL_NAME, device)
+        self.max_length = model_config.GIT_MAX_LENGTH
+        self.num_beams = model_config.GIT_NUM_BEAMS
+    def load(self) -> bool:
+        """
+        Load GIT model and processor
+        Returns:
+            bool: True if successful
+        """
+        try:
+            print(f"Loading GIT model on {self.device}...")
+            # Load processor
+            self.processor = AutoProcessor.from_pretrained(
+                self.model_name,
+                cache_dir=model_config.MODEL_CACHE_DIR
+            )
+            # Load model
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                cache_dir=model_config.MODEL_CACHE_DIR,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+            ).to(self.device)
+            # Set to evaluation mode
+            self.model.eval()
+            self._is_loaded = True
+            print(f"✓ GIT model loaded successfully on {self.device}")
+            return True
+        except Exception as e:
+            print(f"Error loading GIT model: {e}")
+            self._is_loaded = False
+            return False
+    def generate_caption(
+        self,
+        image: Image.Image,
+        max_length: Optional[int] = None,
+        num_beams: Optional[int] = None
+    ) -> str:
+        """
+        Generate caption using GIT
+        Args:
+            image: PIL Image
+            max_length: Maximum caption length
+            num_beams: Number of beams for beam search
+        Returns:
+            str: Generated caption
+        Raises:
+            CaptionModelError: If generation fails
+        """
+        if not self._is_loaded:
+            raise CaptionModelError("GIT model not loaded")
+        try:
+            # Use default values if not provided
+            max_length = max_length or self.max_length
+            num_beams = num_beams or self.num_beams
+            # Preprocess image
+            inputs = self.processor(
+                images=image,
+                return_tensors="pt"
+            ).to(self.device)
+            # Generate caption
+            with torch.no_grad():
+                output_ids = self.model.generate(
+                    pixel_values=inputs.pixel_values,
+                    max_length=max_length,
+                    num_beams=num_beams,
+                    early_stopping=True
+                )
+            # Decode caption
+            caption = self.processor.batch_decode(
+                output_ids,
+                skip_special_tokens=True
+            )[0]
+            return caption.strip()
+        except Exception as e:
+            raise CaptionModelError(f"GIT caption generation failed: {e}")
+class CaptionModelManager:
+    """
+    Manager for both BLIP and GIT models
+    Provides unified interface and handles model lifecycle
+    """
+    def __init__(self, device: Optional[str] = None):
+        """
+        Initialize model manager
+        Args:
+            device: Device to use (cuda/cpu), auto-detects if None
+        """
+        self.device = device or model_config.DEVICE
+        # Initialize models
+        self.blip_model = BLIPModel(self.device)
+        self.git_model = GITModel(self.device)
+        # Track which models are loaded
+        self._loaded_models = set()
+    def load_all_models(self) -> Tuple[bool, bool]:
+        """
+        Load both models
+        Returns:
+            Tuple[bool, bool]: (blip_success, git_success)
+        """
+        blip_success = self.blip_model.load()
+        if blip_success:
+            self._loaded_models.add("blip")
+        git_success = self.git_model.load()
+        if git_success:
+            self._loaded_models.add("git")
+        return blip_success, git_success
+    def load_model(self, model_name: str) -> bool:
+        """
+        Load specific model
+        Args:
+            model_name: Model to load ("blip" or "git")
+        Returns:
+            bool: True if successful
+        """
+        if model_name.lower() == "blip":
+            success = self.blip_model.load()
+            if success:
+                self._loaded_models.add("blip")
+            return success
+        elif model_name.lower() == "git":
+            success = self.git_model.load()
+            if success:
+                self._loaded_models.add("git")
+            return success
+        else:
+            raise ValueError(f"Unknown model: {model_name}")
+    def generate_captions(
+        self,
+        image: Image.Image
+    ) -> Dict[str, str]:
+        """
+        Generate captions from all loaded models
+        Args:
+            image: PIL Image
+        Returns:
+            Dict[str, str]: Captions from each model
+        """
+        captions = {}
+        if "blip" in self._loaded_models:
+            try:
+                captions["blip"] = self.blip_model.generate_caption(image)
+            except Exception as e:
+                captions["blip"] = f"Error: {str(e)}"
+        if "git" in self._loaded_models:
+            try:
+                captions["git"] = self.git_model.generate_caption(image)
+            except Exception as e:
+                captions["git"] = f"Error: {str(e)}"
+        return captions
+    def unload_all_models(self) -> None:
+        """Unload all models from memory"""
+        self.blip_model.unload()
+        self.git_model.unload()
+        self._loaded_models.clear()
+    def get_status(self) -> dict:
+        """Get status of all models"""
+        return {
+            "device": self.device,
+            "blip": {
+                "loaded": self.blip_model.is_loaded(),
+                "info": self.blip_model.get_info()
+            },
+            "git": {
+                "loaded": self.git_model.is_loaded(),
+                "info": self.git_model.get_info()
+            },
+            "loaded_models": list(self._loaded_models)
+        }
+# Singleton instance
+_model_manager = None
+def get_model_manager() -> CaptionModelManager:
+    """Get singleton CaptionModelManager instance"""
+    global _model_manager
+    if _model_manager is None:
+        _model_manager = CaptionModelManager()
+    return _model_manager
+if __name__ == "__main__":
+    # Test the caption models
+    print("=" * 60)
+    print("CAPTION MODELS - TEST MODE")
+    print("=" * 60)
+    # Initialize manager
+    manager = CaptionModelManager()
+    print(f"\n✓ Model manager initialized")
+    print(f"  Device: {manager.device}")
+    print("\n" + "=" * 60)
+    print("Loading models (this may take a few minutes)...")
+    print("=" * 60)
+    # Load models
+    blip_success, git_success = manager.load_all_models()
+    print(f"\nBLIP: {'✓ Loaded' if blip_success else '✗ Failed'}")
+    print(f"GIT: {'✓ Loaded' if git_success else '✗ Failed'}")
+    print("\n" + "=" * 60)
+    print("Model Status:")
+    print("=" * 60)
+    status = manager.get_status()
+    for key, value in status.items():
+        if isinstance(value, dict):
+            print(f"{key}:")
+            for k, v in value.items():
+                print(f"  {k}: {v}")
+        else:
+            print(f"{key}: {value}")
+    print("\n" + "=" * 60)
+    print("✓ Caption models test complete")
+    print("=" * 60)
+    print("\nTo test caption generation, provide a test image:")
+    print("  from PIL import Image")
+    print("  img = Image.open('your_image.jpg')")
+    print("  captions = manager.generate_captions(img)")
+    print("  print(captions)")

src/models/style_model.py ADDED Viewed

	@@ -0,0 +1,361 @@

+"""
+Style Model Module
+Handles caption styling using Groq API with fallback mechanisms.
+Applies different writing styles to generated captions.
+"""
+import time
+from typing import Optional
+from groq import Groq
+import requests
+from config import groq_config, style_config
+class StyleModelError(Exception):
+    """Custom exception for style model errors"""
+    pass
+class StyleModel:
+    """
+    Caption styling using Groq LLM API
+    Features:
+    - Multiple style options
+    - Automatic retry logic
+    - Fallback to rule-based styling
+    - Rate limiting handling
+    """
+    def __init__(self, api_key: Optional[str] = None):
+        """
+        Initialize style model
+        Args:
+            api_key: Groq API key (uses config if not provided)
+        """
+        self.api_key = api_key or groq_config.API_KEY
+        self.model_name = groq_config.MODEL_NAME
+        self.max_tokens = groq_config.MAX_TOKENS
+        self.temperature = groq_config.TEMPERATURE
+        self.timeout = groq_config.TIMEOUT_SECONDS
+        # Initialize Groq client
+        if self.api_key:
+            try:
+                self.client = Groq(
+                    api_key=self.api_key
+                )
+                self._api_available = True
+                _ = self.client.models.list()
+            except Exception as e:
+                print(f"Warning: Groq client initialization failed: {e}")
+                print(f"Attempting alternative initialization...")
+                try:
+                    # Alternative: Create client without extra params
+                    import groq
+                    self.client = groq.Client(api_key=self.api_key)
+                    self._api_available = True
+                except Exception as e2:
+                    print(f"Alternative initialization also failed: {e2}")
+                    self.client = None
+                    self._api_available = False
+        else:
+            print("Warning: No Groq API key provided")
+            self.client = None
+            self._api_available = False
+        # Retry configuration
+        self.max_retries = groq_config.MAX_RETRIES
+        self.retry_delay = groq_config.RETRY_DELAY_SECONDS
+    def style_caption(
+        self,
+        caption: str,
+        style: str = "Professional"
+    ) -> str:
+        """
+        Apply style to caption
+        Args:
+            caption: Original caption
+            style: Style to apply
+        Returns:
+            str: Styled caption
+        """
+        # If "None" style or no API, return original
+        if style == "None" or not self._api_available:
+            if style != "None":
+                # Use fallback styling if API unavailable
+                return self._fallback_style(caption, style)
+            return caption
+        # Try API styling with retries
+        for attempt in range(self.max_retries):
+            try:
+                styled_caption = self._style_with_api(caption, style)
+                return styled_caption
+            except Exception as e:
+                print(f"API styling attempt {attempt + 1} failed: {e}")
+                # If last attempt, use fallback
+                if attempt == self.max_retries - 1:
+                    print(f"Using fallback styling for: {style}")
+                    return self._fallback_style(caption, style)
+                # Wait before retry
+                time.sleep(self.retry_delay)
+        # Fallback if all retries failed
+        return self._fallback_style(caption, style)
+    def _style_with_api(self, caption: str, style: str) -> str:
+        """
+        Style caption using Groq API
+        Args:
+            caption: Original caption
+            style: Style to apply
+        Returns:
+            str: Styled caption
+        Raises:
+            StyleModelError: If API call fails
+        """
+        if not self._api_available:
+            raise StyleModelError("API not available")
+        # Get style prompt
+        style_prompt = style_config.STYLES.get(
+            style,
+            style_config.STYLES[style_config.DEFAULT_STYLE]
+        )
+        # Construct messages
+        messages = [
+            {
+                "role": "system",
+                "content": "You are an expert at rewriting image captions in different styles. Keep the core meaning but adapt the tone and style as requested. Be concise."
+            },
+            {
+                "role": "user",
+                "content": f"{style_prompt}\n\nOriginal caption: {caption}\n\nStyled caption:"
+            }
+        ]
+        try:
+            # Make API call
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=messages,
+                max_tokens=self.max_tokens,
+                temperature=self.temperature,
+                top_p=groq_config.TOP_P,
+                timeout=self.timeout
+            )
+            # Extract styled caption
+            styled_caption = response.choices[0].message.content.strip()
+            # Clean up common artifacts
+            styled_caption = self._clean_response(styled_caption)
+            return styled_caption
+        except requests.exceptions.Timeout:
+            raise StyleModelError("API request timed out")
+        except requests.exceptions.RequestException as e:
+            raise StyleModelError(f"API request failed: {e}")
+        except Exception as e:
+            raise StyleModelError(f"Unexpected error: {e}")
+    def _fallback_style(self, caption: str, style: str) -> str:
+        """
+        Apply rule-based styling as fallback
+        Args:
+            caption: Original caption
+            style: Style to apply
+        Returns:
+            str: Styled caption using templates
+        """
+        template = style_config.FALLBACK_TEMPLATES.get(
+            style,
+            style_config.FALLBACK_TEMPLATES["Professional"]
+        )
+        return template.format(caption=caption)
+    def _clean_response(self, text: str) -> str:
+        """
+        Clean up API response
+        Args:
+            text: Raw response text
+        Returns:
+            str: Cleaned text
+        """
+        # Remove common prefixes
+        prefixes = [
+            "Styled caption:",
+            "Caption:",
+            "Here's the styled caption:",
+            "Here is the caption:",
+        ]
+        for prefix in prefixes:
+            if text.lower().startswith(prefix.lower()):
+                text = text[len(prefix):].strip()
+        # Remove quotes if the entire text is quoted
+        if (text.startswith('"') and text.endswith('"')) or \
+           (text.startswith("'") and text.endswith("'")):
+            text = text[1:-1]
+        return text.strip()
+    def batch_style_captions(
+        self,
+        captions: dict,
+        style: str = "Professional"
+    ) -> dict:
+        """
+        Style multiple captions at once
+        Args:
+            captions: Dictionary of {model_name: caption}
+            style: Style to apply
+        Returns:
+            dict: Dictionary of {model_name: styled_caption}
+        """
+        styled_captions = {}
+        for model_name, caption in captions.items():
+            try:
+                styled_caption = self.style_caption(caption, style)
+                styled_captions[model_name] = styled_caption
+            except Exception as e:
+                print(f"Error styling {model_name} caption: {e}")
+                # Use original caption on error
+                styled_captions[model_name] = caption
+        return styled_captions
+    def is_api_available(self) -> bool:
+        """Check if API is available"""
+        return self._api_available
+    def test_connection(self) -> bool:
+        """
+        Test API connection
+        Returns:
+            bool: True if API is working
+        """
+        if not self._api_available:
+            return False
+        try:
+            # Simple test call
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[
+                    {"role": "user", "content": "Hello"}
+                ],
+                max_tokens=10,
+                timeout=5
+            )
+            return True
+        except Exception as e:
+            print(f"API connection test failed: {e}")
+            return False
+    def get_available_styles(self) -> list:
+        """Get list of available styles"""
+        return list(style_config.STYLES.keys())
+    def get_info(self) -> dict:
+        """Get model information"""
+        return {
+            "model_name": self.model_name,
+            "api_available": self._api_available,
+            "max_tokens": self.max_tokens,
+            "temperature": self.temperature,
+            "available_styles": self.get_available_styles()
+        }
+# Singleton instance
+_style_model = None
+def get_style_model() -> StyleModel:
+    """Get singleton StyleModel instance"""
+    global _style_model
+    if _style_model is None:
+        _style_model = StyleModel()
+    return _style_model
+if __name__ == "__main__":
+    # Manual smoke test: builds a StyleModel and prints what it can do
+    banner = "=" * 60
+    print(banner)
+    print("STYLE MODEL - TEST MODE")
+    print(banner)
+    # Build a fresh model instance for the test run
+    style_model = StyleModel()
+    print("\n✓ Style model initialized")
+    print(f"  API Available: {style_model.is_api_available()}")
+    print(f"  Model: {style_model.model_name}")
+    # Dump the model info, expanding list values one item per line
+    print("\nModel Info:")
+    for field_name, field_value in style_model.get_info().items():
+        if not isinstance(field_value, list):
+            print(f"  {field_name}: {field_value}")
+            continue
+        print(f"  {field_name}:")
+        for element in field_value:
+            print(f"    - {element}")
+    # Shared fixtures for both the API path and the fallback path
+    sample_caption = "A cat sitting on a windowsill looking outside"
+    sample_styles = ("Professional", "Creative", "Social Media")
+    if not style_model.is_api_available():
+        print("\n⚠️  API not available, testing fallback styling:")
+        for style_name in sample_styles:
+            restyled = style_model.style_caption(sample_caption, style_name)
+            print(f"\n  {style_name}: {restyled}")
+    else:
+        print("\nTesting API connection...")
+        connection_ok = style_model.test_connection()
+        status = '✓ Success' if connection_ok else '✗ Failed'
+        print(f"  Connection: {status}")
+        if connection_ok:
+            # Only style captions once the connection is known to work
+            print("\nTesting caption styling:")
+            for style_name in sample_styles:
+                print(f"\n  {style_name}:")
+                try:
+                    restyled = style_model.style_caption(sample_caption, style_name)
+                    print(f"    Original: {sample_caption}")
+                    print(f"    Styled: {restyled}")
+                except Exception as exc:
+                    print(f"    Error: {exc}")
+    print("\n" + banner)
+    print("✓ Style model test complete")
+    print(banner)

src/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""
+Utils Package
+Provides utility functions for image processing, caching, and analytics.
+"""
+from .image_processor import (
+    ImageProcessor,
+    ImageProcessingError,
+    get_image_processor,
+    validate_image,
+    preprocess_image,
+    generate_image_hash
+)
+from .cache_manager import (
+    CacheManager,
+    CaptionCache,
+    get_cache_manager,
+    get_caption_cache
+)
+from .analytics import (
+    AnalyticsManager,
+    get_analytics_manager,
+    record_generation,
+    get_stats,
+    get_summary,
+    get_display_stats
+)
+# Public API of the utils package: every name imported above is re-exported here,
+# grouped by the submodule it comes from.
+__all__ = [
+    # Image Processing
+    "ImageProcessor",
+    "ImageProcessingError",
+    "get_image_processor",
+    "validate_image",
+    "preprocess_image",
+    "generate_image_hash",
+    # Cache Management
+    "CacheManager",
+    "CaptionCache",
+    "get_cache_manager",
+    "get_caption_cache",
+    # Analytics
+    "AnalyticsManager",
+    "get_analytics_manager",
+    "record_generation",
+    "get_stats",
+    "get_summary",
+    "get_display_stats",
+]

src/utils/analytics.py ADDED Viewed

	@@ -0,0 +1,373 @@

+"""
+Analytics Module
+Tracks usage statistics and performance metrics for the caption generator.
+Provides insights into model usage, processing times, and popular styles.
+"""
+import json
+import threading
+from pathlib import Path
+from typing import Dict, Optional
+from datetime import datetime
+from dataclasses import dataclass, asdict
+from config import analytics_config, style_config
+@dataclass
+class AnalyticsData:
+    """
+    Container for analytics data
+    The two usage dictionaries default to None (dataclasses forbid mutable
+    defaults) and are filled with zeroed counters in __post_init__.
+    """
+    # Number of successfully generated captions
+    total_captions: int = 0
+    # Per-style counters; None means "initialize from style_config.STYLES"
+    style_usage: Optional[Dict[str, int]] = None
+    # Mean of total_processing_time over total_captions, in seconds
+    avg_processing_time: float = 0.0
+    total_processing_time: float = 0.0
+    # Per-model counters; None means "initialize with blip/git at zero"
+    model_usage: Optional[Dict[str, int]] = None
+    # Number of failed generations
+    error_count: int = 0
+    # ISO-8601 timestamp of the last save, or None if never saved
+    last_updated: Optional[str] = None
+    def __post_init__(self):
+        """Replace the None sentinels with fresh zeroed counter dictionaries"""
+        if self.style_usage is None:
+            self.style_usage = {style: 0 for style in style_config.STYLES.keys()}
+        if self.model_usage is None:
+            self.model_usage = {"blip": 0, "git": 0}
+    def to_dict(self) -> dict:
+        """Convert to dictionary"""
+        return asdict(self)
+class AnalyticsManager:
+    """
+    Thread-safe analytics manager for tracking usage metrics
+    Features:
+    - Real-time metric tracking
+    - Persistent storage (JSON file, rewritten after each recording call)
+    - Thread-safe operations (every public method holds a re-entrant lock)
+    - Automatic calculations
+    """
+    def __init__(self, storage_path: Optional[Path] = None):
+        """
+        Initialize analytics manager
+        Args:
+            storage_path: Path to analytics JSON file
+                (default: analytics_config.ANALYTICS_FILE)
+        """
+        # Path() makes plain string paths work as well
+        self.storage_path = Path(storage_path or analytics_config.ANALYTICS_FILE)
+        # Re-entrant because public methods call each other while holding it
+        self._lock = threading.RLock()
+        # Load existing data or initialize new
+        self.data = self._load_data()
+    def _load_data(self) -> AnalyticsData:
+        """
+        Load analytics data from file
+        Returns:
+            AnalyticsData: Loaded data, or fresh data if the file is missing
+            or cannot be parsed
+        """
+        if not self.storage_path.exists():
+            return AnalyticsData()
+        try:
+            with open(self.storage_path, 'r', encoding='utf-8') as f:
+                data_dict = json.load(f)
+            return AnalyticsData(**data_dict)
+        except Exception as e:
+            # Best effort: unreadable statistics must not prevent start-up
+            print(f"Warning: Failed to load analytics: {e}")
+            return AnalyticsData()
+    def _save_data(self) -> bool:
+        """
+        Save analytics data to file
+        The JSON is written to a temporary sibling file and then moved over
+        the real file, so an interrupted write cannot truncate existing stats.
+        Returns:
+            bool: True if successful
+        """
+        try:
+            # Ensure directory exists
+            self.storage_path.parent.mkdir(parents=True, exist_ok=True)
+            # Update timestamp
+            self.data.last_updated = datetime.now().isoformat()
+            # Write to a temp file first, then swap it in
+            tmp_path = self.storage_path.with_name(self.storage_path.name + ".tmp")
+            with open(tmp_path, 'w', encoding='utf-8') as f:
+                json.dump(self.data.to_dict(), f, indent=4)
+            tmp_path.replace(self.storage_path)
+            return True
+        except Exception as e:
+            print(f"Error saving analytics: {e}")
+            return False
+    def _apply_record(
+        self,
+        model_name: str,
+        style: str,
+        processing_time: float,
+        success: bool
+    ) -> None:
+        """Update the in-memory counters for one event (caller holds the lock)"""
+        if not success:
+            self.data.error_count += 1
+            return
+        self.data.total_captions += 1
+        # Unknown styles/models are not counted individually
+        if style in self.data.style_usage:
+            self.data.style_usage[style] += 1
+        model_key = model_name.lower()
+        if model_key in self.data.model_usage:
+            self.data.model_usage[model_key] += 1
+        self.data.total_processing_time += processing_time
+        self.data.avg_processing_time = (
+            self.data.total_processing_time / self.data.total_captions
+        )
+    def record_caption_generation(
+        self,
+        model_name: str,
+        style: str,
+        processing_time: float,
+        success: bool = True
+    ) -> None:
+        """
+        Record a caption generation event
+        Args:
+            model_name: Name of the model used (blip/git)
+            style: Style applied
+            processing_time: Time taken in seconds
+            success: Whether generation was successful
+        """
+        with self._lock:
+            self._apply_record(model_name, style, processing_time, success)
+            # Save to disk
+            self._save_data()
+    def record_batch_generation(
+        self,
+        generations: list[dict]
+    ) -> None:
+        """
+        Record multiple caption generations at once
+        The file is written once for the whole batch rather than per record.
+        Args:
+            generations: List of generation records
+                Each record: {model_name, style, processing_time, success}
+        """
+        with self._lock:
+            recorded = False
+            for gen in generations:
+                self._apply_record(
+                    model_name=gen.get("model_name", "unknown"),
+                    style=gen.get("style", "None"),
+                    processing_time=gen.get("processing_time", 0.0),
+                    success=gen.get("success", True)
+                )
+                recorded = True
+            # Nothing to persist for an empty batch
+            if recorded:
+                self._save_data()
+    def get_stats(self) -> dict:
+        """
+        Get current statistics
+        Returns:
+            dict: Current analytics data
+        """
+        with self._lock:
+            return self.data.to_dict()
+    def get_summary(self) -> dict:
+        """
+        Get formatted summary of analytics
+        Returns:
+            dict: Human-readable summary (percentages are relative to the
+            number of successful captions)
+        """
+        with self._lock:
+            total = self.data.total_captions
+            errors = self.data.error_count
+            # Percentages stay empty until at least one caption was recorded
+            style_percentages = {}
+            model_percentages = {}
+            if total > 0:
+                style_percentages = {
+                    style: round((count / total) * 100, 1)
+                    for style, count in self.data.style_usage.items()
+                }
+                model_percentages = {
+                    model: round((count / total) * 100, 1)
+                    for model, count in self.data.model_usage.items()
+                }
+            # Most popular style; "None" when no style has been used yet
+            popular_style, popular_count = max(
+                self.data.style_usage.items(),
+                key=lambda item: item[1],
+                default=("None", 0)
+            )
+            if popular_count == 0:
+                popular_style = "None"
+            attempts = total + errors
+            error_rate = (errors / attempts * 100) if attempts > 0 else 0
+            return {
+                "total_captions": total,
+                "avg_processing_time": round(self.data.avg_processing_time, 2),
+                "error_rate": round(error_rate, 2),
+                "most_popular_style": popular_style,
+                "style_distribution": style_percentages,
+                "model_distribution": model_percentages,
+                "last_updated": self.data.last_updated
+            }
+    def get_display_stats(self) -> str:
+        """
+        Get formatted stats for UI display
+        Returns:
+            str: Formatted statistics string
+        """
+        with self._lock:
+            summary = self.get_summary()
+            stats_text = (
+                f"📊 Total Captions: {summary['total_captions']} | "
+                f"⚡ Avg Time: {summary['avg_processing_time']}s | "
+                f"🎨 Popular Style: {summary['most_popular_style']}"
+            )
+            return stats_text
+    def reset_stats(self) -> bool:
+        """
+        Reset all statistics
+        Returns:
+            bool: True if successful
+        """
+        with self._lock:
+            self.data = AnalyticsData()
+            return self._save_data()
+    def export_stats(self, export_path: Optional[Path] = None) -> bool:
+        """
+        Export statistics to a file
+        Args:
+            export_path: Path to export file (default: timestamped file next
+                to the analytics file)
+        Returns:
+            bool: True if successful
+        """
+        with self._lock:
+            if export_path is None:
+                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                export_path = self.storage_path.parent / f"analytics_export_{timestamp}.json"
+            try:
+                export_path = Path(export_path)
+                # Build the payload first so a failure leaves no empty file
+                export_data = {
+                    "exported_at": datetime.now().isoformat(),
+                    "statistics": self.data.to_dict(),
+                    "summary": self.get_summary()
+                }
+                # The target directory may not exist before the first save
+                export_path.parent.mkdir(parents=True, exist_ok=True)
+                with open(export_path, 'w', encoding='utf-8') as f:
+                    json.dump(export_data, f, indent=4)
+                return True
+            except Exception as e:
+                print(f"Error exporting analytics: {e}")
+                return False
+# Lazily created shared manager and the lock guarding its creation
+_analytics_manager = None
+_manager_lock = threading.Lock()
+def get_analytics_manager() -> AnalyticsManager:
+    """Return the shared AnalyticsManager, creating it on first use"""
+    global _analytics_manager
+    if _analytics_manager is not None:
+        return _analytics_manager
+    with _manager_lock:
+        # Re-check under the lock: another thread may have created it meanwhile
+        if _analytics_manager is None:
+            _analytics_manager = AnalyticsManager()
+        return _analytics_manager
+# Convenience functions
+def record_generation(
+    model_name: str,
+    style: str,
+    processing_time: float,
+    success: bool = True
+) -> None:
+    """Forward one caption generation event to the shared AnalyticsManager"""
+    manager = get_analytics_manager()
+    manager.record_caption_generation(
+        model_name=model_name,
+        style=style,
+        processing_time=processing_time,
+        success=success,
+    )
+def get_stats() -> dict:
+    """Return the raw statistics held by the shared AnalyticsManager"""
+    manager = get_analytics_manager()
+    return manager.get_stats()
+def get_summary() -> dict:
+    """Return the summary computed by the shared AnalyticsManager"""
+    manager = get_analytics_manager()
+    return manager.get_summary()
+def get_display_stats() -> str:
+    """Return the one-line display string from the shared AnalyticsManager"""
+    manager = get_analytics_manager()
+    return manager.get_display_stats()
+if __name__ == "__main__":
+    # Manual smoke test against a throwaway analytics file
+    rule = "=" * 60
+    print(rule)
+    print("ANALYTICS MANAGER - TEST MODE")
+    print(rule)
+    # Use a dedicated test path so real statistics are never touched
+    test_path = Path("cache/test_analytics.json")
+    analytics = AnalyticsManager(storage_path=test_path)
+    print("\n1. Initial state:")
+    print(f"   {analytics.get_display_stats()}")
+    print("\n2. Recording test generations:")
+    # (model_name, style, processing_time, success)
+    sample_events = (
+        ("blip", "Professional", 2.5, True),
+        ("git", "Creative", 3.2, True),
+        ("blip", "Professional", 2.1, True),
+        ("git", "Social Media", 2.8, True),
+        ("blip", "Technical", 2.3, False),
+    )
+    for event in sample_events:
+        analytics.record_caption_generation(*event)
+    print("   Recorded 5 generations (4 success, 1 error)")
+    # Both reports share the same layout: nested dicts are expanded one level
+    reports = (
+        ("\n3. Current statistics:", analytics.get_stats),
+        ("\n4. Summary:", analytics.get_summary),
+    )
+    for title, fetch in reports:
+        print(title)
+        for key, value in fetch().items():
+            if not isinstance(value, dict):
+                print(f"   {key}: {value}")
+                continue
+            print(f"   {key}:")
+            for k, v in value.items():
+                print(f"      {k}: {v}")
+    print("\n5. Display format:")
+    print(f"   {analytics.get_display_stats()}")
+    print("\n6. File saved to:")
+    print(f"   {test_path}")
+    print("\n" + rule)
+    print("✓ Analytics manager tests complete")
+    print(rule)
+    # Remove the throwaway file again
+    if test_path.exists():
+        test_path.unlink()
+        print("\n✓ Test file cleaned up")

src/utils/cache_manager.py ADDED Viewed

	@@ -0,0 +1,403 @@

+"""
+Cache Management Module
+Implements intelligent caching for caption generation results.
+Uses LRU (Least Recently Used) eviction policy for memory efficiency.
+"""
+import time
+import json
+from typing import Optional, Any, Dict
+from collections import OrderedDict
+from dataclasses import dataclass, asdict
+from datetime import datetime
+import threading
+from config import cache_config
+@dataclass
+class CacheEntry:
+    """
+    Represents a single cache entry with metadata
+    last_accessed defaults to None and is set to the creation timestamp in
+    __post_init__ when the caller does not supply it.
+    """
+    key: str
+    value: Any
+    # Time the value was stored (seconds, as returned by time.time() in CacheManager)
+    timestamp: float
+    # Number of successful reads of this entry
+    access_count: int = 0
+    # Time of the most recent read or write; None means "same as timestamp"
+    last_accessed: Optional[float] = None
+    def __post_init__(self):
+        """Default last_accessed to the creation timestamp"""
+        if self.last_accessed is None:
+            self.last_accessed = self.timestamp
+    def to_dict(self) -> dict:
+        """Convert to dictionary"""
+        return asdict(self)
+class CacheManager:
+    """
+    Thread-safe LRU cache manager for caption results
+    Features:
+    - Automatic expiration based on TTL
+    - LRU eviction when max size reached
+    - Thread-safe operations (public methods hold a re-entrant lock)
+    - Access statistics
+    - Memory-efficient storage
+    """
+    def __init__(
+        self,
+        max_size: int = cache_config.MAX_CACHE_SIZE,
+        ttl_seconds: int = cache_config.CACHE_TTL_SECONDS
+    ):
+        """
+        Initialize cache manager
+        Args:
+            max_size: Maximum number of cached items (<= 0 disables storing)
+            ttl_seconds: Time to live for cache entries
+        """
+        self.max_size = max_size
+        self.ttl_seconds = ttl_seconds
+        # OrderedDict keeps entries in recency order: first = least recently used
+        self._cache: OrderedDict[str, CacheEntry] = OrderedDict()
+        # Re-entrant because get_info() calls get_stats() while holding it
+        self._lock = threading.RLock()
+        # Statistics
+        self._stats = {
+            "hits": 0,
+            "misses": 0,
+            "evictions": 0,
+            "expirations": 0,
+            "total_sets": 0
+        }
+    def get(self, key: str) -> Optional[Any]:
+        """
+        Retrieve value from cache
+        Args:
+            key: Cache key
+        Returns:
+            Optional[Any]: Cached value or None if not found/expired
+        """
+        with self._lock:
+            entry = self._cache.get(key)
+            if entry is None:
+                self._stats["misses"] += 1
+                return None
+            # An expired entry counts as both an expiration and a miss
+            if self._is_expired(entry):
+                self._remove_entry(key)
+                self._stats["expirations"] += 1
+                self._stats["misses"] += 1
+                return None
+            # Update access statistics
+            entry.access_count += 1
+            entry.last_accessed = time.time()
+            # Move to end (most recently used)
+            self._cache.move_to_end(key)
+            self._stats["hits"] += 1
+            return entry.value
+    def set(self, key: str, value: Any) -> bool:
+        """
+        Store value in cache
+        Args:
+            key: Cache key
+            value: Value to cache
+        Returns:
+            bool: True if successfully cached, False when the cache has no
+            capacity (max_size <= 0)
+        """
+        with self._lock:
+            # A cache without capacity cannot hold anything
+            if self.max_size <= 0:
+                return False
+            current_time = time.time()
+            entry = self._cache.get(key)
+            if entry is not None:
+                # Refresh the existing entry; this also restarts its TTL
+                entry.value = value
+                entry.timestamp = current_time
+                entry.last_accessed = current_time
+                self._cache.move_to_end(key)
+            else:
+                # Loop (not a single eviction) so a max_size lowered at
+                # runtime is honoured as well
+                while len(self._cache) >= self.max_size:
+                    self._evict_oldest()
+                self._cache[key] = CacheEntry(
+                    key=key,
+                    value=value,
+                    timestamp=current_time,
+                    last_accessed=current_time
+                )
+            self._stats["total_sets"] += 1
+            return True
+    def delete(self, key: str) -> bool:
+        """
+        Remove entry from cache
+        Args:
+            key: Cache key
+        Returns:
+            bool: True if entry was deleted
+        """
+        with self._lock:
+            if key in self._cache:
+                del self._cache[key]
+                return True
+            return False
+    def clear(self) -> None:
+        """Clear all cache entries (statistics are kept)"""
+        with self._lock:
+            self._cache.clear()
+    def _is_expired(self, entry: CacheEntry) -> bool:
+        """Check if cache entry has expired"""
+        return (time.time() - entry.timestamp) > self.ttl_seconds
+    def _remove_entry(self, key: str) -> None:
+        """Remove entry without stats update"""
+        self._cache.pop(key, None)
+    def _evict_oldest(self) -> None:
+        """Evict least recently used entry"""
+        if self._cache:
+            # last=False pops the first item, i.e. the least recently used
+            self._cache.popitem(last=False)
+            self._stats["evictions"] += 1
+    def cleanup_expired(self) -> int:
+        """
+        Remove all expired entries
+        Returns:
+            int: Number of entries removed
+        """
+        with self._lock:
+            # Collect first: the dict must not change size while iterating
+            expired_keys = [
+                key for key, entry in self._cache.items()
+                if self._is_expired(entry)
+            ]
+            for key in expired_keys:
+                del self._cache[key]
+            self._stats["expirations"] += len(expired_keys)
+            return len(expired_keys)
+    def get_stats(self) -> dict:
+        """
+        Get cache statistics
+        Returns:
+            dict: Cache statistics including hit rate (percent)
+        """
+        with self._lock:
+            total_requests = self._stats["hits"] + self._stats["misses"]
+            hit_rate = (
+                (self._stats["hits"] / total_requests * 100)
+                if total_requests > 0 else 0
+            )
+            return {
+                **self._stats,
+                "size": len(self._cache),
+                "max_size": self.max_size,
+                "hit_rate": round(hit_rate, 2),
+                "total_requests": total_requests
+            }
+    def get_info(self) -> dict:
+        """
+        Get detailed cache information
+        Returns:
+            dict: Detailed cache state
+        """
+        with self._lock:
+            now = time.time()
+            entries_info = [
+                {
+                    # Long keys are truncated for readability
+                    "key": key[:50] + "..." if len(key) > 50 else key,
+                    "age_seconds": round(now - entry.timestamp, 2),
+                    "access_count": entry.access_count,
+                    "size_estimate": len(str(entry.value))
+                }
+                for key, entry in self._cache.items()
+            ]
+            return {
+                "stats": self.get_stats(),
+                # First 10 in LRU order (least recently used first)
+                "entries": entries_info[:10],
+                "config": {
+                    "max_size": self.max_size,
+                    "ttl_seconds": self.ttl_seconds
+                }
+            }
+class CaptionCache:
+    """
+    Caption-specific front end for CacheManager
+    Entries are keyed by the combination of image hash, model name and style.
+    When caching is disabled in the configuration, lookups always miss and
+    stores are ignored.
+    """
+    def __init__(self):
+        """Create the backing CacheManager from cache_config"""
+        self.enabled = cache_config.ENABLE_CAPTION_CACHE
+        self.cache = CacheManager(
+            max_size=cache_config.MAX_CACHE_SIZE,
+            ttl_seconds=cache_config.CACHE_TTL_SECONDS
+        )
+    def get_caption(
+        self,
+        image_hash: str,
+        model_name: str,
+        style: str
+    ) -> Optional[str]:
+        """
+        Look up a previously stored caption
+        Args:
+            image_hash: Hash of the image
+            model_name: Name of the caption model
+            style: Style applied
+        Returns:
+            Optional[str]: The cached caption, or None on a miss or when
+            caching is disabled
+        """
+        if self.enabled:
+            return self.cache.get(self._generate_key(image_hash, model_name, style))
+        return None
+    def set_caption(
+        self,
+        image_hash: str,
+        model_name: str,
+        style: str,
+        caption: str
+    ) -> bool:
+        """
+        Remember a generated caption
+        Args:
+            image_hash: Hash of the image
+            model_name: Name of the caption model
+            style: Style applied
+            caption: Generated caption
+        Returns:
+            bool: Result of the underlying store, or False when caching is
+            disabled
+        """
+        if self.enabled:
+            return self.cache.set(self._generate_key(image_hash, model_name, style), caption)
+        return False
+    def _generate_key(self, image_hash: str, model_name: str, style: str) -> str:
+        """Join the three components into one colon-separated cache key"""
+        return f"{image_hash}:{model_name}:{style}"
+    def get_stats(self) -> dict:
+        """Return the statistics of the backing cache"""
+        return self.cache.get_stats()
+    def clear(self) -> None:
+        """Drop every cached caption"""
+        self.cache.clear()
+    def cleanup(self) -> int:
+        """Purge expired entries and return how many were removed"""
+        return self.cache.cleanup_expired()
+# Singleton instances, created lazily on first request
+_cache_manager = None
+_caption_cache = None
+# Guards creation so concurrent first calls cannot build two separate caches
+_singleton_lock = threading.Lock()
+def get_cache_manager() -> CacheManager:
+    """Get singleton CacheManager instance (created on first use)"""
+    global _cache_manager
+    if _cache_manager is None:
+        with _singleton_lock:
+            # Re-check under the lock: another thread may have won the race
+            if _cache_manager is None:
+                _cache_manager = CacheManager()
+    return _cache_manager
+def get_caption_cache() -> CaptionCache:
+    """Get singleton CaptionCache instance (created on first use)"""
+    global _caption_cache
+    if _caption_cache is None:
+        with _singleton_lock:
+            # Re-check under the lock: another thread may have won the race
+            if _caption_cache is None:
+                _caption_cache = CaptionCache()
+    return _caption_cache
+if __name__ == "__main__":
+    # Manual smoke test of CacheManager (takes about six seconds: it waits for the TTL)
+    rule = "=" * 60
+    print(rule)
+    print("CACHE MANAGER - TEST MODE")
+    print(rule)
+    # Tiny cache with a short TTL so eviction and expiry are easy to trigger
+    cache = CacheManager(max_size=3, ttl_seconds=5)
+    print("\n1. Testing SET operations:")
+    for idx in (1, 2, 3):
+        cache.set(f"key{idx}", f"value{idx}")
+    print("   Added 3 items")
+    print(f"   Cache size: {len(cache._cache)}")
+    print("\n2. Testing GET operations:")
+    result = cache.get("key1")
+    print(f"   Get 'key1': {result}")
+    print(f"   Stats: {cache.get_stats()}")
+    print("\n3. Testing LRU eviction:")
+    # key1 was just read, so key2 is now the least recently used entry
+    cache.set("key4", "value4")
+    print("   Added 'key4'")
+    print(f"   Cache size: {len(cache._cache)}")
+    print(f"   Keys in cache: {list(cache._cache.keys())}")
+    print("\n4. Testing TTL expiration:")
+    print("   Waiting 6 seconds for expiration...")
+    time.sleep(6)
+    expired = cache.cleanup_expired()
+    print(f"   Expired entries: {expired}")
+    print(f"   Cache size: {len(cache._cache)}")
+    print("\n5. Final stats:")
+    for stat_name, stat_value in cache.get_stats().items():
+        print(f"   {stat_name}: {stat_value}")
+    print("\n" + rule)
+    print("✓ Cache manager tests complete")
+    print(rule)

src/utils/image_processor.py ADDED Viewed

	@@ -0,0 +1,373 @@

+"""
+Image Processing Module
+Handles image validation, preprocessing, and optimization for caption generation.
+Ensures images meet model requirements while maintaining quality.
+"""
+import io
+import hashlib
+from pathlib import Path
+from typing import Tuple, Union
+from PIL import Image, ImageOps
+from config import image_config
+class ImageProcessingError(Exception):
+    """Raised when an image cannot be validated, preprocessed, or hashed"""
+class ImageProcessor:
+    """
+    Enterprise-grade image processor for caption generation pipeline
+    Responsibilities:
+    - Validate image format and size
+    - Resize and optimize images
+    - Generate cache keys
+    - Handle edge cases and errors gracefully
+    """
+    def __init__(self):
+        """Copy the limits and settings this processor uses from image_config"""
+        # Accepted formats
+        self.allowed_formats = image_config.ALLOWED_FORMATS
+        # Size limits: bytes on disk and pixel bounds per side
+        self.max_size = image_config.MAX_FILE_SIZE_BYTES
+        self.min_dimension = image_config.MIN_DIMENSION
+        self.max_dimension = image_config.MAX_DIMENSION
+        # Quality value passed to Image.save() in image_to_bytes
+        self.quality = image_config.RESIZE_QUALITY
+    def validate_image(self, image: Union[str, Path, Image.Image, bytes]) -> Tuple[bool, str]:
+        """
+        Validate image meets all requirements
+        Checks, in order: supported input type, allowed format (skipped when
+        the PIL image carries no format), pixel dimensions, payload size
+        (file size for paths, length for bytes) and that the pixel data can
+        actually be decoded. Never raises; failures are reported in the
+        returned message.
+        Args:
+            image: Image path, PIL Image, or bytes
+        Returns:
+            Tuple[bool, str]: (is_valid, error_message)
+        """
+        # Handle opened by this method; caller-supplied images are never closed
+        opened = None
+        try:
+            # Load image if path or bytes provided
+            if isinstance(image, (str, Path)):
+                img = opened = Image.open(image)
+            elif isinstance(image, bytes):
+                img = opened = Image.open(io.BytesIO(image))
+            elif isinstance(image, Image.Image):
+                img = image
+            else:
+                return False, f"Unsupported image type: {type(image)}"
+            # Check format (handle None format from Gradio)
+            # When Gradio passes PIL images with type="pil", format can be None
+            image_format = getattr(img, 'format', None)
+            if image_format is not None:
+                allowed = {fmt.upper() for fmt in self.allowed_formats}
+                if image_format.upper() not in allowed:
+                    return False, f"Unsupported format: {img.format}. Allowed: {', '.join(self.allowed_formats)}"
+            else:
+                # Format is None - likely from Gradio's PIL conversion
+                print(f"DEBUG: Image format is None (from Gradio), skipping format check")
+            # Check dimensions
+            width, height = img.size
+            if width < self.min_dimension or height < self.min_dimension:
+                return False, f"Image too small. Minimum: {self.min_dimension}x{self.min_dimension}px"
+            if width > 10000 or height > 10000:
+                return False, "Image dimensions too large (max: 10000x10000px)"
+            # Check payload size: file size for paths, length for raw bytes
+            if isinstance(image, (str, Path)):
+                payload_size = Path(image).stat().st_size
+            elif isinstance(image, bytes):
+                payload_size = len(image)
+            else:
+                payload_size = None
+            if payload_size is not None and payload_size > self.max_size:
+                max_mb = self.max_size / (1024 * 1024)
+                actual_mb = payload_size / (1024 * 1024)
+                return False, f"File too large: {actual_mb:.1f}MB (max: {max_mb}MB)"
+            # Verify integrity by decoding the pixel data; truncated or
+            # corrupt files fail here (skipped when format is None)
+            if image_format is not None:
+                img.load()
+            return True, ""
+        except Exception as e:
+            return False, f"Image validation failed: {str(e)}"
+        finally:
+            # Release the file handle of anything opened above
+            if opened is not None:
+                opened.close()
+    def preprocess_image(
+        self,
+        image: Union[str, Path, Image.Image, bytes]
+    ) -> Tuple[Image.Image, dict]:
+        """
+        Preprocess image for model input
+        Steps: validate, apply the EXIF orientation, flatten transparency onto
+        a white background / convert to RGB, then downscale if the longer side
+        exceeds max_dimension. The caller's PIL image is never modified.
+        Args:
+            image: Image path, PIL Image, or bytes
+        Returns:
+            Tuple[Image.Image, dict]: (processed_image, metadata)
+        Raises:
+            ImageProcessingError: If validation or preprocessing fails
+        """
+        try:
+            print(f"DEBUG: Preprocessing image of type: {type(image)}")
+            # Validate first
+            is_valid, error_msg = self.validate_image(image)
+            if not is_valid:
+                print(f"DEBUG: Validation failed: {error_msg}")
+                raise ImageProcessingError(error_msg)
+            # Load image; remember handles opened here so they can be closed
+            opened = None
+            if isinstance(image, (str, Path)):
+                source = opened = Image.open(image)
+            elif isinstance(image, bytes):
+                source = opened = Image.open(io.BytesIO(image))
+            elif isinstance(image, Image.Image):
+                source = image
+            else:
+                raise ImageProcessingError(f"Unsupported image type: {type(image)}")
+            try:
+                # Store original metadata
+                original_size = source.size
+                original_format = source.format if hasattr(source, 'format') else 'Unknown'
+                original_mode = source.mode
+                print(f"DEBUG: Original format: {original_format}, mode: {original_mode}, size: {original_size}")
+                # Auto-orient first, while the EXIF data is still attached;
+                # exif_transpose returns a new image, so the input is untouched
+                img = ImageOps.exif_transpose(source)
+            finally:
+                if opened is not None:
+                    opened.close()
+            # Convert to RGB if needed (handles RGBA, grayscale, etc.)
+            if img.mode != "RGB":
+                has_alpha = img.mode in ("RGBA", "LA") or (
+                    img.mode == "P" and "transparency" in img.info
+                )
+                if has_alpha:
+                    # Create white background for transparent images
+                    rgba = img if img.mode == "RGBA" else img.convert("RGBA")
+                    background = Image.new("RGB", rgba.size, (255, 255, 255))
+                    background.paste(rgba, mask=rgba.split()[-1])  # Use alpha channel as mask
+                    img = background
+                else:
+                    img = img.convert("RGB")
+            # Resize if needed
+            size_before_resize = img.size
+            if max(img.size) > self.max_dimension:
+                img = self._resize_image(img)
+            # Generate metadata
+            metadata = {
+                "original_size": original_size,
+                "original_format": original_format,
+                "original_mode": original_mode,
+                "processed_size": img.size,
+                "processed_mode": img.mode,
+                # Compare against the oriented size so a pure rotation is
+                # not reported as a resize
+                "was_resized": size_before_resize != img.size,
+                "was_converted": original_mode != img.mode
+            }
+            print(f"DEBUG: Preprocessing complete. Final size: {img.size}, mode: {img.mode}")
+            return img, metadata
+        except ImageProcessingError:
+            raise
+        except Exception as e:
+            print(f"DEBUG: Exception during preprocessing: {str(e)}")
+            # Chain the cause so the original traceback stays available
+            raise ImageProcessingError(f"Preprocessing failed: {str(e)}") from e
+    def _resize_image(self, img: Image.Image) -> Image.Image:
+        """
+        Resize image so its longer side equals max_dimension
+        With image_config.MAINTAIN_ASPECT_RATIO the shorter side is scaled
+        proportionally (never below 1 pixel); otherwise the result is a
+        max_dimension x max_dimension square.
+        Args:
+            img: PIL Image
+        Returns:
+            Image.Image: Resized image
+        """
+        width, height = img.size
+        if image_config.MAINTAIN_ASPECT_RATIO:
+            # Calculate new dimensions maintaining aspect ratio; clamp to 1
+            # because truncation can give 0 for extreme aspect ratios
+            if width > height:
+                new_width = self.max_dimension
+                new_height = max(1, int((height / width) * self.max_dimension))
+            else:
+                new_height = self.max_dimension
+                new_width = max(1, int((width / height) * self.max_dimension))
+        else:
+            new_width = self.max_dimension
+            new_height = self.max_dimension
+        # Use high-quality resampling; older Pillow releases expose the
+        # filter as Image.LANCZOS instead of Image.Resampling.LANCZOS
+        lanczos = getattr(Image, "Resampling", Image).LANCZOS
+        return img.resize((new_width, new_height), lanczos)
+    def generate_image_hash(
+        self,
+        image: Union[str, Path, Image.Image, bytes],
+        algorithm: str = "md5"
+    ) -> str:
+        """
+        Compute a hex digest identifying the image (used as cache key input)
+        Paths are hashed by file content, bytes as given, and PIL images by
+        their PNG encoding.
+        Args:
+            image: Image path, PIL Image, or bytes
+            algorithm: Hash algorithm (md5, sha256)
+        Returns:
+            str: Hexadecimal hash string
+        Raises:
+            ImageProcessingError: On unsupported input type, unsupported
+                algorithm, or any read/encode failure
+        """
+        try:
+            # Step 1: obtain the raw bytes to hash
+            if isinstance(image, Image.Image):
+                png_buffer = io.BytesIO()
+                image.save(png_buffer, format="PNG")
+                payload = png_buffer.getvalue()
+            elif isinstance(image, bytes):
+                payload = image
+            elif isinstance(image, (str, Path)):
+                with open(image, "rb") as f:
+                    payload = f.read()
+            else:
+                raise ValueError(f"Unsupported type for hashing: {type(image)}")
+            # Step 2: pick the hash constructor by name
+            supported = (("md5", hashlib.md5), ("sha256", hashlib.sha256))
+            for name, factory in supported:
+                if algorithm == name:
+                    return factory(payload).hexdigest()
+            raise ValueError(f"Unsupported hash algorithm: {algorithm}")
+        except Exception as e:
+            raise ImageProcessingError(f"Hash generation failed: {str(e)}")
+    def image_to_bytes(self, img: Image.Image, format: str = "PNG") -> bytes:
+        """
+        Convert PIL Image to bytes
+        Args:
+            img: PIL Image
+            format: Output format (PNG, JPEG)
+        Returns:
+            bytes: Image bytes
+        """
+        buffer = io.BytesIO()
+        img.save(buffer, format=format, quality=self.quality)
+        return buffer.getvalue()
+    def get_image_info(self, image: Union[str, Path, Image.Image]) -> dict:
+        """
+        Get detailed image information
+        Args:
+            image: Image path or PIL Image
+        Returns:
+            dict: Image information
+        """
+        try:
+            if isinstance(image, (str, Path)):
+                img = Image.open(image)
+                file_size = Path(image).stat().st_size
+            elif isinstance(image, Image.Image):
+                img = image
+                file_size = len(self.image_to_bytes(img))
+            else:
+                raise ValueError(f"Unsupported type: {type(image)}")
+            return {
+                "format": img.format,
+                "mode": img.mode,
+                "size": img.size,
+                "width": img.size[0],
+                "height": img.size[1],
+                "file_size": file_size,
+                "file_size_mb": file_size / (1024 * 1024),
+                "aspect_ratio": img.size[0] / img.size[1],
+                "megapixels": (img.size[0] * img.size[1]) / 1_000_000
+            }
+        except Exception as e:
+            raise ImageProcessingError(f"Failed to get image info: {str(e)}")
+# ============================================================================
+# SINGLETON INSTANCE AND CONVENIENCE FUNCTIONS
+# ============================================================================
+_image_processor = None
+def get_image_processor() -> ImageProcessor:
+    """Get singleton ImageProcessor instance"""
+    global _image_processor
+    if _image_processor is None:
+        _image_processor = ImageProcessor()
+    return _image_processor
+# Convenience wrapper functions for backward compatibility
+def validate_image(image: Union[str, Path, Image.Image, bytes]) -> Tuple[bool, str]:
+    """
+    Convenience function: Validate image using singleton processor
+    Args:
+        image: Image path, PIL Image, or bytes
+    Returns:
+        Tuple[bool, str]: (is_valid, error_message)
+    """
+    return get_image_processor().validate_image(image)
+def preprocess_image(
+    image: Union[str, Path, Image.Image, bytes]
+) -> Tuple[Image.Image, dict]:
+    """
+    Convenience function: Preprocess image using singleton processor
+    Args:
+        image: Image path, PIL Image, or bytes
+    Returns:
+        Tuple[Image.Image, dict]: (processed_image, metadata)
+    """
+    return get_image_processor().preprocess_image(image)
+def generate_image_hash(
+    image: Union[str, Path, Image.Image, bytes],
+    algorithm: str = "md5"
+) -> str:
+    """
+    Convenience function: Generate image hash using singleton processor
+    Args:
+        image: Image path, PIL Image, or bytes
+        algorithm: Hash algorithm (md5, sha256)
+    Returns:
+        str: Hexadecimal hash string
+    """
+    return get_image_processor().generate_image_hash(image, algorithm)
+if __name__ == "__main__":
+    # Test the image processor
+    print("=" * 60)
+    print("IMAGE PROCESSOR - TEST MODE")
+    print("=" * 60)
+    processor = get_image_processor()
+    print(f"✓ ImageProcessor initialized")
+    print(f"  - Max file size: {processor.max_size / (1024*1024):.1f}MB")
+    print(f"  - Max dimension: {processor.max_dimension}px")
+    print(f"  - Allowed formats: {', '.join(processor.allowed_formats)}")
+    print(f"  - Quality: {processor.quality}")
+    print("=" * 60)
+    print("Ready for testing with actual images")
+    print("=" * 60)