"""
Local AI Vision Models for Alt Text Generation (100% FREE)

Uses Hugging Face transformers to run models locally - no API costs!

Supported models:
- BLIP: good balance of speed and quality
- GIT: more detailed descriptions

(LLaVA is more capable still, but it needs far more resources and is not
wired up in this module.)
"""
import os
import io
from typing import Optional

try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    print("⚠️ Pillow not installed. Run: pip install pillow")

try:
    from transformers import BlipProcessor, BlipForConditionalGeneration
    from transformers import AutoProcessor, AutoModelForCausalLM
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("⚠️ Transformers not installed. Run: pip install transformers torch")

class LocalVisionModel:
    """
    Local AI model for generating image descriptions.
    Runs on your computer - 100% FREE with no API limits!
    """

    def __init__(self, model_name: str = "blip-base"):
        """
        Initialize the local vision model.

        Args:
            model_name: Model to use
                - "blip-base" (default): fast, good quality, ~1GB
                - "blip-large": better quality, slower, ~2GB
                - "git-base": alternative model, ~1.5GB
        """
        self.model_name = model_name
        self.enabled = False
        self.model = None
        self.processor = None
        self.model_type = None
        if not TRANSFORMERS_AVAILABLE:
            print("❌ Transformers library not available")
            print("   Install with: pip install transformers torch")
            return
        if not PIL_AVAILABLE:
            print("❌ Pillow not available")
            print("   Install with: pip install pillow")
            return
        # Only touch torch after the availability checks above; referencing it
        # any earlier would raise NameError whenever the import failed.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Load the model
        try:
            print(f"📥 Loading {model_name} model... (this may take a minute on first run)")
            if "blip" in model_name.lower():
                self._load_blip_model(model_name)
            elif "git" in model_name.lower():
                self._load_git_model()
            else:
                print(f"⚠️ Unknown model: {model_name}, defaulting to BLIP")
                self._load_blip_model("blip-base")
            self.enabled = True
            print(f"✅ {model_name} model loaded successfully on {self.device}")
        except Exception as e:
            print(f"❌ Failed to load model: {e}")
            self.enabled = False
    def _load_blip_model(self, model_name: str):
        """Load a BLIP model (recommended for most use cases)"""
        if "large" in model_name:
            model_id = "Salesforce/blip-image-captioning-large"
        else:
            model_id = "Salesforce/blip-image-captioning-base"
        self.processor = BlipProcessor.from_pretrained(model_id)
        self.model = BlipForConditionalGeneration.from_pretrained(model_id)
        self.model.to(self.device)
        self.model_type = "blip"

    def _load_git_model(self):
        """Load the GIT model (alternative to BLIP)"""
        model_id = "microsoft/git-base"
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(model_id)
        self.model.to(self.device)
        self.model_type = "git"

    def is_enabled(self) -> bool:
        """Check if the model is loaded and ready"""
        return self.enabled and self.model is not None
    def generate_alt_text(
        self,
        image_data: bytes,
        shape_name: str = "",
        slide_number: int = 0,
        max_length: int = 250
    ) -> Optional[str]:
        """
        Generate alt text for an image using the local model.

        Args:
            image_data: Raw image bytes
            shape_name: Shape name (for context)
            slide_number: Slide number (for context)
            max_length: Maximum alt text length

        Returns:
            Generated alt text, the string "decorative" for images that look
            decorative, or None on failure
        """
        if not self.is_enabled():
            return None
        try:
            # Convert bytes to a PIL image
            image = Image.open(io.BytesIO(image_data)).convert("RGB")
            # Heuristic: a very small image (likely a logo or icon) whose shape
            # name also hints at page furniture is treated as decorative
            if image.size[0] < 100 and image.size[1] < 100:
                if any(hint in shape_name.lower() for hint in ["logo", "icon", "background", "border"]):
                    return "decorative"
            # Generate a description
            if self.model_type == "blip":
                alt_text = self._generate_blip(image)
            elif self.model_type == "git":
                alt_text = self._generate_git(image)
            else:
                return None
            # Clean up the text
            return self._clean_alt_text(alt_text, max_length)
        except Exception as e:
            print(f"Error generating alt text: {e}")
            return None
    def _generate_blip(self, image: Image.Image) -> str:
        """Generate a caption using the BLIP model"""
        # Process the image
        inputs = self.processor(image, return_tensors="pt").to(self.device)
        # Generate a caption
        with torch.no_grad():
            out = self.model.generate(
                **inputs,
                max_length=50,
                num_beams=5,  # beam search gives better quality than greedy decoding
                early_stopping=True
            )
        return self.processor.decode(out[0], skip_special_tokens=True)

    def _generate_git(self, image: Image.Image) -> str:
        """Generate a caption using the GIT model"""
        # Process the image
        inputs = self.processor(images=image, return_tensors="pt").to(self.device)
        # Generate a caption
        with torch.no_grad():
            generated_ids = self.model.generate(
                pixel_values=inputs.pixel_values,
                max_length=50
            )
        return self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    def _clean_alt_text(self, alt_text: str, max_length: int) -> str:
        """Clean and format generated alt text"""
        # Remove common prefixes that BLIP adds
        prefixes_to_remove = [
            "a picture of ",
            "an image of ",
            "a photo of ",
            "there is ",
            "arafed ",  # common BLIP artifact
        ]
        alt_text_lower = alt_text.lower()
        for prefix in prefixes_to_remove:
            if alt_text_lower.startswith(prefix):
                alt_text = alt_text[len(prefix):]
                break
        # Capitalize the first letter
        if alt_text:
            alt_text = alt_text[0].upper() + alt_text[1:]
        # Truncate if needed
        if len(alt_text) > max_length:
            alt_text = alt_text[:max_length - 3] + "..."
        return alt_text.strip()
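
    # Illustrative examples of the cleanup above (values traced by hand, not
    # from a real model run):
    #   _clean_alt_text("a photo of a dog on a beach", 250) -> "A dog on a beach"
    #   _clean_alt_text("arafed man holding a sign", 250)   -> "Man holding a sign"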

class HuggingFaceInferenceAPI:
    """
    Hugging Face Inference API client (FREE tier available).
    Used as a fallback when local models don't work.
    """

    def __init__(self, api_token: Optional[str] = None):
        """
        Initialize the Hugging Face Inference API client.

        Args:
            api_token: HF token (if None, reads from the HF_TOKEN env var).
                Get a free token at: https://huggingface.co/settings/tokens
        """
        self.api_token = api_token or os.getenv("HF_TOKEN")
        self.enabled = False
        if not self.api_token:
            print("⚠️ No Hugging Face token found. Set the HF_TOKEN environment variable.")
            print("   Get a free token at: https://huggingface.co/settings/tokens")
            return
        try:
            import requests
            self.requests = requests
            self.enabled = True
            self.api_url = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
            print("✅ Hugging Face Inference API initialized")
        except ImportError:
            print("❌ 'requests' library not available. Run: pip install requests")
    def is_enabled(self) -> bool:
        """Check if the API client is ready"""
        return self.enabled and self.api_token is not None
    def generate_alt_text(
        self,
        image_data: bytes,
        shape_name: str = "",
        slide_number: int = 0,
        max_length: int = 250
    ) -> Optional[str]:
        """
        Generate alt text using the Hugging Face Inference API.

        Args:
            image_data: Raw image bytes
            shape_name: Shape name (unused here; kept for interface parity)
            slide_number: Slide number (unused here; kept for interface parity)
            max_length: Maximum alt text length

        Returns:
            Generated alt text or None
        """
        if not self.is_enabled():
            return None
        try:
            headers = {"Authorization": f"Bearer {self.api_token}"}
            response = self.requests.post(
                self.api_url,
                headers=headers,
                data=image_data,
                timeout=30
            )
            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and len(result) > 0:
                    caption = result[0].get("generated_text", "")
                    return self._clean_alt_text(caption, max_length)
            else:
                print(f"HF API error: {response.status_code}")
            return None
        except Exception as e:
            print(f"HF API request failed: {e}")
            return None
    def _clean_alt_text(self, alt_text: str, max_length: int) -> str:
        """Clean generated text (same approach as LocalVisionModel._clean_alt_text)"""
        # Remove common prefixes
        prefixes = ["a picture of ", "an image of ", "a photo of "]
        alt_text_lower = alt_text.lower()
        for prefix in prefixes:
            if alt_text_lower.startswith(prefix):
                alt_text = alt_text[len(prefix):]
                break
        # Capitalize the first letter
        if alt_text:
            alt_text = alt_text[0].upper() + alt_text[1:]
        # Truncate if needed
        if len(alt_text) > max_length:
            alt_text = alt_text[:max_length - 3] + "..."
        return alt_text.strip()

# Singleton instances
_local_model: Optional[LocalVisionModel] = None
_hf_api: Optional[HuggingFaceInferenceAPI] = None


def get_vision_model() -> Optional[LocalVisionModel]:
    """Get or create the local vision model singleton"""
    global _local_model
    if _local_model is None:
        model_name = os.getenv("LOCAL_VISION_MODEL", "blip-base")
        _local_model = LocalVisionModel(model_name)
    return _local_model


def get_hf_api() -> Optional[HuggingFaceInferenceAPI]:
    """Get or create the Hugging Face API singleton"""
    global _hf_api
    if _hf_api is None:
        _hf_api = HuggingFaceInferenceAPI()
    return _hf_api
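
# Example: pick the local model with an environment variable before first use,
# e.g. LOCAL_VISION_MODEL=blip-large (one of blip-base, blip-large, git-base).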

def generate_alt_text_free(
    image_data: bytes,
    shape_name: str = "",
    slide_number: int = 0,
    max_length: int = 250
) -> Optional[str]:
    """
    Generate alt text using FREE methods (tries local first, then the HF API).

    Priority:
    1. Local AI model (completely free, unlimited)
    2. Hugging Face Inference API (free tier)
    3. None (the caller falls back to a placeholder)

    Args:
        image_data: Raw image bytes
        shape_name: Shape name
        slide_number: Slide number
        max_length: Maximum alt text length

    Returns:
        Generated alt text or None
    """
    # Try the local model first (best option - free and unlimited)
    local_model = get_vision_model()
    if local_model and local_model.is_enabled():
        result = local_model.generate_alt_text(image_data, shape_name, slide_number, max_length)
        if result:
            return result
    # Fall back to the Hugging Face API (free tier)
    hf_api = get_hf_api()
    if hf_api and hf_api.is_enabled():
        result = hf_api.generate_alt_text(image_data, shape_name, slide_number, max_length)
        if result:
            return result
    # Both failed: return None (the caller will use a placeholder)
    return None
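

# Minimal usage sketch (the image path below is hypothetical; point it at a
# real file to try the module end to end):
if __name__ == "__main__":
    from pathlib import Path

    sample = Path("sample.png")  # hypothetical test image
    if sample.exists():
        alt = generate_alt_text_free(sample.read_bytes(), shape_name="Picture 1", slide_number=1)
        print(alt or "No alt text generated (local model and HF API both unavailable)")
    else:
        print("Place a test image at sample.png to try this module")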